In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
chunk = pd.read_csv('v2_dtypes_LT_2018.csv', delimiter = ',',chunksize=1000000)

leave = pd.concat(chunk)

In [3]:
leave.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,01-JAN-18 00:00:00,5972116,12,119,48030,48030,48030,48030,2693211
1,01-JAN-18 00:00:00,5966674,12,119,54001,54001,54001,54001,2693267
2,01-JAN-18 00:00:00,5959105,12,119,60001,60001,60001,60001,2693263
3,01-JAN-18 00:00:00,5966888,12,119,58801,58801,58801,58801,2693284
4,01-JAN-18 00:00:00,5965960,12,119,56401,56401,56401,56401,2693209


In [4]:
leave.shape

(116360526, 9)

In [5]:
leave.dtypes

DAYOFSERVICE       object
TRIPID              int64
PROGRNUMBER         int64
STOPPOINTID         int64
PLANNEDTIME_ARR     int64
PLANNEDTIME_DEP     int64
ACTUALTIME_ARR      int64
ACTUALTIME_DEP      int64
VEHICLEID           int64
dtype: object

<h3>Changing Data Types to make size of leave dataframe smaller</h3>

In [6]:
leave['TRIPID'] = leave['TRIPID'].astype('int32')

In [7]:
leave['PROGRNUMBER'] = leave['PROGRNUMBER'].astype('int16')

In [8]:
leave['STOPPOINTID'] = leave['STOPPOINTID'].astype('int16')

In [9]:
leave['PLANNEDTIME_ARR'] = leave['PLANNEDTIME_ARR'].astype('int32')

In [10]:
leave['PLANNEDTIME_DEP'] = leave['PLANNEDTIME_DEP'].astype('int32')

In [11]:
leave['ACTUALTIME_ARR'] = leave['PLANNEDTIME_DEP'].astype('int32')

In [12]:
leave['ACTUALTIME_DEP'] = leave['PLANNEDTIME_DEP'].astype('int32')

In [13]:
leave['VEHICLEID'] = leave['VEHICLEID'].astype('int32')

In [14]:
leave.dtypes

DAYOFSERVICE       object
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
dtype: object

In [15]:
leave.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,01-JAN-18 00:00:00,5972116,12,119,48030,48030,48030,48030,2693211
1,01-JAN-18 00:00:00,5966674,12,119,54001,54001,54001,54001,2693267
2,01-JAN-18 00:00:00,5959105,12,119,60001,60001,60001,60001,2693263
3,01-JAN-18 00:00:00,5966888,12,119,58801,58801,58801,58801,2693284
4,01-JAN-18 00:00:00,5965960,12,119,56401,56401,56401,56401,2693209


## Sort the dataframe by Trip ID

In [16]:
leave_sort = leave.sort_values('TRIPID')

In [17]:
leave_sort.shape

(116360526, 9)

In [18]:
leave_sort.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
1580830,06-JAN-18 00:00:00,5955221,69,2861,42184,42184,42184,42184,2693254
1578400,06-JAN-18 00:00:00,5955221,65,2857,42001,42001,42001,42001,2693254
9777438,06-JAN-18 00:00:00,5955221,74,2865,42457,42457,42457,42457,2693254
1572261,06-JAN-18 00:00:00,5955221,52,1045,41336,41336,41336,41336,2693254
1578368,06-JAN-18 00:00:00,5955221,64,2856,41956,41956,41956,41956,2693254


## Splitting the Sorted Dataframe into Five Dataframes

In [19]:
leave1, leave2, leave3, leave4, leave5, leave6, leave7, leave8 = np.array_split(leave_sort, 8)

MemoryError: Unable to allocate 2.60 GiB for an array with shape (6, 116360526) and data type int32

In [None]:
leave1.shape

In [None]:
leave1.tail()

In [None]:
leave2.shape

In [None]:
leave2.head()

In [None]:
leave2.tail()

In [None]:
leave3.shape

In [None]:
leave3.head()

In [None]:
leave3.tail()

In [None]:
leave4.shape

In [None]:
leave4.head()

In [None]:
leave4.tail()

In [None]:
leave5.shape

In [None]:
leave5.head()

In [None]:
leave5.tail()

In [None]:
leave6.shape

In [None]:
leave6.head()

In [None]:
leave6.tail()

In [None]:
leave7.shape

In [None]:
leave7.head()

In [None]:
leave7.tail()

In [None]:
leave8.shape

In [None]:
leave8.head()

## Moving the Split Trip ID into the other Dataframes

<h3>1 to 2:</h3>

In [None]:
leave1.TRIPID.value_counts()[6392987]

In [None]:
leave2.TRIPID.value_counts()[6392987]

In [None]:
# Grabbing trip 6392987 from leave1
tripMove = leave1 [leave1['TRIPID'] == 6392987]

tripMove.shape

In [None]:
leave2

In [None]:
# Append trip dataframe to leave2
leave2 = leave2.append(tripMove)

In [None]:
leave2.shape

In [None]:
# Drop rows with tripMove from leave1
leave1.drop(leave1[leave1['TRIPID'] == 6392987].index, inplace = True)

In [None]:
leave1.shape

In [None]:
leave1.tail()

<h3>2 to 3:</h3>

In [None]:
leave2.TRIPID.value_counts()[6777634]

In [None]:
leave3.TRIPID.value_counts()[6777634]

In [None]:
# Grabbing trip 6777634 from leave3
tripMove = leave2 [leave2['TRIPID'] == 6777634]

tripMove.shape

In [None]:
leave3

In [None]:
# Append trip dataframe to leave3
leave3 = leave3.append(tripMove)

In [None]:
leave3.shape

In [None]:
# Drop rows with tripMove from leave2
leave2.drop(leave2[leave2['TRIPID'] == 6777634].index, inplace = True)

In [None]:
leave2.shape

In [None]:
leave2.tail()

<h3>3 to 4:</h3>

In [None]:
leave3.TRIPID.value_counts()[7318670]

In [None]:
leave4.TRIPID.value_counts()[7318670]

In [None]:
# Grabbing trip 7318670 from leave3
tripMove = leave3 [leave3['TRIPID'] == 7318670]

tripMove.shape

In [None]:
leave4

In [None]:
# Append trip dataframe to leave4
leave4 = leave4.append(tripMove)

In [None]:
leave4.shape

In [None]:
# Drop rows with tripMove from leave3
leave3.drop(leave3[leave3['TRIPID'] == 7318670].index, inplace = True)

In [None]:
leave3.shape

In [None]:
leave3.tail()

<h3>4 to 5:</h3>

In [None]:
leave4.TRIPID.value_counts()[8066551]

In [None]:
leave5.TRIPID.value_counts()[8066551]

In [None]:
# Grabbing trip 8066551 from leave4
tripMove = leave4 [leave4['TRIPID'] == 8066551]

tripMove.shape

In [None]:
leave5

In [None]:
# Append trip dataframe to leave5
leave5 = leave5.append(tripMove)

In [None]:
leave5.shape

In [None]:
# Drop rows with tripMove from leave4
leave4.drop(leave4[leave4['TRIPID'] == 8066551].index, inplace = True)

In [None]:
leave4.shape

In [None]:
leave4.tail()

<h3>5 to 6:</h3>

In [None]:
leave5.TRIPID.value_counts()[8066551]

In [None]:
leave6.TRIPID.value_counts()[8066551]

In [None]:
# Grabbing trip 8066551 from leave5
tripMove = leave5 [leave5['TRIPID'] == 8066551]

tripMove.shape

In [None]:
leave6

In [None]:
# Append trip dataframe to leave6
leave6 = leave6.append(tripMove)

In [None]:
leave6.shape

In [None]:
# Drop rows with tripMove from leave5
leave5.drop(leave5[leave5['TRIPID'] == 8066551].index, inplace = True)

In [None]:
leave5.shape

In [None]:
leave5.tail()

<h3>6 to 7:</h3>

In [None]:
leave6.TRIPID.value_counts()[8066551]

In [None]:
leave7.TRIPID.value_counts()[8066551]

In [None]:
# Grabbing trip 8066551 from leave6
tripMove = leave6 [leave6['TRIPID'] == 8066551]

tripMove.shape

In [None]:
leave7

In [None]:
# Append trip dataframe to leave7
leave7 = leave7.append(tripMove)

In [None]:
leave7.shape

In [None]:
# Drop rows with tripMove from leave6
leave6.drop(leave6[leave6['TRIPID'] == 8066551].index, inplace = True)

In [None]:
leave6.shape

In [None]:
leave6.tail()

<h3>7 to 8:</h3>

In [None]:
leave7.TRIPID.value_counts()[8066551]

In [None]:
leave8.TRIPID.value_counts()[8066551]

In [None]:
# Grabbing trip 8066551 from leave7
tripMove = leave7 [leave7['TRIPID'] == 8066551]

tripMove.shape

In [None]:
leave8

In [None]:
# Append trip dataframe to leave8
leave8 = leave8.append(tripMove)

In [None]:
leave8.shape

In [None]:
# Drop rows with tripMove from leave7
leave7.drop(leave7[leave7['TRIPID'] == 8066551].index, inplace = True)

In [None]:
leave7.shape

In [None]:
leave7.tail()

## Send Parts to CSVs

In [None]:
# Send leave1 to csv:
leave1.to_csv('v3_leave1_LT_2018.csv', index=False)

## Continued in 'v3a_leave1_data_prep.ipynb'

In [None]:
# Send leave2 to csv:
leave2.to_csv('v3_leave2_LT_2018.csv', index=False)

## Continued in 'v3a_leave2_data_prep.ipynb'

In [None]:
# Send leave3 to csv:
leave3.to_csv('v3_leave3_LT_2018.csv', index=False)

## Continued in 'v3a_leave3_data_prep.ipynb'

In [None]:
# Send leave4 to csv:
leave4.to_csv('v3_leave4_LT_2018.csv', index=False)

## Continued in 'v3a_leave4_data_prep.ipynb'

In [None]:
# Send leave5 to csv:
leave5.to_csv('v3_leave5_LT_2018.csv', index=False)

## Continued in 'v3a_leave5_data_prep.ipynb'