In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
chunk = pd.read_csv('v1a_dtypes_LT_2018.csv', delimiter = ',',chunksize=1000000)

leave = pd.concat(chunk)

In [3]:
leave.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,LASTUPDATE
0,01-JAN-18 00:00:00,5972116,12,119,48030,48030,48012,48012,2693211,08-JAN-18 17:21:10
1,01-JAN-18 00:00:00,5966674,12,119,54001,54001,54023,54023,2693267,08-JAN-18 17:21:10
2,01-JAN-18 00:00:00,5959105,12,119,60001,60001,59955,59955,2693263,08-JAN-18 17:21:10
3,01-JAN-18 00:00:00,5966888,12,119,58801,58801,58771,58771,2693284,08-JAN-18 17:21:10
4,01-JAN-18 00:00:00,5965960,12,119,56401,56401,56309,56323,2693209,08-JAN-18 17:21:10


In [4]:
leave.shape

(116360526, 10)

In [5]:
leave.dtypes

DAYOFSERVICE       object
TRIPID              int64
PROGRNUMBER         int64
STOPPOINTID         int64
PLANNEDTIME_ARR     int64
PLANNEDTIME_DEP     int64
ACTUALTIME_ARR      int64
ACTUALTIME_DEP      int64
VEHICLEID           int64
LASTUPDATE         object
dtype: object

<h3>Changing Data Types to make size of leave dataframe smaller</h3>

In [6]:
leave['TRIPID'] = leave['TRIPID'].astype('int32')

In [7]:
leave['PROGRNUMBER'] = leave['PROGRNUMBER'].astype('int16')

In [8]:
leave['STOPPOINTID'] = leave['STOPPOINTID'].astype('int16')

In [9]:
leave['PLANNEDTIME_ARR'] = leave['PLANNEDTIME_ARR'].astype('int32')

In [10]:
leave['PLANNEDTIME_DEP'] = leave['PLANNEDTIME_DEP'].astype('int32')

In [11]:
leave['ACTUALTIME_ARR'] = leave['PLANNEDTIME_DEP'].astype('int32')

In [12]:
leave['ACTUALTIME_DEP'] = leave['ACTUALTIME_DEP'].astype('int32')

In [13]:
leave['VEHICLEID'] = leave['VEHICLEID'].astype('int32')

In [14]:
leave.dtypes

DAYOFSERVICE       object
TRIPID              int32
PROGRNUMBER         int16
STOPPOINTID         int16
PLANNEDTIME_ARR     int32
PLANNEDTIME_DEP     int32
ACTUALTIME_ARR      int32
ACTUALTIME_DEP      int32
VEHICLEID           int32
LASTUPDATE         object
dtype: object

In [15]:
lstupdate = leave.LASTUPDATE.value_counts()

In [16]:
lstupdate.shape

(360,)

In [17]:
# Drop LASTUPDATE column, not usefull
leave.drop('LASTUPDATE', axis=1, inplace=True)

In [18]:
leave.head()

Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID
0,01-JAN-18 00:00:00,5972116,12,119,48030,48030,48030,48030,2693211
1,01-JAN-18 00:00:00,5966674,12,119,54001,54001,54001,54001,2693267
2,01-JAN-18 00:00:00,5959105,12,119,60001,60001,60001,60001,2693263
3,01-JAN-18 00:00:00,5966888,12,119,58801,58801,58801,58801,2693284
4,01-JAN-18 00:00:00,5965960,12,119,56401,56401,56401,56401,2693209


In [None]:
# Send df to csv:
leave.to_csv('v2_dtypes_LT_2018.csv', index=False)

In [19]:
dayWork = pd.DataFrame()

In [20]:
dayWork['DAYOFSERVICE'] = leave['DAYOFSERVICE']

In [21]:
dayWork.head()

Unnamed: 0,DAYOFSERVICE
0,01-JAN-18 00:00:00
1,01-JAN-18 00:00:00
2,01-JAN-18 00:00:00
3,01-JAN-18 00:00:00
4,01-JAN-18 00:00:00


In [22]:
dayWork.shape

(116360526, 1)

In [23]:
dayWork['DAYOFSERVICE'] = dayWork['DAYOFSERVICE'].astype('datetime64')

In [24]:
dayWork.head()

Unnamed: 0,DAYOFSERVICE
0,2018-01-01
1,2018-01-01
2,2018-01-01
3,2018-01-01
4,2018-01-01


In [25]:
dayWork['DAYOFSERVICE'] = dayWork['DAYOFSERVICE'].astype('string')

In [26]:
dayWork.head()

Unnamed: 0,DAYOFSERVICE
0,2018-01-01
1,2018-01-01
2,2018-01-01
3,2018-01-01
4,2018-01-01


<h2>Add Day, Month, Year Columns</h2>

In [None]:
# Split the DAYOFSERVICE Column to get Month, Day, & Year (year needs one more split)
dayWork[['xyear', 'month', 'day']] = dayWork.DAYOFSERVICE.str.split('-', expand=True)