In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
chunk = pd.read_csv('v0b_dtypes_LT_2018.csv', delimiter = ',',chunksize=1000000)

leave = pd.concat(chunk)

In [3]:
leave.head()

Unnamed: 0,DATASOURCE,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE
0,DB,01-JAN-18 00:00:00,5972116,12,119,48030,48030,48012,48012,2693211,,,08-JAN-18 17:21:10
1,DB,01-JAN-18 00:00:00,5966674,12,119,54001,54001,54023,54023,2693267,,,08-JAN-18 17:21:10
2,DB,01-JAN-18 00:00:00,5959105,12,119,60001,60001,59955,59955,2693263,,,08-JAN-18 17:21:10
3,DB,01-JAN-18 00:00:00,5966888,12,119,58801,58801,58771,58771,2693284,,,08-JAN-18 17:21:10
4,DB,01-JAN-18 00:00:00,5965960,12,119,56401,56401,56309,56323,2693209,,,08-JAN-18 17:21:10


In [4]:
leave.shape

(116799395, 13)

In [5]:
leave.dtypes

DATASOURCE          object
DAYOFSERVICE        object
TRIPID               int64
PROGRNUMBER          int64
STOPPOINTID          int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
VEHICLEID            int64
SUPPRESSED         float64
JUSTIFICATIONID    float64
LASTUPDATE          object
dtype: object

<h3>Changing Data Types to make size of leave dataframe smaller</h3>

In [6]:
leave['TRIPID'] = leave['TRIPID'].astype('int32')

In [7]:
leave['PROGRNUMBER'] = leave['PROGRNUMBER'].astype('int32')

In [8]:
leave['STOPPOINTID'] = leave['STOPPOINTID'].astype('int32')

In [9]:
leave['PLANNEDTIME_ARR'] = leave['PLANNEDTIME_ARR'].astype('int32')

In [10]:
leave['PLANNEDTIME_DEP'] = leave['PLANNEDTIME_DEP'].astype('int32')

In [11]:
leave['VEHICLEID'] = leave['VEHICLEID'].astype('int32')

In [13]:
leave['SUPPRESSED'] = leave['SUPPRESSED'].astype('float32')

In [14]:
leave['JUSTIFICATIONID'] = leave['JUSTIFICATIONID'].astype('float32')

In [15]:
leave.dtypes

DATASOURCE          object
DAYOFSERVICE        object
TRIPID               int32
PROGRNUMBER          int32
STOPPOINTID          int32
PLANNEDTIME_ARR      int32
PLANNEDTIME_DEP      int32
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
VEHICLEID            int32
SUPPRESSED         float32
JUSTIFICATIONID    float32
LASTUPDATE          object
dtype: object

- Test 3: Check that no rows have justification IDs: <br>
&nbsp;- 438869 failed (.38% with faults), need to drop these rows. <br>

In [16]:
test_3 = leave.JUSTIFICATIONID [leave.JUSTIFICATIONID.notnull()]
print('Number of rows failing test: ', test_3.shape[0])

Number of rows failing test:  438869


In [17]:
# Drop rows that have justification codes
leave.drop(leave[leave.JUSTIFICATIONID.notnull()].index, inplace = True)

In [18]:
leave.shape

(116360526, 13)

In [19]:
# Send df to csv:
leave.to_csv('v1_dtypes_LT_2018.csv', index=False)

- Test 4: Check that Datasource columns is not constant: <br>
&nbsp;- All rows failed. <br>

In [20]:
leave.DATASOURCE.value_counts()

DB    116360526
Name: DATASOURCE, dtype: int64

In [21]:
# Drop DATASOURCE column:

leave.drop('DATASOURCE', axis=1, inplace=True)

- Test 5: Check that SUPPRESSED column is not constant: <br>
&nbsp;- All rows failed (zero of NaN). <br>

In [23]:
leave.SUPPRESSED.value_counts()

0.0    73
Name: SUPPRESSED, dtype: int64

In [24]:
# Drop SUPPRESSED column:

leave.drop('SUPPRESSED', axis=1, inplace=True)

- Test 6: Check that JUSTIFICATIONID column is not constant: <br>
&nbsp;- All rows failed (NaN). <br>

In [26]:
leave.JUSTIFICATIONID.value_counts()

Series([], Name: JUSTIFICATIONID, dtype: int64)

In [27]:
# Drop JUSTIFICATIONID column:

leave.drop('JUSTIFICATIONID', axis=1, inplace=True)

In [29]:
leave.shape

(116360526, 10)

In [30]:
# Send df to csv:
leave.to_csv('v1a_dtypes_LT_2018.csv', index=False)

- Test 7: Check for PLANNEDTIME_ARR values are not negative: <br>
&nbsp;- All rows passed. <br>

In [31]:
test_7 = leave.PLANNEDTIME_ARR [leave.PLANNEDTIME_ARR < 0]
print('Number of rows failing test: ', test_7.shape[0])

Number of rows failing test:  0


- Test 8: Check for PLANNEDTIME_DEP values are not negative: <br>
&nbsp;- All rows passed. <br>

In [32]:
test_8 = leave.PLANNEDTIME_DEP [leave.PLANNEDTIME_DEP < 0]
print('Number of rows failing test: ', test_8.shape[0])

Number of rows failing test:  0


- Test 9: Check for ACTUALTIME_ARR values are not negative: <br>
&nbsp;- All rows passed. <br>

In [33]:
test_9 = leave.ACTUALTIME_ARR [leave.ACTUALTIME_ARR < 0]
print('Number of rows failing test: ', test_9.shape[0])

Number of rows failing test:  0


- Test 10: Check for ACTUALTIME_DEP values are not negative: <br>
&nbsp;- All rows passed. <br>

In [34]:
test_10 = leave.ACTUALTIME_DEP [leave.ACTUALTIME_DEP < 0]
print('Number of rows failing test: ', test_10.shape[0])

Number of rows failing test:  0


- Test 11: Check for ACTUALTIME_ARR values are not missing: <br>
&nbsp;- All rows passed. <br>

In [35]:
test_11 = leave.ACTUALTIME_ARR [leave.ACTUALTIME_ARR.isnull()]
print('Number of rows failing test: ', test_11.shape[0])

Number of rows failing test:  0


- Test 12: Check for ACTUALTIME_DEP values are not missing: <br>
&nbsp;- All rows passed. <br>

In [36]:
test_12 = leave.ACTUALTIME_DEP [leave.ACTUALTIME_DEP.isnull()]
print('Number of rows failing test: ', test_12.shape[0])

Number of rows failing test:  0


## Continued on 'v2_leavetime_data_prep.ipynb'