In [1]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

# Allows plots to appear directly in the notebook.
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
chunk = pd.read_table('rt_leavetimes_DB_2018.txt', delimiter = ';',chunksize=1000000)

leave = pd.concat(chunk)

In [3]:
leave.head()

Unnamed: 0,DATASOURCE,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,PASSENGERS,PASSENGERSIN,PASSENGERSOUT,DISTANCE,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE,NOTE
0,DB,01-JAN-18 00:00:00,5972116,12,119,48030,48030,48012,48012,2693211,,,,,,,08-JAN-18 17:21:10,
1,DB,01-JAN-18 00:00:00,5966674,12,119,54001,54001,54023,54023,2693267,,,,,,,08-JAN-18 17:21:10,
2,DB,01-JAN-18 00:00:00,5959105,12,119,60001,60001,59955,59955,2693263,,,,,,,08-JAN-18 17:21:10,
3,DB,01-JAN-18 00:00:00,5966888,12,119,58801,58801,58771,58771,2693284,,,,,,,08-JAN-18 17:21:10,
4,DB,01-JAN-18 00:00:00,5965960,12,119,56401,56401,56309,56323,2693209,,,,,,,08-JAN-18 17:21:10,


In [4]:
leave.shape

(116949113, 18)

In [5]:
leave.dtypes

DATASOURCE          object
DAYOFSERVICE        object
TRIPID               int64
PROGRNUMBER          int64
STOPPOINTID          int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
VEHICLEID            int64
PASSENGERS         float64
PASSENGERSIN       float64
PASSENGERSOUT      float64
DISTANCE           float64
SUPPRESSED         float64
JUSTIFICATIONID    float64
LASTUPDATE          object
NOTE               float64
dtype: object

<h3>Changing Data Types to make size of leave smaller</h3>

In [6]:
leave['TRIPID'] = leave['TRIPID'].astype('int32')

In [7]:
leave['PROGRNUMBER'] = leave['PROGRNUMBER'].astype('int32')

In [8]:
leave['STOPPOINTID'] = leave['STOPPOINTID'].astype('int32')

In [9]:
leave['PLANNEDTIME_ARR'] = leave['PLANNEDTIME_ARR'].astype('int32')

In [10]:
leave['PLANNEDTIME_DEP'] = leave['PLANNEDTIME_DEP'].astype('int32')

In [11]:
leave['VEHICLEID'] = leave['VEHICLEID'].astype('int32')

In [12]:
leave['PASSENGERS'] = leave['PASSENGERS'].astype('float32')

In [13]:
leave['PASSENGERSIN'] = leave['PASSENGERSIN'].astype('float32')

In [14]:
leave['PASSENGERSOUT'] = leave['PASSENGERSOUT'].astype('float32')

In [15]:
leave['DISTANCE'] = leave['DISTANCE'].astype('float32')

In [16]:
leave['SUPPRESSED'] = leave['SUPPRESSED'].astype('float32')

In [17]:
leave['JUSTIFICATIONID'] = leave['JUSTIFICATIONID'].astype('float32')

In [18]:
leave['NOTE'] = leave['NOTE'].astype('float32')

In [19]:
# Send df to csv:
leave.to_csv('v0_dtypes_LT_2018.csv', index=False)

<h2>Logical Integrity</h2><br>
- Test 1: Check for Null Values by Column: <br>
&nbsp;- PASSENGERS: All values missing <br>
&nbsp;- PASSENGERSIN: All values missing <br>
&nbsp;- PASSENGERSOUT: All values missing <br>
&nbsp;- DISTANCE: All values missing <br>
&nbsp;- SUPPRESSED: 99.5% missing <br>
&nbsp;- JUSTIFICATIONID: 99.5% missing <br>
&nbsp;- NOTE: All values missing <br>

In [20]:
leave.isnull().sum()

DATASOURCE                 0
DAYOFSERVICE               0
TRIPID                     0
PROGRNUMBER                0
STOPPOINTID                0
PLANNEDTIME_ARR            0
PLANNEDTIME_DEP            0
ACTUALTIME_ARR             0
ACTUALTIME_DEP             0
VEHICLEID                  0
PASSENGERS         116949113
PASSENGERSIN       116949113
PASSENGERSOUT      116949113
DISTANCE           116949113
SUPPRESSED         116360453
JUSTIFICATIONID    116360526
LASTUPDATE                 0
NOTE               116949113
dtype: int64

In [21]:
# Drop columns: PASSENGERS, PASSENGERIN, PASSENGERSOUT, DISTANCE, NOTE

leave.drop('PASSENGERS', axis=1, inplace=True)

In [22]:
leave.drop('PASSENGERSIN', axis=1, inplace=True)

In [23]:
leave.drop('PASSENGERSOUT', axis=1, inplace=True)

In [24]:
leave.drop('DISTANCE', axis=1, inplace=True)

In [25]:
leave.drop('NOTE', axis=1, inplace=True)

In [26]:
leave.head()

Unnamed: 0,DATASOURCE,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE
0,DB,01-JAN-18 00:00:00,5972116,12,119,48030,48030,48012,48012,2693211,,,08-JAN-18 17:21:10
1,DB,01-JAN-18 00:00:00,5966674,12,119,54001,54001,54023,54023,2693267,,,08-JAN-18 17:21:10
2,DB,01-JAN-18 00:00:00,5959105,12,119,60001,60001,59955,59955,2693263,,,08-JAN-18 17:21:10
3,DB,01-JAN-18 00:00:00,5966888,12,119,58801,58801,58771,58771,2693284,,,08-JAN-18 17:21:10
4,DB,01-JAN-18 00:00:00,5965960,12,119,56401,56401,56309,56323,2693209,,,08-JAN-18 17:21:10


In [27]:
# Send df to csv:
leave.to_csv('v0a_dtypes_LT_2018.csv', index=False)

- Test 2: Check that no rows are suppressed: <br>
&nbsp;- 438942 failed (.38% suppressed), need to drop these rows. <br>

In [28]:
test_2 = leave.SUPPRESSED [leave.SUPPRESSED == 0]
print('Number of rows failing test: ', test_2.shape[0])

Number of rows failing test:  438942


In [29]:
# Drop rows that have been suppressed
leave.drop(leave[leave['SUPPRESSED'] == 1].index, inplace = True)

In [30]:
leave.shape

(116799395, 13)

In [31]:
# Send df to csv:
leave.to_csv('v0b_dtypes_LT_2018.csv', index=False)

## Continued on 'v1_leavetime_data_prep.ipynb'