In [2]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Allows plots to appear directly in the notebook.
%matplotlib inline

from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [3]:
# Read CSV files into Data Frames:
weather = pd.read_csv('cleaned_2018_OpenWeather.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)
trips = pd.read_csv('trips_2018.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [4]:
weather.shape

(26280, 29)

In [5]:
weather.dtypes

dt                       int64
dt_iso                  object
timezonetext            object
timezone                 int64
city_name               object
lat                    float64
lon                    float64
temp                   float64
visibility             float64
dew_point              float64
feels_like             float64
temp_min               float64
temp_max               float64
pressure                 int64
sea_level              float64
grnd_level             float64
humidity                 int64
wind_speed             float64
wind_deg                 int64
wind_gust              float64
rain_1h                float64
rain_3h                float64
snow_1h                float64
snow_3h                float64
clouds_all               int64
weather_id               int64
weather_main            object
weather_description     object
weather_icon            object
dtype: object

In [6]:
trips.shape

(1048575, 16)

In [7]:
trips.dtypes

DATASOURCE          object
DAYOFSERVICE        object
TRIPID               int64
LINEID              object
ROUTEID             object
DIRECTION            int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR     float64
ACTUALTIME_DEP     float64
BASIN               object
TENDERLOT          float64
SUPPRESSED         float64
JUSTIFICATIONID    float64
LASTUPDATE          object
NOTE                object
dtype: object

In [8]:
weather.head()

Unnamed: 0,dt,dt_iso,timezonetext,timezone,city_name,lat,lon,temp,visibility,dew_point,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1483228800,1/1/2017 0:00,0000 UTC,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,...,,2.3,,,,75,501,Rain,moderate rain,10n
1,1483232400,1/1/2017 1:00,0000 UTC,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,...,,1.51,,,,75,501,Rain,moderate rain,10n
2,1483236000,1/1/2017 2:00,0000 UTC,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,...,,0.64,,,,75,500,Rain,light rain,10n
3,1483239600,1/1/2017 3:00,0000 UTC,0,Custom location,53.345035,-6.267261,4.39,9999.0,3.36,...,,0.17,,,,75,500,Rain,light rain,10n
4,1483243200,1/1/2017 4:00,0000 UTC,0,Custom location,53.345035,-6.267261,4.39,9999.0,2.42,...,,,,,,75,803,Clouds,broken clouds,04n


In [9]:
weather_new = weather.rename(columns={'dt_iso': 'DAYOFSERVICE'})

In [10]:
weather_new.head()

Unnamed: 0,dt,DAYOFSERVICE,timezonetext,timezone,city_name,lat,lon,temp,visibility,dew_point,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1483228800,1/1/2017 0:00,0000 UTC,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,...,,2.3,,,,75,501,Rain,moderate rain,10n
1,1483232400,1/1/2017 1:00,0000 UTC,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,...,,1.51,,,,75,501,Rain,moderate rain,10n
2,1483236000,1/1/2017 2:00,0000 UTC,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,...,,0.64,,,,75,500,Rain,light rain,10n
3,1483239600,1/1/2017 3:00,0000 UTC,0,Custom location,53.345035,-6.267261,4.39,9999.0,3.36,...,,0.17,,,,75,500,Rain,light rain,10n
4,1483243200,1/1/2017 4:00,0000 UTC,0,Custom location,53.345035,-6.267261,4.39,9999.0,2.42,...,,,,,,75,803,Clouds,broken clouds,04n


In [11]:
trips.head()

Unnamed: 0,DATASOURCE,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,BASIN,TENDERLOT,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE,NOTE
0,DB,2/7/2018 0:00,6253783,68,68_80,1,87245,84600,87524.0,84600.0,BasDef,,,,2/28/2018 12:05,",2967409,"
1,DB,2/7/2018 0:00,6262138,25B,25B_271,2,30517,26460,32752.0,,BasDef,,,,2/28/2018 12:05,",2580260,"
2,DB,2/7/2018 0:00,6254942,45A,45A_70,2,35512,32100,36329.0,32082.0,BasDef,,,,2/28/2018 12:05,",2448968,"
3,DB,2/7/2018 0:00,6259460,25A,25A_273,1,57261,54420,58463.0,54443.0,BasDef,,,,2/28/2018 12:05,",3094242,"
4,DB,2/7/2018 0:00,6253175,14,14_15,1,85383,81600,84682.0,81608.0,BasDef,,,,2/28/2018 12:05,",2526331,"


In [16]:
weather['dt_iso'] = weather['dt_iso'].astype('datetime64')
trips['DAYOFSERVICE'] = trips['DAYOFSERVICE'].astype('datetime64')
trips['ACTUALTIME_ARR'] = trips['ACTUALTIME_ARR'].astype('float')
trips['PLANNEDTIME_DEP'] = trips['PLANNEDTIME_DEP'].astype('float')

In [23]:
trips['planDep_time'] = (trips['PLANNEDTIME_DEP'] / 3600).round()

In [24]:
trips.head()

Unnamed: 0,DATASOURCE,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,BASIN,TENDERLOT,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE,NOTE,planDep_time
0,DB,2018-02-07,6253783,68,68_80,1,87245,84600.0,87524.0,84600.0,BasDef,,,,2/28/2018 12:05,",2967409,",24.0
1,DB,2018-02-07,6262138,25B,25B_271,2,30517,26460.0,32752.0,,BasDef,,,,2/28/2018 12:05,",2580260,",7.0
2,DB,2018-02-07,6254942,45A,45A_70,2,35512,32100.0,36329.0,32082.0,BasDef,,,,2/28/2018 12:05,",2448968,",9.0
3,DB,2018-02-07,6259460,25A,25A_273,1,57261,54420.0,58463.0,54443.0,BasDef,,,,2/28/2018 12:05,",3094242,",15.0
4,DB,2018-02-07,6253175,14,14_15,1,85383,81600.0,84682.0,81608.0,BasDef,,,,2/28/2018 12:05,",2526331,",23.0


In [25]:
trips['planDep_time'] = trips['planDep_time'].astype('int')

In [27]:
trips.head()

Unnamed: 0,DATASOURCE,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,BASIN,TENDERLOT,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE,NOTE,planDep_time
0,DB,2018-02-07,6253783,68,68_80,1,87245,84600.0,87524.0,84600.0,BasDef,,,,2/28/2018 12:05,",2967409,",24
1,DB,2018-02-07,6262138,25B,25B_271,2,30517,26460.0,32752.0,,BasDef,,,,2/28/2018 12:05,",2580260,",7
2,DB,2018-02-07,6254942,45A,45A_70,2,35512,32100.0,36329.0,32082.0,BasDef,,,,2/28/2018 12:05,",2448968,",9
3,DB,2018-02-07,6259460,25A,25A_273,1,57261,54420.0,58463.0,54443.0,BasDef,,,,2/28/2018 12:05,",3094242,",15
4,DB,2018-02-07,6253175,14,14_15,1,85383,81600.0,84682.0,81608.0,BasDef,,,,2/28/2018 12:05,",2526331,",23


In [11]:
tripWeather = pd.merge(trips, weather, how="left", on=["DAYOFSERVICE"])

KeyError: 'DAYOFSERVICE'

In [None]:
trips39A = trips[trips['LINEID'] == '39A']

In [None]:
trips39A.head()

In [None]:
trip39AWeather = pd.merge(trips39A, weather, how="left", on=["DAYOFSERVICE"])