In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz

import graphviz
from graphviz import Source

import warnings
warnings.filterwarnings('ignore')

import pymysql

from math import sqrt

ModuleNotFoundError: No module named 'graphviz'

In [None]:
localhost = "localhost"
user = 'MY_USER'
passwd = 'MY_PASSWORD'
db = 'dublin_bus_time'
port = 3306

conn = pymysql.connect(host=localhost, user=user,
        passwd=passwd, db=db,
        port=port)

sql = """SELECT b.date, b.weekday, b.StopPointID, b.PlannedTime_Arr, (b.ActualTime_Arr - b.PlannedTime_Arr) as ActualTime_Arr, b.temp, b.wind_speed, b.weather_main, b.morning_rush, b.evening_rush, b.bank_holiday 
            FROM dublin_bus_time.route_79 b
            where b.direction = '1'
            and b.StopPointID <> '326'"""
df = pd.read_sql_query(sql, conn, parse_dates=['date'])
    

In [3]:
df.head(5)

Unnamed: 0,date,weekday,StopPointID,PlannedTime_Arr,ActualTime_Arr,temp,wind_speed,weather_main,morning_rush,evening_rush,bank_holiday
0,2018-01-01,0,1443,39180,178,5.02,9.77,Clouds,N,N,Y
1,2018-01-01,0,1444,39180,229,5.02,9.77,Clouds,N,N,Y
2,2018-01-01,0,1445,39180,289,5.02,9.77,Clouds,N,N,Y
3,2018-01-01,0,2637,39180,387,5.02,9.77,Clouds,N,N,Y
4,2018-01-01,0,2638,39180,399,5.02,9.77,Clouds,N,N,Y


In [4]:
df.dtypes

date               datetime64[ns]
weekday                    object
StopPointID                object
PlannedTime_Arr             int64
ActualTime_Arr              int64
temp                      float64
wind_speed                float64
weather_main               object
morning_rush               object
evening_rush               object
bank_holiday               object
dtype: object

In [5]:
df.isna().any()

date               False
weekday            False
StopPointID        False
PlannedTime_Arr    False
ActualTime_Arr     False
temp                True
wind_speed          True
weather_main        True
morning_rush       False
evening_rush       False
bank_holiday       False
dtype: bool

In [6]:
#drop any rows with null values

df = df.dropna()

In [7]:
df = pd.get_dummies(df)

In [8]:
df.head(10)

Unnamed: 0,date,PlannedTime_Arr,ActualTime_Arr,temp,wind_speed,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,...,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,morning_rush_N,morning_rush_Y,evening_rush_N,evening_rush_Y,bank_holiday_N,bank_holiday_Y
0,2018-01-01,39180,178,5.02,9.77,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
1,2018-01-01,39180,229,5.02,9.77,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
2,2018-01-01,39180,289,5.02,9.77,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
3,2018-01-01,39180,387,5.02,9.77,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
4,2018-01-01,39180,399,5.02,9.77,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
5,2018-01-01,39180,484,6.01,12.35,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
6,2018-01-01,39180,534,6.01,12.35,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
7,2018-01-01,39180,606,6.01,12.35,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
8,2018-01-01,39180,681,6.01,12.35,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
9,2018-01-01,39180,741,6.01,12.35,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1


In [9]:
df.dtypes

date                    datetime64[ns]
PlannedTime_Arr                  int64
ActualTime_Arr                   int64
temp                           float64
wind_speed                     float64
weekday_0                        uint8
weekday_1                        uint8
weekday_2                        uint8
weekday_3                        uint8
weekday_4                        uint8
weekday_5                        uint8
weekday_6                        uint8
StopPointID_1443                 uint8
StopPointID_1444                 uint8
StopPointID_1445                 uint8
StopPointID_2637                 uint8
StopPointID_2638                 uint8
StopPointID_2640                 uint8
StopPointID_2641                 uint8
StopPointID_2643                 uint8
StopPointID_2644                 uint8
StopPointID_2645                 uint8
StopPointID_2646                 uint8
StopPointID_2647                 uint8
StopPointID_2648                 uint8
StopPointID_2649         

In [10]:
#split the data into train and test

train = df[df['date'] < '2018-10-01']
test = df[df['date'] >= '2018-10-01']

In [11]:
train.tail(5)

Unnamed: 0,date,PlannedTime_Arr,ActualTime_Arr,temp,wind_speed,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,...,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,morning_rush_N,morning_rush_Y,evening_rush_N,evening_rush_Y,bank_holiday_N,bank_holiday_Y
263150,2018-09-30,84600,1338,6.45,5.14,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
263151,2018-09-30,84600,1380,6.45,5.14,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
263152,2018-09-30,84600,1499,6.45,5.14,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
263153,2018-09-30,84600,1296,6.45,5.14,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
263154,2018-09-30,84600,251,6.45,5.14,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0


In [12]:
test.head(5)

Unnamed: 0,date,PlannedTime_Arr,ActualTime_Arr,temp,wind_speed,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,...,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,morning_rush_N,morning_rush_Y,evening_rush_N,evening_rush_Y,bank_holiday_N,bank_holiday_Y
263155,2018-10-01,45300,132,10.91,7.2,1,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
263156,2018-10-01,45300,186,10.91,7.2,1,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
263157,2018-10-01,45300,280,10.91,7.2,1,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
263158,2018-10-01,45300,494,10.91,7.2,1,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
263159,2018-10-01,45300,636,10.91,7.2,1,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0


Drop date from train and test

In [13]:
train = train.drop(['date'], 1)
test = test.drop(['date'], 1)

In [14]:
#Need to reset indexes for test

test.reset_index(drop=True, inplace=True)

In [15]:
X_train = train.drop(['ActualTime_Arr'], 1)
Y_train = train['ActualTime_Arr']
X_test = test.drop(['ActualTime_Arr'], 1)
Y_test = test['ActualTime_Arr']


In [16]:
X_test.head(5)

Unnamed: 0,PlannedTime_Arr,temp,wind_speed,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,morning_rush_N,morning_rush_Y,evening_rush_N,evening_rush_Y,bank_holiday_N,bank_holiday_Y
0,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
1,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
2,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
3,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
4,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0


In [17]:
Y_test.head(5)

0    132
1    186
2    280
3    494
4    636
Name: ActualTime_Arr, dtype: int64

Linear Regression:

In [18]:
multiple_linreg = LinearRegression().fit(X_train, Y_train)

In [19]:
list(zip(X_train.columns, multiple_linreg.coef_))

[('PlannedTime_Arr', -0.003246991333201197),
 ('temp', 5.750879969318618),
 ('wind_speed', 4.584504938969403),
 ('weekday_0', -18.35591058820844),
 ('weekday_1', 30.861459174296158),
 ('weekday_2', 49.44947260994013),
 ('weekday_3', 90.3298194628326),
 ('weekday_4', 103.47384436036526),
 ('weekday_5', -106.97526388587676),
 ('weekday_6', -148.78342113334224),
 ('StopPointID_1443', -1044.437639446601),
 ('StopPointID_1444', -978.8085251874954),
 ('StopPointID_1445', -903.7514134163619),
 ('StopPointID_2637', -725.6822556501279),
 ('StopPointID_2638', -665.0293919675099),
 ('StopPointID_2640', -480.20994767402107),
 ('StopPointID_2641', -430.4214864253797),
 ('StopPointID_2643', -354.1784364544414),
 ('StopPointID_2644', -294.06952939982733),
 ('StopPointID_2645', -227.23256663791977),
 ('StopPointID_2646', -191.7813940796984),
 ('StopPointID_2647', -131.52187247357563),
 ('StopPointID_2648', -75.52015204730338),
 ('StopPointID_2649', -32.02894579789212),
 ('StopPointID_2650', 27.0840158

In [20]:
# calculate the prediction
multiple_linreg_predictions_train = multiple_linreg.predict(X_train)*1.0


actual_vs_predicted_multiplelinreg = pd.concat([Y_train, pd.DataFrame(multiple_linreg_predictions_train, columns=['Predicted'])], axis=1)
print(actual_vs_predicted_multiplelinreg.head(100))

    ActualTime_Arr   Predicted
0            178.0  -62.130312
1            229.0    3.498802
2            289.0   78.555914
3            387.0  256.625072
4            399.0  317.277936
..             ...         ...
95           128.0  -49.662963
96           269.0  128.406195
97           309.0  189.059059
98           444.0  373.878503
99           503.0  423.666964

[100 rows x 2 columns]


In [21]:
# Some more evaluation metrics.
print("==================== Train Data =======================")
print("Mean Absolute Error: ", metrics.mean_absolute_error(Y_train, multiple_linreg_predictions_train))
print("Mean Squared Error: \n", metrics.mean_squared_error(Y_train, multiple_linreg_predictions_train))
print("Root mean Squared Error: \n", sqrt(metrics.mean_squared_error(Y_train, multiple_linreg_predictions_train)))
print("R2 score:\n ", metrics.r2_score(Y_train, multiple_linreg_predictions_train))
print("======================================================")

Mean Absolute Error:  215.07531265492673
Mean Squared Error: 
 99229.06075408362
Root mean Squared Error: 
 315.0064455754574
R2 score:
  0.7753937617659346


In [22]:
X_test.head(10)

Unnamed: 0,PlannedTime_Arr,temp,wind_speed,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,morning_rush_N,morning_rush_Y,evening_rush_N,evening_rush_Y,bank_holiday_N,bank_holiday_Y
0,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
1,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
2,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
3,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
4,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
5,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
6,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
7,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
8,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
9,45300,10.91,7.2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0


In [23]:
Y_test.head(10)

0     132
1     186
2     280
3     494
4     636
5     864
6     936
7    1012
8    1091
9    1169
Name: ActualTime_Arr, dtype: int64

In [24]:
# calculate the prediction
multiple_linreg_predictions_test = multiple_linreg.predict(X_test)

print("\nPredictions with multiple linear regression: \n")
actual_vs_predicted_multiplelinreg = pd.concat([Y_test, pd.DataFrame(multiple_linreg_predictions_test, columns=['Predicted'])], axis=1)
print(actual_vs_predicted_multiplelinreg.head(100))


Predictions with multiple linear regression: 

    ActualTime_Arr   Predicted
0              132  147.677330
1              186  213.306444
2              280  288.363556
3              494  466.432714
4              636  527.085577
..             ...         ...
95             564  449.241247
96             644  509.894111
97             821  694.713555
98             852  744.502016
99             914  820.745066

[100 rows x 2 columns]


In [25]:
# Some more evaluation metrics.
print("==================== Test Data =======================")
print("Mean Absolute Error: ", metrics.mean_absolute_error(Y_test, multiple_linreg_predictions_test))
print("Mean Squared Error: \n", metrics.mean_squared_error(Y_test, multiple_linreg_predictions_test))
print("Root mean Squared Error: \n", sqrt(metrics.mean_squared_error(Y_test, multiple_linreg_predictions_test)))
print("R2 score:\n ", metrics.r2_score(Y_test, multiple_linreg_predictions_test))
print("======================================================")

Mean Absolute Error:  237.21919213738923
Mean Squared Error: 
 119587.2219971485
Root mean Squared Error: 
 345.81385454771544
R2 score:
  0.7460667634313538


In [26]:
import pickle

In [27]:
with open('model.pkl', 'wb') as handle:
    pickle.dump(multiple_linreg, handle, pickle.HIGHEST_PROTOCOL)