# Project Dublin Bus: Predict Travel Time for Dublin Bus
## Modeling one week of data to compare the results of two different models

Linear Regression vs Random Forest: RF Regressor

In [2]:
# Import all the packages we need 
import pandas as pd
import numpy as np
import pickle

In [3]:
# Read the week-1 CSV file into a DataFrame (timed with %time).
# Journey_Pattern_ID is forced to `object` so pandas keeps the raw values as
# strings instead of inferring a numeric dtype for that column.
%time df = pd.read_csv('dublin_2012_week1.csv',dtype={ 'Journey_Pattern_ID': object})
# Display the dtype pandas assigned to every column.
df.dtypes



CPU times: user 12.9 s, sys: 3.1 s, total: 16 s
Wall time: 17.9 s


Timestamp               int64
Line_ID                object
Direction               int64
Journey_Pattern_ID     object
Date                   object
Vehicle_Journey_ID      int64
Operator               object
Congestion              int64
Lon_WGS84             float64
Lat_WGS84             float64
Delay                   int64
Block_ID                int64
Vehicle_ID              int64
Stop_ID                object
At_Stop                 int64
Distance                int64
Trip_Time               int64
datetime               object
HourOfDay               int64
day_of_week            object
midweek                 int64
dtype: object

In [5]:
# Check the first 3 rows
df.head(3)

Unnamed: 0,Journey_Pattern_ID,Date,Vehicle_Journey_ID,Lon_WGS84,Lat_WGS84,Stop_ID,At_Stop,Distance,Trip_Time,datetime,HourOfDay,day_of_week,midweek
0,10001,2012-11-06,5425,-6.26195,53.391151,226,1,0,0,2012-11-06 10:15:06,10,Tuesday,0
1,10001,2012-11-06,5425,-6.262341,53.391186,226,0,26,330,2012-11-06 10:20:36,10,Tuesday,0
2,10001,2012-11-06,5425,-6.257599,53.391567,228,0,418,392,2012-11-06 10:21:38,10,Tuesday,0


In [6]:
# Check the data frame size as (rows, columns)
df.shape

(3028113, 13)

## Modeling with Linear Regression on the full data set

In [7]:
#Import packages 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [8]:
# Select the descriptive features (X) and the target (y) for the regression.
# The final expression displays the chosen feature columns as a sanity check.
feature_cols = ['Distance','midweek','HourOfDay']
X = df[feature_cols]
y = df['Trip_Time']
X.columns

Index(['Distance', 'midweek', 'HourOfDay'], dtype='object')

In [9]:
# Display the target's shape (y is the Trip_Time column of df)
y.shape

(3028113,)

In [10]:
# Fit one polynomial-regression pipeline per degree and report its fit quality.
# NOTE: both metrics are computed on the same data the model was fitted on
# (in-sample), so they are optimistic estimates of the error on unseen data.
degrees = [1, 2]

for degree in degrees:
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X, y)

    # R2: coefficient of determination returned by the pipeline's score()
    score = pipeline.score(X, y)
    print("The R2 score of the model is ", score)

    # MAE: Mean Absolute Error
    mae = abs(y - pipeline.predict(X)).mean()
    print("Mean Absolute Error of degree", degree, " is ", mae)

The R2 score of the model is  0.785186342919
Mean Absolute Error of degree 1  is  465.24207293
The R2 score of the model is  0.803195856605
Mean Absolute Error of degree 2  is  448.044523944


In [11]:
# Build and fit a degree-4 polynomial regression on the full data set (timed).
# NOTE(review): degree 4 is hard-coded here, while the comparison loop earlier in
# this notebook only tried degrees 1 and 2 — confirm how 4 was chosen.
polynomial_features = PolynomialFeatures(degree=4,include_bias=False)
linear_regression = LinearRegression()
pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
%time pipeline.fit(X, y)

# Disabled plotting code (note: `plt` is not imported in any visible cell):
#df.plot(kind='scatter', x='Distance', y='Trip_Time',label="Samples")
#plt.plot(X['Distance'], pipeline.predict(X), c='Blue', label="Model")

#plt.savefig('Linear_Reg_Poly.png')

CPU times: user 10.8 s, sys: 5.82 s, total: 16.6 s
Wall time: 17.1 s


Pipeline(steps=[('polynomial_features', PolynomialFeatures(degree=4, include_bias=False, interaction_only=False)), ('linear_regression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [12]:
# Inspect the hyper-parameters of the LinearRegression step inside the pipeline
pipeline.named_steps['linear_regression'].get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': 1, 'normalize': False}

In [13]:
# R2 of the degree-4 pipeline, scored on the same data it was fitted on (in-sample)
pipeline.score(X,y)

0.81213979549023552

In [14]:
# MSE: Mean Squared Error of the degree-4 pipeline (in-sample)
residuals = y - pipeline.predict(X)
mse = (residuals ** 2).mean()
print("\n Mean Squared Error", mse)


 Mean Squared Error 360855.044804


In [15]:
# MAE: Mean Absolute Error of the degree-4 pipeline (in-sample)
abs_errors = (y - pipeline.predict(X)).abs()
mae = abs_errors.mean()
print("Mean Absolute Error ", mae)

Mean Absolute Error  432.160800263


In [16]:
# Pickle the linear regression pipeline fitted on the full data set.
# The context manager closes the file (flushing the pickle to disk) even if
# dump() raises, instead of leaving the handle open until garbage collection.
with open('linear_model.sav', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [17]:
#pipeline.get_params()

# Train on the full data set with Random Forest

In [18]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
#sklearn.ensemble.RandomForestRegressor
# NOTE(review): load_boston, Imputer and MultiOutputRegressor are not used in any
# visible cell. load_boston and Imputer have been removed from recent scikit-learn
# releases, so these imports may fail there — confirm and consider dropping them.
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.multioutput import MultiOutputRegressor
# Cap the tree depth at 30 and pin random_state so repeated fits give the same forest.
max_depth = 30
regr_rf = RandomForestRegressor(max_depth=max_depth,random_state=2)

In [19]:
# Random forest regressor; API reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
max_depth = 30
regr_rf = RandomForestRegressor(random_state=2, max_depth=max_depth)

# Descriptive features (an independent copy of the selected columns) and target
X = df[['Distance', 'midweek', 'HourOfDay']].copy()
y = df['Trip_Time']

#print("Descriptive features:\n", X)
#print("\nTarget feature:\n", y)

In [20]:
# Fit the random forest on the full data set (timed)
%time regr_rf.fit(X,y)

CPU times: user 1min 23s, sys: 1.04 s, total: 1min 24s
Wall time: 1min 25s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=2,
           verbose=0, warm_start=False)

In [21]:
# List the hyper-parameters the forest was built with
regr_rf.get_params()

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 30,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 2,
 'verbose': 0,
 'warm_start': False}

In [22]:
# R2 of the random forest, scored on the same data it was fitted on (in-sample)
regr_rf.score(X,y)

0.87160365813087737

In [23]:
# MAE: Mean Absolute Error of the random forest, computed in-sample.

# For comparison, the degree-4 linear regression MAE printed earlier in this
# notebook is 432.160800263; the previously quoted 253.761443277 is not
# produced by any visible cell.
mae = abs(y-regr_rf.predict(X)).mean()
print("Mean Absolute Error of RF ",mae)

Mean Absolute Error of RF  353.194979583


In [24]:
# Pickle the Random Forest model.
# The context manager guarantees the file is closed (and fully flushed) even if
# dump() raises. The file name spelling is kept as-is because other code may
# load the model by this exact name.
with open('rf_modle.sav', 'wb') as model_file:
    pickle.dump(regr_rf, model_file)

In [25]:
# Keep the in-sample random forest predictions; in the visible cells they are
# only referenced by the disabled plotting code in the next cell.
y_regr_rf=regr_rf.predict(X)

In [26]:
#plt.plot(X['Distance'],y_regr_rf, color='red', lable='Random Forest')

#plt.scatter(X['Distance'], y, color='darkorange', label='data')



# Random Forest: train on all data with Journey Pattern ID

In [27]:
# Random forest regressor; API reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
max_depth = 30
regr_rf = RandomForestRegressor(random_state=2, max_depth=max_depth)

# Descriptive features, now including Journey_Pattern_ID. An independent copy is
# taken so that encoding the ID column afterwards does not modify df.
X = df[['Journey_Pattern_ID', 'Distance', 'midweek', 'HourOfDay']].copy()
y = df['Trip_Time']

#print("Descriptive features:\n", X)
#print("\nTarget feature:\n", y)

In [28]:
# Inspect the feature dtypes to see which columns still need a numeric encoding
X.dtypes

Journey_Pattern_ID    object
Distance               int64
midweek                int64
HourOfDay              int64
dtype: object

In [29]:
from sklearn.preprocessing import LabelEncoder

# Replace every string (object) column of X with integer codes.
# One fitted encoder is kept per column in `label_encoders`, so the exact same
# string -> integer mapping can be re-applied (or pickled next to the model)
# when predicting on new data; a single shared encoder would be overwritten by
# each subsequent column and its mapping lost.
# The codes follow the sorted order of the labels and carry no numeric meaning.
label_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    # Fit on the whole column so every level present in the data gets a code.
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [30]:
# Fit the random forest on the full data set, now with the encoded Journey_Pattern_ID (timed)
%time regr_rf.fit(X,y)

CPU times: user 1min 54s, sys: 1.81 s, total: 1min 56s
Wall time: 1min 57s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=2,
           verbose=0, warm_start=False)

In [31]:
# R2 of the random forest with Journey_Pattern_ID, scored on its own training data (in-sample)
regr_rf.score(X,y)

0.98254118424886283

In [32]:
# MAE: Mean Absolute Error of the random forest with Journey_Pattern_ID, computed in-sample.

# For comparison, the degree-4 linear regression MAE printed earlier in this
# notebook is 432.160800263; the previously quoted 253.761443277 is not
# produced by any visible cell.
mae = abs(y-regr_rf.predict(X)).mean()
print("Mean Absolute Error of RF ",mae)

Mean Absolute Error of RF  112.931756316


In [33]:
# Pickle the Random Forest model trained with the Journey_Pattern_ID feature.
# The context manager guarantees the file is closed (and fully flushed) even if
# dump() raises. The file name spelling is kept as-is because other code may
# load the model by this exact name.
with open('rf_modle_journey.sav', 'wb') as model_file:
    pickle.dump(regr_rf, model_file)

## Split the data into training and test sets and repeat the modeling

In [34]:
# Split the data into a training set (70%) and a test set (30%).
# train_test_split is imported from sklearn.model_selection, the module this
# notebook already uses for cross_val_score; sklearn.cross_validation is its
# deprecated former location.
# random_state pins the shuffle so the split, and every score derived from it,
# is reproducible across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=2)



# Linear Regression 

In [35]:
# Use a pipeline to expand the features to degree-2 polynomial terms and fit a
# linear regression on the training split only (timed).
# NOTE(review): X_train is derived from the most recent X, which includes the
# label-encoded Journey_Pattern_ID, so this model uses a different feature set
# than the linear models fitted on the full data set earlier — confirm this is intended.
polynomial_features = PolynomialFeatures(degree=2,include_bias=False)
linear_regression = LinearRegression()
pipeline_train = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
%time pipeline_train.fit(X_train, y_train)

CPU times: user 2.47 s, sys: 617 ms, total: 3.09 s
Wall time: 2.61 s


Pipeline(steps=[('polynomial_features', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('linear_regression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [36]:
# R2 of the linear regression on the training split
pipeline_train.score(X_train,y_train)

0.8160879335643636

In [37]:
# R2 of the linear regression on the held-out test split
pipeline_train.score(X_test,y_test)

0.81732111474322133

In [38]:
# MAE: Mean Absolute Error of the linear regression on the training data.
# The printed label names the linear model: pipeline_train is the polynomial
# linear regression, not the random forest.
mae = abs(y_train-pipeline_train.predict(X_train)).mean()
print("Mean Absolute Error of linear regression (train) ",mae)

Mean Absolute Error of RF  435.127878962


In [39]:
# MAE: Mean Absolute Error of the linear regression on the testing data.
# The printed label names the linear model: pipeline_train is the polynomial
# linear regression, not the random forest.
mae = abs(y_test-pipeline_train.predict(X_test)).mean()
print("Mean Absolute Error of linear regression (test) ",mae)

Mean Absolute Error of RF  435.066194255


# Random Forest

In [40]:
# Fresh forest with the same depth cap and seed as before, to be fitted on the training split only
max_depth = 30
regr_rf_Train = RandomForestRegressor(max_depth=max_depth,random_state=2)

In [41]:
# Fit the random forest on the training split (timed)
%time regr_rf_Train.fit(X_train,y_train)

CPU times: user 1min 37s, sys: 1.55 s, total: 1min 39s
Wall time: 5min 42s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=2,
           verbose=0, warm_start=False)

In [42]:
# R2 of the random forest on the training split
regr_rf_Train.score(X_train,y_train)

0.98324152054448777

In [43]:
# R2 of the random forest on the held-out test split
regr_rf_Train.score(X_test,y_test)

0.91927519515838307

In [44]:
# MAE: Mean Absolute Error of the random forest on the training split
train_abs_errors = (y_train - regr_rf_Train.predict(X_train)).abs()
mae = train_abs_errors.mean()
print("Mean Absolute Error of RF ", mae)

Mean Absolute Error of RF  110.297612658


In [45]:
# MAE: Mean Absolute Error of the random forest on the held-out test split
test_abs_errors = (y_test - regr_rf_Train.predict(X_test)).abs()
mae = test_abs_errors.mean()
print("Mean Absolute Error of RF test data set  ", mae)

Mean Absolute Error of RF test data set   252.388175721


## Result Analysis 

Comparing the two models, we found that the random forest achieves better accuracy than linear regression (a mean absolute error of about 252 versus about 435 on the test set).

However, the random forest model is much larger than the linear regression one: the random forest trained with the Journey_Pattern_ID feature is almost 2 GB, which makes it impossible to use in the web application.

The linear regression model is only about 1 KB in size, and its mean absolute error is around 7 minutes. This is not as good as the random forest, but it is still reasonable to use.
