# Dublin Bus Modelling 

In the notebook, first we will try Logistic Regression and evaluate the result 

In [None]:
#Import the required packages

#Import package pandas for data analysis
import pandas as pd

# Import package numpy for numeric computing
import numpy as np

# Import package matplotlib  for visualisation/plotting
import matplotlib.pyplot as plt

import matplotlib.patches as mpatches

#import package matplotlib for import plots to PDF
#from matplotlib.backends.backend_pdf import PdfPages


# Allows plots to appear directly in the notebook.
%matplotlib inline

#import package seaborn for visualisation
import seaborn as sns


In [None]:
#import sklearn for LogisticRegression
#from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score

#Import statsmodels package for training a linear regression model.
#import statsmodels.formula.api as sm


In [None]:
# Read the csv file to data frame 
%time df=pd.read_csv('Dublin_2012_Step2.csv')

In [None]:
df

In [None]:
#Drop index row 
df.drop(df[[0]],axis=1,inplace=True)

In [6]:
# Check how many rows we have after loaded the csv file (3169731, 13)
df.shape

(3169731, 13)

In [7]:
# Add hourOfDay to dataframe
df['datetime']=pd.to_datetime(df['datetime'])
df['HourOfDay']=df['datetime'].dt.hour

In [8]:
df.dtypes

Timestamp                      int64
Journey_Pattern_ID            object
Date                          object
Vehicle_Journey_ID             int64
Lon_WGS84                    float64
Lat_WGS84                    float64
Delay                          int64
Stop_ID                       object
At_Stop                        int64
Distance_Terminal            float64
datetime              datetime64[ns]
day_of_week                   object
Trip_Time                      int64
HourOfDay                      int64
dtype: object

In [9]:
df

Unnamed: 0,Timestamp,Journey_Pattern_ID,Date,Vehicle_Journey_ID,Lon_WGS84,Lat_WGS84,Delay,Stop_ID,At_Stop,Distance_Terminal,datetime,day_of_week,Trip_Time,HourOfDay
0,1352160000,00150001,2012-11-05,5826,-6.258584,53.340099,-361,4870,0,0.00,2012-11-06 00:00:00,Tuesday,0,0
1,1352160000,046A1002,2012-11-05,7267,-6.259093,53.345425,-1101,794,0,0.00,2012-11-06 00:00:00,Tuesday,0,0
2,1352160000,00140001,2012-11-05,6206,-6.257329,53.287521,-126,1047,0,0.00,2012-11-06 00:00:00,Tuesday,0,0
3,1352160002,041B0002,2012-11-05,61,-6.264167,53.453217,-623,3874,1,0.00,2012-11-06 00:00:02,Tuesday,0,0
4,1352160002,039A1002,2012-11-05,3795,-6.262447,53.346767,-532,1479,0,0.00,2012-11-06 00:00:02,Tuesday,0,0
5,1352160002,00650001,2012-11-05,4004,-6.594641,53.129776,-287,7283,0,0.00,2012-11-06 00:00:02,Tuesday,0,0
6,1352160002,040D1001,2012-11-05,2466,-6.258850,53.362499,-488,52,0,0.00,2012-11-06 00:00:02,Tuesday,0,0
7,1352160002,00111002,2012-11-05,5241,-6.230217,53.323002,-536,320,0,0.00,2012-11-06 00:00:02,Tuesday,0,0
8,1352160002,00311001,2012-11-05,2819,-6.241683,53.362484,-386,613,0,0.00,2012-11-06 00:00:02,Tuesday,0,0
9,1352160002,00270001,2012-11-05,4976,-6.290833,53.319332,0,2355,1,0.00,2012-11-06 00:00:02,Tuesday,0,0


In [10]:
#Find all the continuous features 

continuous_columns=df[['Lon_WGS84','Lat_WGS84','Delay','Distance_Terminal','Trip_Time']].columns

# Descriptive stats for continuous features
df[continuous_columns].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Lon_WGS84,3169731.0,-6.270168,0.081152,-6.615067,-6.309314,-6.264495,-6.228252,-6.053133
Lat_WGS84,3169731.0,53.343494,0.053058,53.070332,53.318523,53.34502,53.372772,53.606518
Delay,3169731.0,54.072913,421.422407,-8958.0,-153.0,13.0,227.0,15188.0
Distance_Terminal,3169731.0,10585.62807,7476.727585,0.0,4686.71,9470.0,15448.84,198057.59
Trip_Time,3169731.0,2077.140856,1400.227907,0.0,958.0,1834.0,2983.0,10792.0


In [11]:
#print("Unique Journey_Pattern_ID ", pd.unique(df.Journey_Pattern_ID.ravel()))

In [12]:
# df['day_of_week'] = df['day_of_week'].map({'Monday': 1, 'Tuesday': 2,'Wednesday':3,'Thursday':4,'Friday': 5,'Saturday':6,'Sunday':7})

In [11]:
df=df.loc[df['day_of_week']=='Monday']
df.shape

(503997, 14)

In [12]:
df.head()

Unnamed: 0,Timestamp,Journey_Pattern_ID,Date,Vehicle_Journey_ID,Lon_WGS84,Lat_WGS84,Delay,Stop_ID,At_Stop,Distance_Terminal,datetime,day_of_week,Trip_Time,HourOfDay
2665734,1352678402,00370001,2012-11-11,14536,-6.393299,53.370346,-94,4893,0,15748.64,2012-11-12 00:00:02,Monday,2481,0
2665735,1352678402,00271003,2012-11-11,16039,-6.261491,53.344337,-522,1358,0,15471.49,2012-11-12 00:00:02,Monday,2095,0
2665736,1352678402,041C0001,2012-11-11,13420,-6.238769,53.453152,-112,3708,1,13621.8,2012-11-12 00:00:02,Monday,2099,0
2665737,1352678402,00650003,2012-11-11,15515,-6.473654,53.217064,0,4012,0,24719.53,2012-11-12 00:00:02,Monday,2732,0
2665738,1352678402,01450001,2012-11-11,16801,-6.130641,53.242863,-539,3135,1,18120.79,2012-11-12 00:00:02,Monday,2434,0


In [13]:
df.reset_index(drop=True, inplace=True)

In [14]:
df.head()

Unnamed: 0,Timestamp,Journey_Pattern_ID,Date,Vehicle_Journey_ID,Lon_WGS84,Lat_WGS84,Delay,Stop_ID,At_Stop,Distance_Terminal,datetime,day_of_week,Trip_Time,HourOfDay
0,1352678402,00370001,2012-11-11,14536,-6.393299,53.370346,-94,4893,0,15748.64,2012-11-12 00:00:02,Monday,2481,0
1,1352678402,00271003,2012-11-11,16039,-6.261491,53.344337,-522,1358,0,15471.49,2012-11-12 00:00:02,Monday,2095,0
2,1352678402,041C0001,2012-11-11,13420,-6.238769,53.453152,-112,3708,1,13621.8,2012-11-12 00:00:02,Monday,2099,0
3,1352678402,00650003,2012-11-11,15515,-6.473654,53.217064,0,4012,0,24719.53,2012-11-12 00:00:02,Monday,2732,0
4,1352678402,01450001,2012-11-11,16801,-6.130641,53.242863,-539,3135,1,18120.79,2012-11-12 00:00:02,Monday,2434,0


In [14]:
#dummies = pd.get_dummies(df.Journey_Pattern_ID)
#df.join(dummies)

# Add Weather feature 

In [15]:
# Use Pandas to read weather data into data frame 
weather_columns=['Date','TimeOfDay','Cloud','Rain','Temp','Wind']
df_weather=pd.read_csv('weather.csv',names=weather_columns)
df_weather

Unnamed: 0,Date,TimeOfDay,Cloud,Rain,Temp,Wind
0,01/11/2012,night,3.60,0.00,1.30,7.00
1,01/11/2012,night,4.92,0.00,1.55,10.38
2,01/11/2012,night,4.52,0.05,2.97,13.00
3,01/11/2012,night,4.29,0.04,3.19,13.83
4,01/11/2012,night,4.59,0.03,3.44,14.34
5,01/11/2012,night,4.22,0.03,3.75,15.30
6,01/11/2012,night,4.31,0.02,4.06,15.36
7,01/11/2012,night,4.29,0.02,4.02,14.98
8,01/11/2012,night,4.40,0.02,3.86,14.28
9,01/11/2012,night,4.07,0.02,3.83,13.98


In [16]:
df['TimeOfDay']=df['datetime'].dt.hour//12
df['TimeOfDay'] = df['TimeOfDay'].map({1: 'pm', 0: 'am'})

## Predictive Modeling: Logistic Regression

### Train the model with scikit-learn

In [42]:
intercept = pd.DataFrame({'Intercept': np.ones(503997)})
intercept

Unnamed: 0,Intercept
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


In [16]:
# Prepare the descriptive features
X = pd.concat([intercept, df[['Journey_Pattern_ID','Distance_Terminal','HourOfDay']]], axis=1)
y = df.Trip_Time 

print("Descriptive features:\n", X)
print("\nTarget feature:\n", y)

NameError: name 'intercept' is not defined

In [44]:
X

Unnamed: 0,Intercept,Journey_Pattern_ID,Distance_Terminal,HourOfDay
0,1.0,00370001,15748.64,0
1,1.0,00271003,15471.49,0
2,1.0,041C0001,13621.80,0
3,1.0,00650003,24719.53,0
4,1.0,01450001,18120.79,0
5,1.0,00390002,21626.81,0
6,1.0,00790001,9694.26,0
7,1.0,00400001,20631.60,0
8,1.0,00150001,21090.26,0
9,1.0,00401002,14920.31,0


In [20]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

for col in X.columns.values:
    if X[col].dtypes=='object':
        # Using whole data to form an exhaustive list of levels
        data=X[col]
        le.fit(data.values)
        X[col]=le.transform(X[col])
X

Unnamed: 0,Journey_Pattern_ID,Distance_Terminal
0,84,15748.64
1,70,15471.49
2,367,13621.80
3,144,24719.53
4,225,18120.79
5,93,21626.81
6,174,9694.26
7,95,20631.60
8,43,21090.26
9,99,14920.31


In [46]:
X.dtypes

Intercept             float64
Journey_Pattern_ID      int64
Distance_Terminal     float64
HourOfDay               int64
dtype: object

In [None]:
# Train a model using logistic regression from scikit-learn.
# Use only the descriptive feature Size.
%time logreg = LogisticRegression().fit(X, y)

In [None]:
Y

In [None]:
# Print the valuation of the model 
predictions = logreg.predict(X)
print("Accuracy: ", metrics.accuracy_score(y, predictions))
print("Confusion matrix: \n", metrics.confusion_matrix(y, predictions))
print("Classification report:\n ", metrics.classification_report(y, predictions))


In [24]:
from sklearn.linear_model import LinearRegression
lm_sk = LinearRegression()
lm_sk.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
predictions=lm_sk.predict(X)

In [31]:
predicted

array([ 2918.56373851,  2890.63025083,  2381.33312645, ...,  1669.36350286,
        2942.12572019,  1612.07125912])

In [None]:
accuracy_score(y,knn.predict(X_test_minmax))

# Train the model with scikit-learn Lear regression 

In [None]:
% matplotlib inline
df.plot(kind='scatter', x='Distance_Terminal', y='Trip_Time',  figsize=(5, 5))

In [15]:
# Prepare the descriptive features
X = pd.concat([df[['Journey_Pattern_ID','Distance_Terminal','HourOfDay']]], axis=1)
y = df.Trip_Time 

print("Descriptive features:\n", X)
print("\nTarget feature:\n", y)

Descriptive features:
        Journey_Pattern_ID  Distance_Terminal  HourOfDay
0                00370001           15748.64          0
1                00271003           15471.49          0
2                041C0001           13621.80          0
3                00650003           24719.53          0
4                01450001           18120.79          0
5                00390002           21626.81          0
6                00790001            9694.26          0
7                00400001           20631.60          0
8                00150001           21090.26          0
9                00401002           14920.31          0
10               041C1001           16856.07          0
11               00071001           12702.19          0
12               00180001           15058.84          0
13               00471001           16158.01          0
14               00391001           17580.77          0
15               00181001           14970.50          0
16               00671001

In [19]:
df_linear = pd.concat([X, y], axis=1)
%time lm = sm.ols(formula = "Trip_Time ~ Journey_Pattern_ID+Distance_Terminal + HourOfDay", data=df_linear).fit()

CPU times: user 4min 38s, sys: 20min 4s, total: 24min 43s
Wall time: 28min 5s


In [None]:
df.plot(kind='scatter', x='Distance_Terminal', y='Trip_Time')
plt.plot(y, lm_predictions, c='red', linewidth=2)

# Training the model with SVM : SVR(Support Vector Regression)

In [17]:
#Import SVM and assign to a model 
from sklearn import svm
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
clf = svm.SVR()

In [18]:
# Prepare the descriptive features
X = pd.concat([df[['Journey_Pattern_ID','Distance_Terminal']]], axis=1)
y = df.Trip_Time 

print("Descriptive features:\n", X)
print("\nTarget feature:\n", y)

Descriptive features:
        Journey_Pattern_ID  Distance_Terminal
0                00370001           15748.64
1                00271003           15471.49
2                041C0001           13621.80
3                00650003           24719.53
4                01450001           18120.79
5                00390002           21626.81
6                00790001            9694.26
7                00400001           20631.60
8                00150001           21090.26
9                00401002           14920.31
10               041C1001           16856.07
11               00071001           12702.19
12               00180001           15058.84
13               00471001           16158.01
14               00391001           17580.77
15               00181001           14970.50
16               00671001           23819.81
17               00310003           17310.73
18               01450001           25255.69
19               00470001           15801.39
20               033B0001       

In [None]:
# Train the model 
%time clf.fit(X, y)

CPU times: user 2h 15min 8s, sys: 57.1 s, total: 2h 16min 5s
Wall time: 18h 56min 23s


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:
clf.predict(X)

In [None]:
df.plot(kind='scatter', x='Distance_Terminal', y='Trip_Time')
plt.plot(y, lm_predictions, c='red', linewidth=2)