In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess
from scipy import stats
from scipy.stats import linregress
import seaborn as sns
import datetime as dt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

from joblib import dump, load

sns.set()

In [2]:
data = pd.read_csv("clean_bike_data.csv")
data['DATETIME'] =  pd.to_datetime(data['DATETIME'], infer_datetime_format=True)
data = data[data["YEAR"] >= 2012]
data = data.dropna()
data

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,LAT,LON,DATETIME
12736,Theft of Bicycle,2012,1,1,12,0,22XX W BROADWAY AVE,Kitsilano,49.263854,-123.157311,2012-01-01 12:00:00
12737,Theft of Bicycle,2012,1,1,12,0,23XX W 5TH AVE,Kitsilano,49.267415,-123.159482,2012-01-01 12:00:00
12738,Theft of Bicycle,2012,1,1,16,59,7XX JACKSON AVE,Strathcona,49.278221,-123.093622,2012-01-01 16:59:00
12739,Theft of Bicycle,2012,1,1,17,0,11XX W 13TH AVE,Fairview,49.259737,-123.130651,2012-01-01 17:00:00
12740,Theft of Bicycle,2012,1,2,0,1,35XX MOSCROP ST,Renfrew-Collingwood,49.243463,-123.027149,2012-01-02 00:01:00
...,...,...,...,...,...,...,...,...,...,...,...
30974,Theft of Bicycle,2019,10,22,13,45,20XX W 4TH AVE,Kitsilano,49.268138,-123.151383,2019-10-22 13:45:00
30975,Theft of Bicycle,2019,10,22,18,15,27XX W BROADWAY AVE,Kitsilano,49.264101,-123.167048,2019-10-22 18:15:00
30976,Theft of Bicycle,2019,10,22,20,34,63XX YEW ST,Kerrisdale,49.228779,-123.158805,2019-10-22 20:34:00
30977,Theft of Bicycle,2019,10,23,22,48,39XX W 38TH AVE,Dunbar-Southlands,49.237564,-123.191084,2019-10-23 22:48:00


# 7. Bike Theft Prediction

In [3]:
le_neigh = preprocessing.LabelEncoder()
le_block = preprocessing.LabelEncoder()
le_weath = preprocessing.LabelEncoder()

df_class = data.copy(deep=True)
df_class["NEIGHBOURHOOD"] = df_class["NEIGHBOURHOOD"].apply(lambda x: str(x))
df_class["HUNDRED_BLOCK"] = df_class["HUNDRED_BLOCK"].apply(lambda x: str(x))
# df_class["WEATHER"] = df_class["WEATHER"]


le_neigh.fit(df_class["NEIGHBOURHOOD"])
df_class["NEIGHBOURHOOD"] = le_neigh.transform(df_class["NEIGHBOURHOOD"])

le_block.fit(df_class["HUNDRED_BLOCK"])
df_class["HUNDRED_BLOCK"] = le_block.transform(df_class["HUNDRED_BLOCK"])

# le_weath.fit(df_class["WEATHER"])
# df_class["WEATHER"] = le_weath.transform(df_class["WEATHER"])

In [4]:
X = df_class[["MONTH", "DAY", "NEIGHBOURHOOD", "HUNDRED_BLOCK"]]
y = df_class["HOUR"]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.80)

# Saved Model Scores

In [17]:
dtc = load('models/DecisionTreeClassifier.joblib')
g = load('models/GaussianNB.joblib') 
rfc = load('models/RandomForestClassifier.joblib') 
gbc = load('models/GradientBoostingClassifier.joblib') 

In [18]:
print("Decision Tree Train: ", dtc.score(X_train, y_train))
print("Decision Tree Valid: ", dtc.score(X_valid, y_valid))

Decision Tree Train:  0.09703133589884552
Decision Tree Valid:  0.08026388125343595


In [19]:
print("Bayes Train: ", g.score(X_train, y_train))
print("Bayes Valid: ",g.score(X_valid, y_valid))

Bayes Train:  0.09098405717427158
Bayes Valid:  0.08431830676195712


In [20]:
print("Random Forest Train: ", rfc.score(X_train, y_train))
print("Random Forest Valid: ", rfc.score(X_valid, y_valid))

Random Forest Train:  0.13056624518966464
Random Forest Valid:  0.08424958768554151


In [21]:
print("Gradient Boost Train: ", gbc.score(X_train, y_train))
print("Gradient Boost Valid: ", gbc.score(X_valid, y_valid))

Gradient Boost Train:  0.33782297965915337
Gradient Boost Valid:  0.07153655854865311


# Model Training

Only run if you want to re-train the model

### Decision Tree

In [5]:
model = DecisionTreeClassifier(max_depth=4)
model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.score(X_valid, y_valid))

p_test = {'max_depth':[1,2,3,4,5,6,7,8,9,10]}

tuning = GridSearchCV(estimator=DecisionTreeClassifier(),
                      param_grid = p_test,
                      scoring='accuracy',
                      n_jobs=6,
                      cv=5,
                      iid=False)

tuning.fit(X_train, y_train)
print(tuning.best_params_)

dump(model, 'models/DecisionTreeClassifier.joblib') 

0.09703133589884552
0.08026388125343595
{'max_depth': 1}


['models/DecisionTreeClassifier.joblib']

### Random Forest

In [6]:
p_test = {'max_depth': [1,2,3,4,5,6,7,8,9,10],
         'n_estimators': [10, 20, 30, 40, 50]}

tuning = GridSearchCV(estimator=RandomForestClassifier(),
                      param_grid = p_test,
                      scoring='accuracy',
                      n_jobs=6,
                      cv=5,
                      iid=False)

tuning.fit(X_train, y_train)
print(tuning.best_params_)

model = RandomForestClassifier(n_estimators=tuning.best_params_["n_estimators"],
                               max_depth=tuning.best_params_["max_depth"])
model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.score(X_valid, y_valid))

dump(model, 'models/RandomForestClassifier.joblib') 

{'max_depth': 4, 'n_estimators': 40}
0.13056624518966464
0.08424958768554151


['models/RandomForestClassifier.joblib']

### Bayes

In [7]:
model1 = GaussianNB()
model1.fit(X_train, y_train)

print(model1.score(X_train, y_train))
print(model1.score(X_valid, y_valid))

dump(model1, 'models/GaussianNB.joblib') 

0.09098405717427158
0.08431830676195712


['models/GaussianNB.joblib']

### Gradient Boosting

In [16]:
model = GradientBoostingClassifier(n_estimators=40)
model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.score(X_valid, y_valid))

dump(model, 'models/GradientBoostingClassifier.joblib') 

0.33782297965915337
0.07153655854865311


['models/GradientBoostingClassifier.joblib']