# Taxi Demand Prediction - Support Vector Machine
---
In this notebook, we build a model for predicting taxi demand in Chicago. The model is based on the information provided by the city of Chicago in 2015.

Furthermore, the model is based on all of the available data. Because the prediction target, taxi demand, is computed by aggregating the individual trips, the resulting data set is small and we won't encounter any memory problems.

To build our demand prediction model, we proceed as follows:

In [1]:
cd ..

/Users/simonwolf/git/aaa21


In [2]:
import utils, feature_engineering, geo_engineering, preprocessing, prediction_utils, prediction_svm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Daily Models
---
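The daily models predict taxi demand at a daily temporal resolution: trips are aggregated per day and pickup area, joined with weather features, and a support vector regressor is then fitted and evaluated on time-ordered splits.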

## Data Preparation
---

In [3]:
# Takes a few minutes to run (16 GB RAM).
# Both reads are restricted to the columns listed in `columns=`.
chicago_df = utils.read_parquet(
    'Taxi_Trips_Cleaned.parquet',
    columns=[
        'Trip ID',
        'Trip Start Timestamp',
        'Pickup Community Area',
        'Dropoff Community Area',
    ],
)
weather_df = utils.read_parquet(
    'Weather.parquet',
    columns=[
        'Trip Start Timestamp',
        'Humidity(%)',
        'Pressure(hPa)',
        'Temperature(C)',
        'Wind Direction(Meteoro. Degree)',
        'Wind Speed(M/S)',
    ],
)

# Combine trips and weather at daily resolution ('D').
daily_demand = preprocessing.create_aggregated_data(
    df=chicago_df,
    weather_df=weather_df,
    temporal_resolution='D',
)

# Hexagon-based variants (resolutions 7 and 6) are currently disabled:
# daily_demand_hex_7 = preprocessing.create_aggregated_data(
#     df=chicago_df, weather_df=weather_df, temporal_resolution='D',
#     use_hexes=True, hex_resolution=7)
# daily_demand_hex_6 = preprocessing.create_aggregated_data(
#     df=chicago_df, weather_df=weather_df, temporal_resolution='D',
#     use_hexes=True, hex_resolution=6)

# The raw frames are not referenced again in this notebook section; drop the
# names so the kernel can reclaim their memory.
del chicago_df, weather_df

### Daily Model - Community Areas
---

In [4]:
# Inspect the aggregated daily demand frame built in the data-preparation cell.
daily_demand

Unnamed: 0,Trip Start Timestamp,Pickup Community Area,Demand (D),Humidity(%),Pressure(hPa),Temperature(C),Wind Direction(Meteoro. Degree),Wind Speed(M/S)
0,2015-01-01,1.0,406,100.000000,1034.250777,-5.435774,249.095220,9.069681
1,2015-01-01,10.0,32,100.000000,1034.250777,-5.435774,249.095220,9.069681
2,2015-01-01,11.0,73,100.000000,1034.250777,-5.435774,249.095220,9.069681
3,2015-01-01,12.0,15,100.000000,1034.250777,-5.435774,249.095220,9.069681
4,2015-01-01,13.0,39,100.000000,1034.250777,-5.435774,249.095220,9.069681
...,...,...,...,...,...,...,...,...
25240,2015-12-31,73.0,1,75.061278,1024.788420,-2.917027,265.509481,5.001468
25241,2015-12-31,75.0,1,75.061278,1024.788420,-2.917027,265.509481,5.001468
25242,2015-12-31,76.0,1450,75.061278,1024.788420,-2.917027,265.509481,5.001468
25243,2015-12-31,77.0,737,75.061278,1024.788420,-2.917027,265.509481,5.001468


In [5]:
import datetime

# `df` is only an alias of `daily_demand` (no copy is made), so anything done
# to `df` in place would also change `daily_demand`.
df = daily_demand

# Time-based splitter from the project module; train/test periods are given
# in units of `freq` (days).
tscv = prediction_svm.TimeBasedCV(train_period=333,
                                  test_period=31,
                                  freq='days')

# Separate target and features WITHOUT mutating the source frame.
# `DataFrame.pop` removes the column in place, which made this cell fail with
# a KeyError when re-run and broke every later cell that still expects
# "Demand (D)" to be present in `daily_demand`.
y = df["Demand (D)"]
X = df.drop(columns=["Demand (D)"])

In [6]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import math

# R^2 score of every cross-validation fold, in split order.
scores = []
for train_index, test_index in tscv.split(X):

    # The timestamp column is only needed for splitting, not as a model feature.
    data_train   = X.loc[train_index].drop('Trip Start Timestamp', axis=1)
    target_train = y.loc[train_index]

    data_test    = X.loc[test_index].drop('Trip Start Timestamp', axis=1)
    target_test  = y.loc[test_index]

    # Scaler and SVR are wrapped in one pipeline so the scaler is fitted on the
    # training fold only. TODO: optimize C / epsilon.
    regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.1))
    regr.fit(data_train, target_train)

    # Predict once per fold and derive every metric from the same predictions
    # (SVR prediction is expensive; the pipeline's .score() is the same R^2).
    y_pred = regr.predict(data_test)

    mae = metrics.mean_absolute_error(target_test, y_pred)
    mse = metrics.mean_squared_error(target_test, y_pred)
    r2score = metrics.r2_score(target_test, y_pred)

    scores.append(r2score)

    print("-------MODEL SCORES-------")
    print(f"MAE: {mae: .3f}")
    print(f"MSE: {mse: .3f}")
    print(f"RMSE: {math.sqrt(mse): .3f}")
    print(f"R2: {100 * r2score: .3f} %")

# Mean R^2 over all folds (a goodness-of-fit score, not an accuracy).
average_r2score = np.mean(scores)

In [4]:
# Second approach from https://medium.com/keita-starts-data-science/time-series-split-with-scikit-learn-74f5be38489e
target_column = "Demand (D)"

# Read the target without removing it from `daily_demand`, so this cell can be
# re-run safely. If an earlier cell has already popped the target column out
# of `daily_demand`, the `y` created there is reused instead of raising a
# KeyError here.
if target_column in daily_demand.columns:
    y = daily_demand[target_column]

# Features: everything except the timestamp and (if still present) the target.
X = (daily_demand
     .drop(columns=["Trip Start Timestamp"])
     .drop(columns=[target_column], errors="ignore"))

# Hold-out split by row position: the first 80 % of the rows are used for
# training and the remainder for testing (no shuffling).
split_idx = int(X.shape[0] * 0.8)  # for 0.7 its the 13.09.2015 and for 0.8 its the 19.10.2015

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

In [None]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.svm import SVR
from tqdm.auto import tqdm  # replaces the deprecated `tqdm.tqdm_notebook`

tscv = TimeSeriesSplit(n_splits=5)

# Hyper-parameter grid; identical for every fold, so it is built only once.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10, 100]}

# R^2 of the best grid-search model on each outer validation fold.
scores = []
# `split` returns a generator without a length; pass the number of splits so
# the progress bar can show a total.
for tr_index, val_index in tqdm(tscv.split(X_train), total=tscv.get_n_splits()):
    print("TRAIN:", tr_index, "VALIDATION:", val_index)
    # TimeSeriesSplit yields POSITIONAL indices, so select with .iloc;
    # .loc only gives the same rows while the index is a default RangeIndex.
    X_tr, X_val = X_train.iloc[tr_index], X_train.iloc[val_index]
    y_tr, y_val = y_train.iloc[tr_index], y_train.iloc[val_index]

    svr = SVR()
    # NOTE(review): with cv left at its default, GridSearchCV uses 5-fold
    # KFold inside each training window, which ignores temporal order.
    # Consider passing a TimeSeriesSplit via `cv=` if that matters here.
    clf = GridSearchCV(svr, parameters)
    clf.fit(X_tr, y_tr)

    # Evaluate the refitted best estimator on the held-out validation fold;
    # previously the fold was split off but never scored.
    scores.append(clf.score(X_val, y_val))

0it [00:00, ?it/s]

TRAIN: [   0    1    2 ... 3363 3364 3365] VALIDATION: [3366 3367 3368 ... 6729 6730 6731]


In [18]:
# Tabulate the grid-search results of the `clf` currently in scope
# (one row per parameter combination).
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,30.099232,3.150788,0.915923,0.013788,1,linear,"{'C': 1, 'kernel': 'linear'}",-0.06718,-0.066855,-0.066889,-0.067056,-0.062898,-0.066176,0.001643,3
1,8.443325,0.100911,4.071352,0.107611,1,rbf,"{'C': 1, 'kernel': 'rbf'}",-0.076213,-0.074807,-0.074842,-0.075839,-0.072159,-0.074772,0.001418,6
2,41.320247,2.700932,0.927999,0.007206,10,linear,"{'C': 10, 'kernel': 'linear'}",-0.04141,-0.030557,-0.035402,-0.039948,-0.034882,-0.03644,0.003876,2
3,8.258576,0.029741,4.054072,0.048625,10,rbf,"{'C': 10, 'kernel': 'rbf'}",-0.072812,-0.071688,-0.071718,-0.072721,-0.068688,-0.071525,0.001497,5
4,45.95101,7.58373,0.936804,0.023408,100,linear,"{'C': 100, 'kernel': 'linear'}",-0.003912,-0.045508,0.030224,0.017465,0.013902,0.002434,0.026338,1
5,8.702239,0.074813,4.210126,0.063865,100,rbf,"{'C': 100, 'kernel': 'rbf'}",-0.069865,-0.069127,-0.069297,-0.070177,-0.065951,-0.068883,0.001514,4
