## SVM Model

In [1]:
print("Importing Libraries...")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import time
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import math
from sklearn.metrics import r2_score
from sklearn.utils import shuffle
import ast
print("Successfully imported!")

Importing Libraries...
Successfully imported!


### Prepare the dataset for fitting the model

In [2]:
# Load the full dataset from disk; the cells below split it into train and
# test rows using its index labels.
print("Reading Datafile...")
checkpoint5 = pd.read_csv('checkpoint-5.csv')
print("Success!")

Reading Datafile...
Success!


In [3]:
# Load the row labels that make up the training set. The file holds a Python
# literal, which ast.literal_eval parses without executing arbitrary code.
print("Reading index of train data...")
with open("train_idx.txt") as file:
    # The context manager closes the file on exit; no explicit close() needed.
    train_idx_str = file.read()

train_idx = ast.literal_eval(train_idx_str)
print("Length of the train indexes is...", len(train_idx))
print(train_idx[0])

Reading index of train data...
Length of the train indexes is... 540000
17094116


In [4]:
print("Getting dataframe of train set...")
# Pull the training rows out by index label ...
train_set = checkpoint5.loc[train_idx]
# ... then remove those rows from checkpoint5 in place, so what remains is the
# pool the test set is sampled from.
# NOTE: this mutates checkpoint5, so the cell cannot be re-run without
# reloading the CSV first (the labels would no longer be found).
checkpoint5.drop(index=train_idx, inplace=True)
print("Seperated dataframe for both train and test")

Getting dataframe of train set...
Seperated dataframe for both train and test


In [5]:
print("Obtaining labels for train set...")
cols = 'fare_amount'
# Double brackets keep the label as a one-column DataFrame rather than a Series.
y_train = train_set[[cols]]
print("Obtained labels!")
# Remove the label and the remaining non-feature columns from train_set in
# place; whatever is left is used as the feature matrix.
for unused_column in (
    'time',
    'fare_amount',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
):
    del train_set[unused_column]
print("Train set is ready!")

Obtaining labels for train set...
Obtained labels!
Train set is ready!


In [6]:
# X_train is an alias of train_set (same object, no copy).
X_train = train_set

In [7]:
# Display the training feature matrix.
X_train

Unnamed: 0,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist
17094116,1,3,False,0,0,1,0.696993
53649197,1,5,True,0,0,0,1.594699
38088232,5,1,False,0,1,0,3.154141
29785639,1,2,False,0,0,0,2.620846
24305630,1,1,False,0,0,0,6.734819
...,...,...,...,...,...,...,...
12888383,1,6,True,0,0,1,4.731678
38936098,1,3,False,0,1,0,3.661563
189012,2,6,True,0,0,0,4.128879
37063756,5,3,False,0,1,0,18.904935


In [8]:
# Display the training labels (one-column frame holding fare_amount).
y_train

Unnamed: 0,fare_amount
17094116,4.50
53649197,4.90
38088232,10.90
29785639,7.30
24305630,27.00
...,...
12888383,14.50
38936098,11.70
189012,6.10
37063756,57.54


### Test set preparation (100,000 rows only)

In [9]:
print("Obtaining samples from huge test set...")
# checkpoint5 no longer contains the training rows (they were dropped above),
# so this sample is disjoint from the training set.
# A fixed random_state makes the draw reproducible: without it every run
# evaluates on a different 100000 rows and the reported metrics and saved
# prediction file cannot be regenerated.
test_set = checkpoint5.sample(n=100000, replace=False, random_state=42)
print("Obtained test set!")

Obtaining samples from huge test set...
Obtained test set!


In [10]:
print("Obtaining labels for test set...")
cols = 'fare_amount'
# Double brackets keep the label as a one-column DataFrame rather than a Series.
y_test = test_set[[cols]]
print("Obtained labels!")
# Remove the label and the remaining non-feature columns from test_set in
# place; whatever is left is used as the feature matrix.
for unused_column in (
    'time',
    'fare_amount',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
):
    del test_set[unused_column]
print("Test set is ready!")

Obtaining labels for test set...
Obtained labels!
Test set is ready!


In [11]:
# X_test is an alias of test_set (same object, no copy).
X_test = test_set

In [12]:
# Display the test feature matrix.
X_test

Unnamed: 0,passenger_count,day,weekend,holiday,peak_hour,hotspot,dist
14591719,1,4,True,0,1,0,7.947724
36952791,2,6,True,0,0,1,1.285748
22213999,2,6,True,0,0,0,20.347781
29023062,2,5,True,0,0,0,15.778842
54099570,1,2,False,0,0,0,7.334733
...,...,...,...,...,...,...,...
18651558,2,5,True,0,0,1,1.170345
35119203,5,0,False,0,1,0,2.048611
17219590,1,3,False,0,0,0,1.764260
6776747,1,4,True,0,0,0,2.816526


In [13]:
# Display the test labels (one-column frame holding fare_amount).
y_test

Unnamed: 0,fare_amount
14591719,25.5
36952791,7.0
22213999,49.8
29023062,30.5
54099570,19.5
...,...
18651558,7.0
35119203,8.1
17219590,7.0
6776747,10.0


#### Load Model

In [14]:
# Load the previously saved regressor from disk instead of re-training it.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# only load model files that come from a trusted source.
with open('WithDist_SVM/Dist_SVM_Experiment_540000_v2.pkl', 'rb') as f:

    regressor = pickle.load(f)

In [15]:
print("Fitting...")
# Fit one scaler for the features and one for the target on the training
# data. The model itself is not trained here; it was loaded from disk.
# NOTE(review): presumably these must match the scaling used when the pickled
# model was trained - confirm.
X_sc = StandardScaler().fit(X_train)
y_sc = StandardScaler().fit(y_train)
# WARNING: X_train / y_train are overwritten with their standardised arrays.
# Re-running this cell would fit the scalers on already-scaled data, so
# re-run the data-preparation cells first.
X_train = X_sc.transform(X_train)
y_train = y_sc.transform(y_train)

Fitting...


In [None]:
print('Predicting...')
# The timed region covers both scaling the test features and the prediction.
t1 = time.time()
X_test_scaled = X_sc.transform(X_test)
y_pred = regressor.predict(X_test_scaled)
t2 = time.time()
print("Inversely Transforming value data ")
# The scaler works on 2-D input, so reshape the predictions into a single
# column before mapping them back to the original fare scale.
y_pred_column = y_pred.reshape(-1, 1)
y_pred = y_sc.inverse_transform(y_pred_column)
print('Predicted all!')
print(y_pred)

In [None]:
# Convert the label frame to a NumPy array. np.asarray gives the same array
# as DataFrame.to_numpy() but is also a no-op on an ndarray, so re-running
# this cell no longer raises AttributeError.
y_test = np.asarray(y_test)

In [None]:
# Flatten both arrays to 1-D so they can be used as DataFrame columns.
# np.asarray makes this work whether y_test is still a DataFrame or already
# an ndarray, so the cell does not depend on the conversion cell having run.
y_test = np.asarray(y_test).flatten()
y_pred = np.asarray(y_pred).flatten()
y_test_series = pd.Series(y_test)
y_pred_series = pd.Series(y_pred)

# Side-by-side table of predictions and ground truth, saved for later analysis.
df = pd.DataFrame({'Predicted value': y_pred, 'Real Value': y_test})
df.to_csv('WithDist_SVM/Dist_predictions_SVR540000_100000.csv',index=False )

# Only the last expression of a cell is rendered, so the frame is displayed
# here rather than in the middle of the cell, where it had no effect.
df

In [None]:
# time_train = t1-t0
# Wall-clock seconds spent scaling the test features and predicting.
time_predict = t2-t1

# Compute each metric once and reuse it for both the printout and the saved
# report, instead of re-evaluating it over the whole test set each time.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

#Get Report
print("Score for classifier:", r2)
# print ("Time taken to train:", time_train)
print("Time taken to predict",time_predict)
print("Mean square error is -> ",mse)
print("Root Mean square error is -> ",rmse)
print("Mean absolute error is -> ",mae)

# Save report
report = {
    'version': 'SVM with Dist',
    # Derived from the data so the report stays correct if the sample size changes.
    'number_rows': len(y_test),
    # 'train time':time_train,
    'predict time': time_predict,
    'r2_score': r2,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae}

# One (name, value) row per report entry; the CSV layout is unchanged
# (two columns with the default 0/1 header).
final_report = pd.DataFrame(list(report.items()))
final_report.to_csv('WithDist_SVM/report_SVR540000_100000.csv',index=False )

In [None]:
# with open('WithDist_SVM/Dist_SVM_Experiment_540000.pkl', 'rb') as f:

#    regressor = pickle.load(f)

In [None]:
# X_sc = StandardScaler()
# y_sc = StandardScaler()
# print('Predicting...')
# y_pred = regressor.predict(X_sc.transform(X_test))
# t2 = time.time()
# y_pred = y_sc.inverse_transform(y_pred.reshape(-1, 1))
# print('Predicted all!')
# print(y_pred)