In [532]:
import numpy as np 
import math
# data processing
import pandas as pd 

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn import tree

#Varrible selection
from statsmodels.graphics.correlation import plot_corr
from statsmodels.formula.api import ols
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV

In [533]:
# the csv files are converted to panda dataframe and are renamed.
Train_df= pd.read_csv(r"C:\Users\27833\Downloads\Train.csv")
Test_df= (pd.read_csv(r"C:\Users\27833\Downloads\Test.csv"))
Riders_df= pd.read_csv(r"C:\Users\27833\Downloads\Riders.csv")
VariableDefinitions_df= pd.read_csv(r"C:\Users\27833\Downloads\VariableDefinitions.csv")
SampleSubmission_df= pd.read_csv(r"C:\Users\27833\Downloads\SampleSubmission.csv")

In [534]:
    #Joining the riders to both Train and Test data set
Train_df = pd.merge(Train_df, Riders_df, left_on='Rider Id', 
                    right_on='Rider Id', how='left')
Test_df = pd.merge(Test_df, Riders_df, left_on='Rider Id',
                    right_on='Rider Id', how='left')

In [535]:
    # Formatting the naming of the columns
Train_df.columns = Train_df.columns.str.replace(' ', '_')
Test_df.columns = Test_df .columns.str.replace(' ', '_')

    #removing "-" from the feature labels.
Train_df.columns = Train_df.columns.str.replace('_-_', '_')
Test_df.columns = Test_df .columns.str.replace('_-_', '_')

In [536]:
Train_df = Train_df.drop(['Vehicle_Type', 
                          'Arrival_at_Destination_Day_of_Month',
                          'Arrival_at_Destination_Weekday_(Mo_=_1)',
                          'Arrival_at_Destination_Time'], axis = 1)
Test_df = Test_df.drop(['Vehicle_Type'], axis = 1)

In [537]:
    #replacing NAN with 0 for precipitation feature
Train_df["Precipitation_in_millimeters"] = Train_df["Precipitation_in_millimeters"].fillna(0)
Test_df["Precipitation_in_millimeters"] = Test_df["Precipitation_in_millimeters"].fillna(0)

In [538]:
    #repacing the missing temparature with the mean
Train_df = Train_df.fillna(Train_df.mean())
Test_df = Test_df.fillna(Test_df.mean())

In [539]:
def alter_time(df):
    time_matrix = ['Placement_Time','Confirmation_Time', 
                   'Arrival_at_Pickup_Time', 'Pickup_Time']
    for i in time_matrix:
        df[i] = pd.to_datetime(df[i]).dt.strftime('%H:%M:%S')
        df[i] = pd.to_timedelta(df[i])
        df[i] = df[i].dt.total_seconds()
        
    return df

In [540]:
Train_df = alter_time(Train_df)
Test_df = alter_time(Test_df)

## Time Engineered to Represent Circular Format 

In [541]:
Train_df.head()

Unnamed: 0,Order_No,User_Id,Platform_Type,Personal_or_Business,Placement_Day_of_Month,Placement_Weekday_(Mo_=_1),Placement_Time,Confirmation_Day_of_Month,Confirmation_Weekday_(Mo_=_1),Confirmation_Time,...,Pickup_Lat,Pickup_Long,Destination_Lat,Destination_Long,Rider_Id,Time_from_Pickup_to_Arrival,No_Of_Orders,Age,Average_Rating,No_of_Ratings
0,Order_No_4211,User_Id_633,3,Business,9,5,34546.0,9,5,34810.0,...,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745,1637,1309,13.8,549
1,Order_No_25375,User_Id_2285,3,Personal,12,5,40576.0,12,5,41001.0,...,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993,396,339,13.6,69
2,Order_No_1899,User_Id_265,3,Business,30,2,45565.0,30,2,45764.0,...,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455,1023,242,12.5,114
3,Order_No_9336,User_Id_1402,3,Business,15,5,33934.0,15,5,33965.0,...,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341,886,283,14.5,113
4,Order_No_27883,User_Id_1737,1,Personal,13,1,35718.0,13,1,35778.0,...,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214,2311,872,14.1,533


In [542]:
# split dataset into train and test sets
Ix_train, Ix_test, Iy_train, Iy_test = train_test_split(Train_df,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=1,
                                                    shuffle=False)

In [543]:
Ix_test.head()

Unnamed: 0,Order_No,User_Id,Platform_Type,Personal_or_Business,Placement_Day_of_Month,Placement_Weekday_(Mo_=_1),Placement_Time,Confirmation_Day_of_Month,Confirmation_Weekday_(Mo_=_1),Confirmation_Time,...,Pickup_Lat,Pickup_Long,Destination_Lat,Destination_Long,Rider_Id,Time_from_Pickup_to_Arrival,No_Of_Orders,Age,Average_Rating,No_of_Ratings
16960,Order_No_17313,User_Id_3647,3,Business,5,5,54026.0,5,5,54100.0,...,-1.272639,36.794723,-1.300484,36.770491,Rider_Id_99,1143,435,301,13.5,73
16961,Order_No_11495,User_Id_186,3,Business,7,3,54723.0,7,3,54768.0,...,-1.370038,36.919017,-1.257147,36.795063,Rider_Id_208,3115,779,873,14.2,89
16962,Order_No_262,User_Id_2200,3,Business,29,5,35728.0,29,5,35869.0,...,-1.233018,36.800448,-1.290894,36.822971,Rider_Id_177,2124,526,286,13.6,70
16963,Order_No_23022,User_Id_3295,3,Business,23,4,43244.0,23,4,43295.0,...,-1.257147,36.795063,-1.31553,36.863732,Rider_Id_131,1840,2183,887,14.1,393
16964,Order_No_1438,User_Id_246,3,Business,10,4,38805.0,10,4,38848.0,...,-1.258414,36.8048,-1.255956,36.789867,Rider_Id_188,764,3837,2018,14.1,786


In [544]:
 #categorical data (all id and order numbers)


In [545]:
Train_df = Train_df.drop(['Order_No', 'User_Id', 'Rider_Id'], axis = 1)

In [586]:
Train_df.head(2)

Unnamed: 0,Platform_Type,Personal_or_Business,Placement_Day_of_Month,Placement_Weekday_(Mo_=_1),Placement_Time,Confirmation_Day_of_Month,Confirmation_Weekday_(Mo_=_1),Confirmation_Time,Arrival_at_Pickup_Day_of_Month,Arrival_at_Pickup_Weekday_(Mo_=_1),...,Precipitation_in_millimeters,Pickup_Lat,Pickup_Long,Destination_Lat,Destination_Long,Time_from_Pickup_to_Arrival,No_Of_Orders,Age,Average_Rating,No_of_Ratings
0,3,Business,9,5,34546.0,9,5,34810.0,9,5,...,0.0,-1.317755,36.83037,-1.300406,36.829741,745,1637,1309,13.8,549
1,3,Personal,12,5,40576.0,12,5,41001.0,12,5,...,0.0,-1.351453,36.899315,-1.295004,36.814358,1993,396,339,13.6,69


## Encoding Personal or Business

In [547]:
#Encoding personal or business as it is shown to influence y-prediction

In [548]:
df_dummies = pd.get_dummies(Train_df)

# Making sure that all the column names have correct format
df_dummies.columns = [col.replace(" ", "_") for col in df_dummies.columns]
df_dummies.columns = [col.replace("(Mo_=_1)","Mo_1") for col in df_dummies.columns]
df_dummies.columns = [col.replace("(KM)","KM") for col in df_dummies.columns]
# Test_df
Test_df.columns = [col.replace(" ", "_") for col in Test_df.columns]
Test_df.columns = [col.replace("(Mo_=_1)","Mo_1") for col in Test_df.columns]
Test_df.columns = [col.replace("(KM)","KM") for col in Test_df.columns]

# Reorder columns with the dependent variable (claim_amount) the last column
column_titles = [col for col in df_dummies.columns if col !=
                 'Time_from_Pickup_to_Arrival'] + ['Time_from_Pickup_to_Arrival']
df_dummies = df_dummies.reindex(columns=column_titles)

df_dummies.head()

Unnamed: 0,Platform_Type,Placement_Day_of_Month,Placement_Weekday_Mo_1,Placement_Time,Confirmation_Day_of_Month,Confirmation_Weekday_Mo_1,Confirmation_Time,Arrival_at_Pickup_Day_of_Month,Arrival_at_Pickup_Weekday_Mo_1,Arrival_at_Pickup_Time,...,Pickup_Long,Destination_Lat,Destination_Long,No_Of_Orders,Age,Average_Rating,No_of_Ratings,Personal_or_Business_Business,Personal_or_Business_Personal,Time_from_Pickup_to_Arrival
0,3,9,5,34546.0,9,5,34810.0,9,5,36287.0,...,36.83037,-1.300406,36.829741,1637,1309,13.8,549,1,0,745
1,3,12,5,40576.0,12,5,41001.0,12,5,42022.0,...,36.899315,-1.295004,36.814358,396,339,13.6,69,0,1,1993
2,3,30,2,45565.0,30,2,45764.0,30,2,46174.0,...,36.843419,-1.300921,36.828195,1023,242,12.5,114,1,0,455
3,3,15,5,33934.0,15,5,33965.0,15,5,34676.0,...,36.832396,-1.257147,36.795063,886,283,14.5,113,1,0,1341
4,1,13,1,35718.0,13,1,35778.0,13,1,36233.0,...,36.792118,-1.295041,36.809817,2311,872,14.1,533,0,1,1214


In [549]:
df_dummies = pd.get_dummies(Train_df, drop_first = True)

In [550]:
df_dummies.head(2)

Unnamed: 0,Platform_Type,Placement_Day_of_Month,Placement_Weekday_(Mo_=_1),Placement_Time,Confirmation_Day_of_Month,Confirmation_Weekday_(Mo_=_1),Confirmation_Time,Arrival_at_Pickup_Day_of_Month,Arrival_at_Pickup_Weekday_(Mo_=_1),Arrival_at_Pickup_Time,...,Pickup_Lat,Pickup_Long,Destination_Lat,Destination_Long,Time_from_Pickup_to_Arrival,No_Of_Orders,Age,Average_Rating,No_of_Ratings,Personal_or_Business_Personal
0,3,9,5,34546.0,9,5,34810.0,9,5,36287.0,...,-1.317755,36.83037,-1.300406,36.829741,745,1637,1309,13.8,549,0
1,3,12,5,40576.0,12,5,41001.0,12,5,42022.0,...,-1.351453,36.899315,-1.295004,36.814358,1993,396,339,13.6,69,1


In [551]:
def sincos_time(Train_df):
    seconds_in_day = 24*60*60    
    time_matrix = ['Placement_Time', 'Confirmation_Time', 
                   'Arrival_at_Pickup_Time', 'Pickup_Time']
    for i in time_matrix:
        Train_df[str(i) +'sin_time'] = np.sin(2*np.pi*Train_df[i]/seconds_in_day)
        Train_df[str(i) +'cos_time'] = np.cos(2*np.pi*Train_df[i]/seconds_in_day)
        #Train_df = Train_df.drop([i], axis=1, inplace = True)
    return Train_df

In [552]:
sincos_time(df_dummies)

Unnamed: 0,Platform_Type,Placement_Day_of_Month,Placement_Weekday_(Mo_=_1),Placement_Time,Confirmation_Day_of_Month,Confirmation_Weekday_(Mo_=_1),Confirmation_Time,Arrival_at_Pickup_Day_of_Month,Arrival_at_Pickup_Weekday_(Mo_=_1),Arrival_at_Pickup_Time,...,No_of_Ratings,Personal_or_Business_Personal,Placement_Timesin_time,Placement_Timecos_time,Confirmation_Timesin_time,Confirmation_Timecos_time,Arrival_at_Pickup_Timesin_time,Arrival_at_Pickup_Timecos_time,Pickup_Timesin_time,Pickup_Timecos_time
0,3,9,5,34546.0,9,5,34810.0,9,5,36287.0,...,549,0,0.588609,-0.808418,0.572981,-0.819569,0.481817,-0.876272,0.392738,-0.919650
1,3,12,5,40576.0,12,5,41001.0,12,5,42022.0,...,69,1,0.189667,-0.981849,0.159235,-0.987241,0.085562,-0.996333,0.069104,-0.997609
2,3,30,2,45565.0,30,2,45764.0,30,2,46174.0,...,114,0,-0.171141,-0.985247,-0.185381,-0.982667,-0.214593,-0.976703,-0.229413,-0.973329
3,3,15,5,33934.0,15,5,33965.0,15,5,34676.0,...,113,0,0.623993,-0.781430,0.622230,-0.782834,0.580940,-0.813947,0.562444,-0.826835
4,1,13,1,35718.0,13,1,35778.0,13,1,36233.0,...,533,1,0.517654,-0.855590,0.513916,-0.857841,0.485255,-0.874373,0.479522,-0.877530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21196,3,20,3,57278.0,20,3,57309.0,20,3,57529.0,...,131,1,-0.854081,-0.520140,-0.855251,-0.518214,-0.863432,-0.504465,-0.906554,-0.422091
21197,3,13,6,36814.0,13,6,36821.0,13,6,37204.0,...,114,0,0.447889,-0.894089,0.447434,-0.894317,0.422355,-0.906431,0.368733,-0.929535
21198,3,7,4,61576.0,7,4,61629.0,7,4,63017.0,...,29,0,-0.972641,-0.232314,-0.973529,-0.228563,-0.991605,-0.129300,-0.999206,-0.039841
21199,1,4,3,34299.0,4,3,34313.0,4,3,34739.0,...,1101,1,0.603034,-0.797716,0.602221,-0.798329,0.577205,-0.816600,0.554663,-0.832075


In [553]:
df_dummies.columns

Index(['Platform_Type', 'Placement_Day_of_Month', 'Placement_Weekday_(Mo_=_1)',
       'Placement_Time', 'Confirmation_Day_of_Month',
       'Confirmation_Weekday_(Mo_=_1)', 'Confirmation_Time',
       'Arrival_at_Pickup_Day_of_Month', 'Arrival_at_Pickup_Weekday_(Mo_=_1)',
       'Arrival_at_Pickup_Time', 'Pickup_Day_of_Month',
       'Pickup_Weekday_(Mo_=_1)', 'Pickup_Time', 'Distance_(KM)',
       'Temperature', 'Precipitation_in_millimeters', 'Pickup_Lat',
       'Pickup_Long', 'Destination_Lat', 'Destination_Long',
       'Time_from_Pickup_to_Arrival', 'No_Of_Orders', 'Age', 'Average_Rating',
       'No_of_Ratings', 'Personal_or_Business_Personal',
       'Placement_Timesin_time', 'Placement_Timecos_time',
       'Confirmation_Timesin_time', 'Confirmation_Timecos_time',
       'Arrival_at_Pickup_Timesin_time', 'Arrival_at_Pickup_Timecos_time',
       'Pickup_Timesin_time', 'Pickup_Timecos_time'],
      dtype='object')

In [582]:
df_dummies = df_dummies.drop(['Placement_Time', 'Confirmation_Time', 
                   'Arrival_at_Pickup_Time', 'Pickup_Time', 'Time_from_Pickup_to_Arrival'], axis = 1)

In [583]:
df_dummies.tail(2)

Unnamed: 0,Platform_Type,Placement_Day_of_Month,Placement_Weekday_(Mo_=_1),Confirmation_Day_of_Month,Confirmation_Weekday_(Mo_=_1),Arrival_at_Pickup_Day_of_Month,Arrival_at_Pickup_Weekday_(Mo_=_1),Pickup_Day_of_Month,Pickup_Weekday_(Mo_=_1),Distance_(KM),...,No_of_Ratings,Personal_or_Business_Personal,Placement_Timesin_time,Placement_Timecos_time,Confirmation_Timesin_time,Confirmation_Timecos_time,Arrival_at_Pickup_Timesin_time,Arrival_at_Pickup_Timecos_time,Pickup_Timesin_time,Pickup_Timecos_time
21199,1,4,3,4,3,4,3,4,3,13,...,1101,1,0.603034,-0.797716,0.602221,-0.798329,0.577205,-0.8166,0.554663,-0.832075
21200,3,26,2,26,2,26,2,26,2,12,...,70,0,-0.572802,-0.819694,-0.573636,-0.81911,-0.58949,-0.807776,-0.649172,-0.760642


In [556]:
X = df_dummies.drop(["Time_from_Pickup_to_Arrival"], axis=1).values

In [557]:
y = np.array(df_dummies.loc[:,["Time_from_Pickup_to_Arrival"]])

In [558]:
y = y.reshape(len(y), 1)

In [559]:
from sklearn.preprocessing import StandardScaler

In [560]:
scaler = StandardScaler()

In [561]:
X_scaled = scaler.fit_transform(X)

In [562]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [563]:
# split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df_dummies,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=1,
                                                    shuffle=False)

In [564]:
from sklearn.svm import SVR

In [574]:
def s_v_m(X_train, y_train, X_test):
    regressor = SVR(kernel = 'rbf')
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    return regressor, y_pred

In [575]:
from sklearn import preprocessing

In [579]:
model, y_pred = s_v_m(X_train, y_train, X_test)

  return f(**kwargs)


In [580]:
y_pred

array([1378.97774599, 1384.79312456, 1370.74058788, ..., 1386.10537308,
       1365.12179144, 1380.62374987])

In [578]:
y_pred = y_pred.reshape(len(y_pred), 1)

AttributeError: 'tuple' object has no attribute 'reshape'

In [570]:
y_pred = pd.DataFrame(data=y_pred, columns= ['Time From Pickup To Arrival'], index = Ix_test['Order_No'])
y_pred.reset_index(inplace = True)

In [571]:
y_pred

Unnamed: 0,Order_No,Time From Pickup To Arrival
0,Order_No_17313,1378.977746
1,Order_No_11495,1384.793125
2,Order_No_262,1370.740588
3,Order_No_23022,1374.443078
4,Order_No_1438,1367.829744
...,...,...
4236,Order_No_8834,1376.743980
4237,Order_No_22892,1365.913175
4238,Order_No_2831,1386.105373
4239,Order_No_6174,1365.121791


In [573]:
y_pred.to_csv('svm_timefeature')