# Multiple Linear Regression

## Importing the libraries

In [231]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats

## Importing the dataset

In [232]:
Train = pd.read_csv('Train.csv')
Riders = pd.read_csv('Riders.csv')
Test = pd.read_csv('Test.csv')

In [233]:
#Merging the Train set and Riders information
total = Train.merge(Riders, how='left', on='Rider Id')

In [234]:
#Creating New Column(I want Dependant variable to be last column of the dataframe)
total['Time from Pickup to Arrival New']= total['Time from Pickup to Arrival']

#Dropping the Old-duplicate Y column
total = total.drop(['Time from Pickup to Arrival'], axis=1)

#Renaming the column back to Time from Pickup to Arrival
total.rename(columns={'Time from Pickup to Arrival New':'Time from Pickup to Arrival'}, inplace=True)

In [235]:
#Dropping other unneeded columns [Vehihle Type(Only Bikes), User Id(Object)]
total = total.drop(['Vehicle Type'], axis=1)
total = total.drop(['User Id'], axis=1)

In [236]:
total.head(2)

Unnamed: 0,Order No,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),Confirmation - Time,Arrival at Pickup - Day of Month,...,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,No_Of_Orders,Age,Average_Rating,No_of_Ratings,Time from Pickup to Arrival
0,Order_No_4211,3,Business,9,5,9:35:46 AM,9,5,9:40:10 AM,9,...,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,1637,1309,13.8,549,745
1,Order_No_25375,3,Personal,12,5,11:16:16 AM,12,5,11:23:21 AM,12,...,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,396,339,13.6,69,1993


In [237]:
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21201 entries, 0 to 21200
Data columns (total 31 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Order No                                   21201 non-null  object 
 1   Platform Type                              21201 non-null  int64  
 2   Personal or Business                       21201 non-null  object 
 3   Placement - Day of Month                   21201 non-null  int64  
 4   Placement - Weekday (Mo = 1)               21201 non-null  int64  
 5   Placement - Time                           21201 non-null  object 
 6   Confirmation - Day of Month                21201 non-null  int64  
 7   Confirmation - Weekday (Mo = 1)            21201 non-null  int64  
 8   Confirmation - Time                        21201 non-null  object 
 9   Arrival at Pickup - Day of Month           21201 non-null  int64  
 10  Arrival at Pickup - We

In [238]:
X = total.iloc[:, :-1].values
y = total.iloc[:, -1].values

# Taking care of missing data

In [240]:
#Distance and Temperature -Replace null values with mean of the column
from sklearn.impute import SimpleImputer
imp_temp = SimpleImputer(missing_values = np.nan, strategy='mean')
imp_temp.fit(total.iloc[:, 18:20]) 
total.iloc[:, 18:20] = imp_temp.transform(total.iloc[:, 18:20])

In [241]:
#Precipitation(ml) - Replace null values with Zero(0 ml railfall)
from sklearn.impute import SimpleImputer
imp_prec = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 0)
imp_prec.fit(total.iloc[:,20:]) 
total.iloc[:,20:] = imp_prec.transform(total.iloc[:,20:])

# Changing datatype


In [242]:
#Changing times columns into datetime datatype
total['Placement - Time'] = pd.to_datetime(total['Placement - Time'])
total['Confirmation - Time'] = pd.to_datetime(total['Confirmation - Time'])
total['Arrival at Pickup - Time'] = pd.to_datetime(total['Arrival at Pickup - Time'])
total['Pickup - Time'] = pd.to_datetime(total['Pickup - Time'])
total['Arrival at Destination - Time'] = pd.to_datetime(total['Arrival at Destination - Time'])

## Encoding categorical data

In [243]:
#1 Hot Encode the Personal or Business column - Business=1 and Personal=0
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [2])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

# Correlation Coefficients and P-values 

In [290]:
#Trying to see how variables contribute to the Y_predicts. P_value<0.05 is decent important
corrs = total.corr()['Time from Pickup to Arrival'].sort_values(ascending=False)

In [301]:
from scipy.stats import pearsonr

# Build a dictionary of correlation coefficients and p-values
dict_cp = {}

column_titles = [col for col in corrs.index if col!= 'Time from Pickup to Arrival']
for col in column_titles:
    p_val = round(pearsonr(total[col], total['Time from Pickup to Arrival'])[0],6)
    dict_cp[col] = {'Correlation_Coefficient':corrs[col],
                    'P_Value':p_val}
    
df_cp = pd.DataFrame(dict_cp).T
df_cp_sorted = df_cp.sort_values('P_Value')
df_cp_sorted[df_cp_sorted['P_Value']<0.1]

Unnamed: 0,Correlation_Coefficient,P_Value
Destination Lat,-0.061872,-0.061872
Pickup Lat,-0.053823,-0.053823
Average_Rating,-0.042559,-0.042559
No_Of_Orders,-0.041608,-0.041608
Placement - Day of Month,-0.01471,-0.01471
Confirmation - Day of Month,-0.014701,-0.014701
Arrival at Pickup - Day of Month,-0.014701,-0.014701
Pickup - Day of Month,-0.014701,-0.014701
Arrival at Destination - Day of Month,-0.014701,-0.014701
Age,-0.007969,-0.007969


# Drop columns not to be trained with the model

In [270]:
#Drop columns/variables not wanted to be part of the model training
total_model = total.iloc[:, 12:]
total_model.drop('Rider Id', axis=1, inplace=True)
total_model.drop('Arrival at Destination - Time', axis=1, inplace=True)
total_model.drop('Pickup - Time', axis=1, inplace=True)
#total_model.drop('Pickup Long', axis=1, inplace=True)
#total_model.drop('Pickup Lat', axis=1, inplace=True)
#total_model.drop('Destination Lat', axis=1, inplace=True)
#total_model.drop('Destination Long', axis=1, inplace=True)
#total_model.drop('No_Of_Orders', axis=1, inplace=True)
total_model.drop('Age', axis=1, inplace=True)
#total_model.drop('Average_Rating', axis=1, inplace=True)
total_model.drop('No_of_Ratings', axis=1, inplace=True)

#Create Metrics of features(Independant Variables- X) and Dependant Variable(y)
X = total_model.iloc[:,:-1]
y = total_model.iloc[:,-1]

total_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21201 entries, 0 to 21200
Data columns (total 14 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Pickup - Day of Month                      21201 non-null  int64  
 1   Pickup - Weekday (Mo = 1)                  21201 non-null  int64  
 2   Arrival at Destination - Day of Month      21201 non-null  int64  
 3   Arrival at Destination - Weekday (Mo = 1)  21201 non-null  int64  
 4   Distance (KM)                              21201 non-null  float64
 5   Temperature                                21201 non-null  float64
 6   Precipitation in millimeters               21201 non-null  float64
 7   Pickup Lat                                 21201 non-null  float64
 8   Pickup Long                                21201 non-null  float64
 9   Destination Lat                            21201 non-null  float64
 10  Destination Long      

# Feature scaling for the whole dataset 

In [297]:
#feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X[:])

## Training the Regression model on the WHOLE TRAIN DATASET

In [298]:
# Multiple Linear regression model 
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()
mlr = mlr.fit(X, y)

In [299]:
#Decision tree regression model 
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
tree = tree.fit(X, y)


In [274]:
#Random forest regression model 
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators = 25)
forest = forest.fit(X, y)

## Predicting the dataset results

In [275]:
from sklearn import metrics
import math

#Multiple Linear Regression
y_pred_mlr = mlr.predict(X)
rmse_mlr = round(math.sqrt(metrics.mean_squared_error(y, mlr.predict(X))),2)


In [276]:
# Decision Tree regression
y_pred_tree = tree.predict(X)
rmse_tree = round(math.sqrt(metrics.mean_squared_error(y, tree.predict(X))),2)


In [277]:
# Random forrest regression 
y_pred_forest = forest.predict(X)
rmse_forest = round(math.sqrt(metrics.mean_squared_error(y, forest.predict(X))),2)


In [279]:
data = {'Regression model':  ['MLR', 'D.Tree', 'R.Forest']
        ,'RMSE' : [rmse_mlr, rmse_tree, rmse_forest],
       }

df = pd.DataFrame (data, columns = ['Regression model', 'RMSE'])
print(df)

  Regression model    RMSE
0              MLR  799.04
1           D.Tree    7.20
2         R.Forest  306.63


In [253]:
#from sklearn.metrics import accuracy_score
#accuracy_score([rmse_test])

# Testing model on whole test set


In [None]:
#Make changes to the test dataset as was done with the training dataset.

In [281]:
#Prepare Test Data to be tested- Merge with Riders information
testing = Test.merge(Riders, how='left', on='Rider Id')

#Drop unneeded columns
testing = testing.drop(['Vehicle Type'], axis=1)
testing = testing.drop(['User Id'], axis=1)

#Set All columns as independant variables, Test Dataset has no Y-VAlues.
X2 = testing.iloc[:, :].values

# Distance/Temperature - Replace Null values with Column Mean
from sklearn.impute import SimpleImputer
imp_temp1 = SimpleImputer(missing_values = np.nan, strategy='mean')
imp_temp1.fit(testing.iloc[:, 15:17]) 
testing.iloc[:, 15:17] = imp_temp1.transform(testing.iloc[:, 15:17])

#Precipitation(ml) -replace null with ZERO
from sklearn.impute import SimpleImputer
imp_prec1 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 0)
imp_prec1.fit(testing.iloc[:,17:]) 
testing.iloc[:,17:] = imp_prec1.transform(testing.iloc[:,17:])

#Change datatypes
testing['Placement - Time'] = pd.to_datetime(testing['Placement - Time'])
testing['Confirmation - Time'] = pd.to_datetime(testing['Confirmation - Time'])
testing['Arrival at Pickup - Time'] = pd.to_datetime(testing['Arrival at Pickup - Time'])
testing['Pickup - Time'] = pd.to_datetime(testing['Pickup - Time'])

# 1 HOT Encode - Personal or Business
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct1 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [2])], remainder='passthrough')
X2 = np.array(ct1.fit_transform(X2))

In [282]:
#testing.head(2)

In [286]:
#TESTING MODEL ON TEST DATA(CSV)

#factors that will help predict Time from Pickup to Arrival
desired_factors = ['Distance (KM)','Temperature','Precipitation in millimeters'
                   ,'No_Of_Orders', 'Average_Rating']

#set my model to MLR/DecisionTree/R.Forest
model = mlr


#set prediction data to factors that will predict, and set target to (time Arrival from Pick up to Arrival)
train_data = total_model[desired_factors]
test_data = testing[desired_factors]
target = total_model['Time from Pickup to Arrival']

#fitting model with prediction data and telling it my target
model.fit(train_data, target)

results = model.predict(test_data)
df = pd.DataFrame (results, columns = ['Time from Pickup to Arrival'])
df = df['Time from Pickup to Arrival'].round(decimals=2)
df = pd.DataFrame(df)
df.head()

Unnamed: 0,Time from Pickup to Arrival
0,1473.68
1,1126.61
2,1160.15
3,1156.03
4,1076.54


In [287]:
new = pd.DataFrame([testing['Order No'], df['Time from Pickup to Arrival']]).transpose()
new

Unnamed: 0,Order No,Time from Pickup to Arrival
0,Order_No_19248,1473.68
1,Order_No_12736,1126.61
2,Order_No_768,1160.15
3,Order_No_15332,1156.03
4,Order_No_21373,1076.54
...,...,...
7063,Order_No_3612,1158.47
7064,Order_No_7657,2648.49
7065,Order_No_1969,1621.81
7066,Order_No_10591,2468.04


In [267]:
new.to_csv(r'C:\Users\confi\Documents\Submissions.csv', index=None)