## Importing the libraries ##
- We going to use the numpy libraries to use numpy arrays 
- We going to use pandas to load, merge  and modify our dataset
- matplotlib and seaborn libraries are going to be used to plot the model 

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import pearsonr



## Loading the dataset ##

- **Pandas** is used to load the datafiles into our workspace
- four datafiles were loaded named Riders, Test,Train as well as VariableDefinitions


In [2]:
riders = pd.read_csv("./regression data/Riders.csv")
test = pd.read_csv("./regression data/Test.csv")
train = pd.read_csv("./regression data/Train.csv")
variableDefinitions= pd.read_csv("./regression data/VariableDefinitions.csv")

# flling in missing data

train['Temperature'] = train['Temperature'].fillna( train['Temperature'].mean())
train = train.fillna(0)

test['Temperature'] = test['Temperature'].fillna(test['Temperature'].mean())
test = test.fillna(0)



train.info()


&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 21201 entries, 0 to 21200
Data columns (total 29 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Order No                                   21201 non-null  object 
 1   User Id                                    21201 non-null  object 
 2   Vehicle Type                               21201 non-null  object 
 3   Platform Type                              21201 non-null  int64  
 4   Personal or Business                       21201 non-null  object 
 5   Placement - Day of Month                   21201 non-null  int64  
 6   Placement - Weekday (Mo = 1)               21201 non-null  int64  
 7   Placement - Time                           21201 non-null  object 
 8   Confirmation - Day of Month                21201 non-null  int64  
 9   Confirmation - Weekday (Mo = 1)            21201 non-null  int64  
 10  Confirma

## Data Preprocessing ##
 - Some columns needs to dropped
 - train test and riders needs to be merged 
 - nulls values needs to be dealt with

In [3]:
# Cleaning the data


# Allignment of Dataset

train = train[['Order No', 'User Id', 'Vehicle Type', 'Platform Type',
       'Personal or Business', 'Placement - Day of Month',
       'Placement - Weekday (Mo = 1)', 'Placement - Time',
       'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
       'Confirmation - Time', 'Arrival at Pickup - Day of Month',
       'Arrival at Pickup - Weekday (Mo = 1)', 'Arrival at Pickup - Time',
       'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)', 'Pickup - Time',
       'Distance (KM)', 'Temperature', 'Precipitation in millimeters',
       'Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long',
       'Rider Id','Time from Pickup to Arrival']]
       


# check which data type we are dealing with
train.dtypes 
test.dtypes



train.head()
    







Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Pickup - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,10:27:30 AM,4,20.4,0.0,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,11:44:09 AM,16,26.4,0.0,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,...,12:53:03 PM,3,23.258889,0.0,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,...,9:43:06 AM,9,19.2,0.0,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,...,10:05:23 AM,9,15.4,0.0,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214


## Variable Selection by correlation and significance ##

- We have more predictor variables to choose from, so we need a way of guiding us to choose the best ones to be our predictors. One way is to look at the correlations between the Time from Pickup to Arrival and each variables in our DataFrame and select those with the strongest correlations (both positive and negative).

- We also need to consider how significant those features are.

- The code below will create a new DataFrame and store the correlation coefficents and p-values in that DataFrame

In [5]:
# Calculate correlations between predictor variables and the response variable
corrs = train.corr()['Time from Pickup to Arrival'].sort_values(ascending=False)
corrs

Time from Pickup to Arrival             1.000000
Distance (KM)                           0.580608
Destination Long                        0.070425
Pickup Long                             0.060285
Confirmation - Weekday (Mo = 1)         0.009744
Arrival at Pickup - Weekday (Mo = 1)    0.009744
Pickup - Weekday (Mo = 1)               0.009744
Placement - Weekday (Mo = 1)            0.009693
Temperature                             0.005772
Precipitation in millimeters            0.005495
Platform Type                          -0.003827
Pickup - Day of Month                  -0.014701
Arrival at Pickup - Day of Month       -0.014701
Confirmation - Day of Month            -0.014701
Placement - Day of Month               -0.014710
Pickup Lat                             -0.053823
Destination Lat                        -0.061872
Name: Time from Pickup to Arrival, dtype: float64

In [6]:

# Build a dictionary of correlation coefficients and p-values
dict_cp = {}

column_titles = [col for col in corrs.index if col!= 'Time from Pickup to Arrival']
for col in column_titles:
    p_val = round(pearsonr(train[col], train['Time from Pickup to Arrival'])[1],6)
    dict_cp[col] = {'Correlation_Coefficient':corrs[col],
                    'P_Value':p_val}

df_cp = pd.DataFrame(dict_cp).T
df_cp_sorted = df_cp.sort_values('P_Value')
df_cp_sorted[df_cp_sorted['P_Value']<0.1]

Unnamed: 0,Correlation_Coefficient,P_Value
Distance (KM),0.580608,0.0
Destination Long,0.070425,0.0
Pickup Long,0.060285,0.0
Pickup Lat,-0.053823,0.0
Destination Lat,-0.061872,0.0
Placement - Day of Month,-0.01471,0.032205
Pickup - Day of Month,-0.014701,0.032312
Arrival at Pickup - Day of Month,-0.014701,0.032312
Confirmation - Day of Month,-0.014701,0.032312


In [7]:

#dropping highly correlated predictors and the ones that were not selected above
train = train.drop(['Placement - Weekday (Mo = 1)', 'Placement - Weekday (Mo = 1)','Confirmation - Day of Month','Confirmation - Weekday (Mo = 1)','Arrival at Pickup - Day of Month','Arrival at Pickup - Weekday (Mo = 1)','Pickup - Day of Month','Pickup - Weekday (Mo = 1)'], axis = 1)

test = test.drop(['Placement - Weekday (Mo = 1)', 'Placement - Weekday (Mo = 1)','Confirmation - Day of Month','Confirmation - Weekday (Mo = 1)','Arrival at Pickup - Day of Month','Arrival at Pickup - Weekday (Mo = 1)','Pickup - Day of Month','Pickup - Weekday (Mo = 1)'], axis = 1)


#dropping the irrelevant columns 
train = train.drop(['User Id','Vehicle Type','Rider Id', 'Confirmation - Time', ], axis = 1)

test = test.drop(['User Id','Vehicle Type','Rider Id', 'Confirmation - Time'], axis = 1)


train.head()



Unnamed: 0,Order No,Platform Type,Personal or Business,Placement - Day of Month,Placement - Time,Arrival at Pickup - Time,Pickup - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Time from Pickup to Arrival
0,Order_No_4211,3,Business,9,9:35:46 AM,10:04:47 AM,10:27:30 AM,4,20.4,0.0,-1.317755,36.83037,-1.300406,36.829741,745
1,Order_No_25375,3,Personal,12,11:16:16 AM,11:40:22 AM,11:44:09 AM,16,26.4,0.0,-1.351453,36.899315,-1.295004,36.814358,1993
2,Order_No_1899,3,Business,30,12:39:25 PM,12:49:34 PM,12:53:03 PM,3,23.258889,0.0,-1.308284,36.843419,-1.300921,36.828195,455
3,Order_No_9336,3,Business,15,9:25:34 AM,9:37:56 AM,9:43:06 AM,9,19.2,0.0,-1.281301,36.832396,-1.257147,36.795063,1341
4,Order_No_27883,1,Personal,13,9:55:18 AM,10:03:53 AM,10:05:23 AM,9,15.4,0.0,-1.266597,36.792118,-1.295041,36.809817,1214



## Encoding the categorical data ##
 - Bussiness column needs to be encoded into dummy variables  so i can be of type int 


In [8]:

test = pd.get_dummies(test)
train = pd.get_dummies(train)

train.head()

Unnamed: 0,Platform Type,Placement - Day of Month,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Time from Pickup to Arrival,...,Pickup - Time_9:59:38 AM,Pickup - Time_9:59:39 AM,Pickup - Time_9:59:42 AM,Pickup - Time_9:59:46 AM,Pickup - Time_9:59:48 AM,Pickup - Time_9:59:51 AM,Pickup - Time_9:59:52 AM,Pickup - Time_9:59:53 AM,Pickup - Time_9:59:57 AM,Pickup - Time_9:59:59 AM
0,3,9,4,20.4,0.0,-1.317755,36.83037,-1.300406,36.829741,745,...,0,0,0,0,0,0,0,0,0,0
1,3,12,16,26.4,0.0,-1.351453,36.899315,-1.295004,36.814358,1993,...,0,0,0,0,0,0,0,0,0,0
2,3,30,3,23.258889,0.0,-1.308284,36.843419,-1.300921,36.828195,455,...,0,0,0,0,0,0,0,0,0,0
3,3,15,9,19.2,0.0,-1.281301,36.832396,-1.257147,36.795063,1341,...,0,0,0,0,0,0,0,0,0,0
4,1,13,9,15.4,0.0,-1.266597,36.792118,-1.295041,36.809817,1214,...,0,0,0,0,0,0,0,0,0,0


## Feature Scaling ##

## Creating y and x metrics ##

## Spliting Data into the training and the test set ##

## Fitting the multivariate Regression model ##

## Assesing model accuracy ##