## Importing the libraries ##
- We going to use the numpy libraries to use numpy arrays 
- We going to use pandas to load, merge  and modify our dataset
- matplotlib and seaborn libraries are going to be used to plot the model 

In [7]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D



## Loading the dataset ##

- **Pandas** is used to load the datafiles into our workspace
- four datafiles were loaded named Riders, Test,Train as well as VariableDefinitions


In [2]:
riders = pd.read_csv("./regression data/Riders.csv")
test = pd.read_csv("./regression data/Test.csv")
train = pd.read_csv("./regression data/Train.csv")
variableDefinitions= pd.read_csv("./regression data/VariableDefinitions.csv")



train.head(2)

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Arrival at Destination - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,10:39:55 AM,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,12:17:22 PM,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993


## Data Preprocessing ##
 - Some columns needs to dropped
 - train test and riders needs to be merged 
 - nulls values needs to be dealt with

In [3]:
# Cleaning the data

# Allignment of Dataset

train = train[['Order No', 'User Id', 'Vehicle Type', 'Platform Type',
       'Personal or Business', 'Placement - Day of Month',
       'Placement - Weekday (Mo = 1)', 'Placement - Time',
       'Confirmation - Day of Month', 'Confirmation - Weekday (Mo = 1)',
       'Confirmation - Time', 'Arrival at Pickup - Day of Month',
       'Arrival at Pickup - Weekday (Mo = 1)', 'Arrival at Pickup - Time',
       'Pickup - Day of Month', 'Pickup - Weekday (Mo = 1)', 'Pickup - Time',
       'Distance (KM)', 'Temperature', 'Precipitation in millimeters',
       'Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long',
       'Rider Id']]
       


# check which data type we are dealing with
train.dtypes 
test.dtypes



train.head()
    







Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Pickup - Weekday (Mo = 1),Pickup - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,5,10:27:30 AM,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,5,11:44:09 AM,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,...,2,12:53:03 PM,3,,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,...,5,9:43:06 AM,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,...,1,10:05:23 AM,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770


## Check for Corellated predictors ( x variables) ##

In [4]:
train.corr()
test.corr()
# dropping highly correlated predictors
train = train.drop(['Placement - Weekday (Mo = 1)', 'Placement - Weekday (Mo = 1)','Confirmation - Day of Month','Confirmation - Weekday (Mo = 1)','Arrival at Pickup - Day of Month','Arrival at Pickup - Weekday (Mo = 1)','Pickup - Day of Month','Pickup - Weekday (Mo = 1)'], axis = 1)

test = test.drop(['Placement - Weekday (Mo = 1)', 'Placement - Weekday (Mo = 1)','Confirmation - Day of Month','Confirmation - Weekday (Mo = 1)','Arrival at Pickup - Day of Month','Arrival at Pickup - Weekday (Mo = 1)','Pickup - Day of Month','Pickup - Weekday (Mo = 1)'], axis = 1)

#dropping the irrelevant columns 
train = train.drop(['Order No', 'User Id','Vehicle Type','Rider Id','Placement - Time', 'Confirmation - Time', 'Arrival at Pickup - Time', 'Pickup - Time'], axis = 1)

test = test.drop(['Order No', 'User Id','Vehicle Type','Rider Id','Placement - Time', 'Confirmation - Time', 'Arrival at Pickup - Time', 'Pickup - Time'], axis = 1)


test.head()



Unnamed: 0,Platform Type,Personal or Business,Placement - Day of Month,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long
0,3,Business,27,8,,,-1.333275,36.870815,-1.305249,36.82239
1,3,Business,17,5,,,-1.272639,36.794723,-1.277007,36.823907
2,3,Business,27,5,22.8,,-1.290894,36.822971,-1.276574,36.851365
3,3,Business,17,5,24.5,,-1.290503,36.809646,-1.303382,36.790658
4,3,Business,11,6,24.4,,-1.281081,36.814423,-1.266467,36.792161



## Encoding the categorical data ##
 - Bussiness column needs to be encoded into dummy variables  so i can be of type int 


In [6]:
test = pd.get_dummies(test)
train = pd.get_dummies(train)

# filling the missing data

train = train.fillna(train.mean())
test =  test.fillna(test.mean())

test.head()

Unnamed: 0,Platform Type,Placement - Day of Month,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Personal or Business_Business,Personal or Business_Personal
0,3,27,8,23.24612,6.651759,-1.333275,36.870815,-1.305249,36.82239,1,0
1,3,17,5,23.24612,6.651759,-1.272639,36.794723,-1.277007,36.823907,1,0
2,3,27,5,22.8,6.651759,-1.290894,36.822971,-1.276574,36.851365,1,0
3,3,17,5,24.5,6.651759,-1.290503,36.809646,-1.303382,36.790658,1,0
4,3,11,6,24.4,6.651759,-1.281081,36.814423,-1.266467,36.792161,1,0


## Feature Scaling ##

## Creating y and x metrics ##

## Spliting Data into the training and the test set ##

## Fitting the multivariate Regression model ##

## Assesing model accuracy ##