## Data Preprocessing

### Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import rc
import seaborn as sns

### Import Data


In [None]:
# Read the three input files: labelled training data, the unseen data to
# predict on, and the riders table.
train_set, unseen_set, riders_set = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'Riders.csv')
)

In [None]:
# Collect the columns that are present in Train.csv but not in Test.csv.
# They are used later as the dependent-variable (target) columns.

# Build the lookup set once instead of re-materialising the column list on
# every loop iteration.
unseen_columns = set(unseen_set.columns)

# Iterating train_set.columns keeps the targets in their Train.csv order,
# which later cells rely on when indexing predictions by position.
dep_var = [col for col in train_set.columns if col not in unseen_columns]

dep_var

In [None]:
# Preview the first rows of the training data as loaded from Train.csv
train_set.head()

In [None]:
# Use the 'Order No' column as the row index so it is no longer a regular
# column of the frame, then preview the result.
train_set = train_set.set_index('Order No')
train_set.head()

## Taking care of missing data

In [None]:
# Fill missing values in the two columns at positions 20 and 21 with each
# column's own mean (the Temperature and Precipitation columns, per the
# original note -- the selection is positional, so it depends on column order).

from sklearn.impute import SimpleImputer

weather_cols = slice(20, 22)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train_set.iloc[:, weather_cols] = imputer.fit_transform(train_set.iloc[:, weather_cols])

train_set.head()

## Dealing with categorical data

In [None]:
# Dummy-encode the categorical 'Personal or Business' and 'Vehicle Type'
# columns; drop_first=True leaves out the first level of each column.

categorical_cols = ['Personal or Business', 'Vehicle Type']
train_set = pd.get_dummies(
    train_set,
    columns=categorical_cols,
    drop_first=True,
)
train_set.head()

In [None]:
# Parse each time-of-day column and keep only the hour on a 24-hour clock.
# strftime('%H') yields a zero-padded two-character string, not an integer.

time_columns = [
    'Placement - Time',
    'Confirmation - Time',
    'Arrival at Pickup - Time',
    'Pickup - Time',
    'Arrival at Destination - Time',
]
for time_col in time_columns:
    parsed_times = pd.to_datetime(train_set[time_col])
    train_set[time_col] = parsed_times.dt.strftime('%H')

train_set.head()

In [None]:
# Keep only the part of each 'User Id' string from character 8 onward
# (presumably the numeric suffix after a fixed-length prefix -- TODO confirm
# against the raw data). The result is still a string, not an int.
#
# The vectorised .str slice replaces a per-row `series[i]` lookup: the frame is
# indexed by 'Order No', so integer keys there depended on pandas' deprecated
# positional fallback for Series.__getitem__.

train_set['User Id'] = train_set['User Id'].str[8:]

In [None]:
# Keep only the part of each 'Rider Id' string from character 9 onward
# (presumably the numeric suffix after a fixed-length prefix -- TODO confirm
# against the raw data). The result is still a string, not an int.
#
# The vectorised .str slice replaces a per-row `series[i]` lookup: the frame is
# indexed by 'Order No', so integer keys there depended on pandas' deprecated
# positional fallback for Series.__getitem__.

train_set['Rider Id'] = train_set['Rider Id'].str[9:]

In [None]:
# Preview the training data after the 'User Id' and 'Rider Id' clean-up
train_set.head()

In [None]:
# Move the dependent-variable columns to the end of the frame so features and
# targets can be separated cleanly; relative order within each group is kept.

feature_cols = [col for col in train_set.columns if col not in dep_var]
train_set = train_set[feature_cols + dep_var]
train_set.head()

In [None]:
# Split the frame into a feature matrix X and a target matrix y.
#
# The targets are selected by name via dep_var rather than by a hard-coded
# "last 4 columns" slice, so the split no longer depends on the number of
# target columns or on the frame having been reordered first.
# y keeps the column order of dep_var, which later cells index by position.

X = train_set.drop(columns=dep_var).values
y = train_set[dep_var].values

## Splitting the dataset into the Training set and Validation set

In [None]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes
# the split reproducible between runs.

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Training the Multiple Linear Regression model on the Training set

In [None]:
# Fit a linear regression model on the training split.
# y_train holds all dependent-variable columns, so one model is fitted to
# predict every target at once.

from sklearn.linear_model import LinearRegression

regressor = LinearRegression()

regressor.fit(X_train, y_train)

## Predicting the Validation set results

In [None]:
# Predict every dependent-variable column for the held-out validation features

y_pred = regressor.predict(X_val)

In [None]:
# Display the predictions made for the validation features
y_pred

In [None]:
# Display the actual validation targets for comparison with the predictions
y_val

In [None]:
# Report the Root Mean Squared Error between actual and predicted values for
# the target at column position 3 (the position later written out as
# 'Time from Pickup to Arrival').

from math import sqrt
from sklearn import metrics

target_idx = 3
mse = metrics.mean_squared_error(y_val[:, target_idx], y_pred[:, target_idx])

print('RMSE:', sqrt(mse))

## Predicting the unseen data results

In [None]:
# Reload Test.csv into unseen_set so the preprocessing below starts from the
# raw, unmodified file.

unseen_set = pd.read_csv('Test.csv')

In [None]:
# Apply the same preprocessing steps to the unseen data that were applied to
# the training data, so its columns are comparable to the training features.

unseen_set = unseen_set.set_index('Order No')

# Mean-impute the two columns at positions 17 and 18. Presumably these are the
# same Temperature/Precipitation columns imputed at 20:22 in the training
# frame -- TODO confirm, the selection is positional.
# NOTE(review): the imputer is re-fitted here, so the fill values are the
# unseen-set means rather than the means learned from the training data.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(unseen_set.iloc[:, 17:19])
unseen_set.iloc[:, 17:19] = imputer.transform(unseen_set.iloc[:, 17:19])

# NOTE(review): the dummy columns produced here only match the training ones
# if both files contain the same category levels -- confirm.
unseen_set = pd.get_dummies(unseen_set, columns = ['Personal or Business', 'Vehicle Type'], drop_first=True)

# Reduce each time-of-day column to its zero-padded 24-hour hour string.
for time_col in (
    'Placement - Time',
    'Confirmation - Time',
    'Arrival at Pickup - Time',
    'Pickup - Time',
):
    unseen_set[time_col] = pd.to_datetime(unseen_set[time_col]).dt.strftime('%H')

# Keep only the trailing part of the id strings (from character 8 / 9 onward).
# Vectorised .str slicing replaces per-row `series[i]` lookups, which relied on
# pandas' deprecated positional fallback because the frame is indexed by
# 'Order No' rather than by 0..n-1.
unseen_set['User Id'] = unseen_set['User Id'].str[8:]

unseen_set['Rider Id'] = unseen_set['Rider Id'].str[9:]

unseen_set.head()

In [None]:
# Use every column of the preprocessed unseen frame as a feature.
# NOTE(review): the column order must match the feature columns the model was
# trained on -- confirm.

X_test = unseen_set.values

In [None]:
# Predict the dependent variables for the unseen data with the trained model

sub_pred = regressor.predict(X_test)

In [None]:
# Display the raw predictions for the unseen data
sub_pred

In [None]:
# Attach each predicted target to the unseen frame as its own column; the
# position in the list is the column position within sub_pred.

predicted_columns = [
    'Arrival at Destination - Day of Month',
    'Arrival at Destination - Weekday (Mo = 1)',
    'Arrival at Destination - Time',
    'Time from Pickup to Arrival',
]
for position, column_name in enumerate(predicted_columns):
    unseen_set[column_name] = sub_pred[:, position]

unseen_set.head()

## Exporting data for submission

In [None]:
# Keep only the predicted 'Time from Pickup to Arrival' column (the last one
# added to the frame) for the submission, and display it.

submission = unseen_set['Time from Pickup to Arrival']
submission

In [None]:
# Write the submission (the single predicted column, indexed by 'Order No')
# to 'submission.csv'.

submission.to_csv('submission.csv')