In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from random import randrange, uniform
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [None]:
#set the working directory depending on where the csv file is stored
#os.getcwd()

In [None]:
# Load the training set, then convert blank strings and the literal text "NAN"
# into real missing values so that isnull()/fillna() can see them.
# `np.nan` is the canonical spelling; the legacy upper-case aliases are not
# part of the NumPy 2.x namespace.
training_data = pd.read_csv("train_cab.csv", encoding="ISO-8859-1")
training_data = training_data.replace(["", " ", "NAN"], np.nan)

In [None]:
# Load the hold-out test file with the same encoding as the training file
testing_data = pd.read_csv(filepath_or_buffer="test.csv", encoding="ISO-8859-1")

In [None]:
# Count the missing values of every column and keep the counts in a data frame
# (the single resulting column is labelled 0)
missing_val = pd.DataFrame(data=training_data.isna().sum())

In [None]:
# Move the column names out of the index into a regular column called 'index'
missing_val = missing_val.reset_index(drop=False)

In [None]:
# Give the two columns readable names: the former index becomes 'variables' and
# the column labelled 0 becomes 'Missing_percentage'
missing_val = missing_val.rename(
    columns={
        'index': 'variables',
        0: 'Missing_percentage',
    }
)

In [None]:
# Turn the values into a percentage of the total number of training rows.
# Note: this rescales the column in place, so it must be run once per fresh table.
missing_val['Missing_percentage'] = (
    missing_val['Missing_percentage'].div(len(training_data)).mul(100)
)

In [None]:
# Order the variables from the largest to the smallest value and renumber the rows
missing_val = (
    missing_val
    .sort_values(by='Missing_percentage', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Write the summary table to disk (without the row index) and display it
missing_val.to_csv(path_or_buf="Missing_perc.csv", index=False)
missing_val

In [None]:
# Laying a framework to choose best method for imputing missing value
# Record the data
# Actual Value = 40.71
# median       = 40.75
# mean         = 39.91

In [None]:
# Blank out one known value so the imputation methods can be compared against it.
# A single .loc[row, column] assignment is used instead of the chained
# training_data[col].loc[row] form: the chained form may write to a temporary
# copy and does not modify the frame when pandas copy-on-write is active.
training_data.loc[2, 'pickup_latitude'] = np.nan

In [None]:
# Mean imputation: every missing 'pickup_latitude' receives the column mean
training_data['pickup_latitude'] = training_data['pickup_latitude'].fillna(
    value=training_data['pickup_latitude'].mean()
)

In [None]:
# Median imputation: every missing 'pickup_latitude' receives the column median.
# NOTE(review): if the mean-imputation cell was executed just before this one,
# there is presumably nothing left to fill here - re-create the missing value first.
training_data['pickup_latitude'] = training_data['pickup_latitude'].fillna(
    value=training_data['pickup_latitude'].median()
)

In [None]:
# Put the actual value back in place of the artificially created missing value.
# Written as one .loc[row, column] assignment so the frame itself is updated
# (a chained training_data[col].loc[row] assignment may only touch a copy).
training_data.loc[2, 'pickup_latitude'] = 40.71

In [None]:
# Only the two variables 'passenger_count' and 'fare_amount' have missing values.
# The median came out closest to the actual value in the trial above, so the
# median is used to fill 'passenger_count'.
training_data["passenger_count"] = training_data["passenger_count"].fillna(
    value=training_data["passenger_count"].median()
)

In [None]:
# "fare_amount" is stored as object type, so mean/median cannot be applied;
# fill its gaps with the most frequent value (first entry of value_counts()).
training_data['fare_amount'] = training_data['fare_amount'].fillna(
    value=training_data['fare_amount'].value_counts().index[0]
)

In [None]:
# Count the missing values per column again to verify the imputation
missing_val_new = pd.DataFrame(data=training_data.isna().sum())

In [None]:
# Outlier analysis (boxplot method)
# Keep an independent copy of the data as it is before any rows are removed
df = training_data.copy(deep=True)

In [None]:
# Plot boxplot visualise outliers
%matplotlib inline
# boxplot can be plotted for individual variables as shown below
plt.boxplot(training_data["dropoff_latitude"])

In [None]:
# Names of the numeric columns that go through outlier and correlation analysis
cnames = [
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
    "passenger_count",
]

In [None]:
# Detect and delete outliers with the boxplot (1.5 * IQR) rule, one column at a time.
# Each column's fences are computed on the rows that survived the previous columns.
# The fences are named lower_fence / upper_fence instead of min / max so the
# Python built-ins min() and max() are not overwritten for the rest of the session.
for i in cnames:
    print(i)
    q75, q25 = np.percentile(training_data.loc[:, i], [75, 25])
    iqr = q75 - q25

    lower_fence = q25 - (1.5 * iqr)
    upper_fence = q75 + (1.5 * iqr)
    print(lower_fence)
    print(upper_fence)

    # Rows strictly below the lower fence or strictly above the upper fence are dropped;
    # values equal to a fence are kept.
    outside = (training_data.loc[:, i] < lower_fence) | (training_data.loc[:, i] > upper_fence)
    training_data = training_data.drop(training_data[outside].index)

In [None]:
# After outliers are removed the number of observations dropped from 16067 to 9457 using boxplot


In [None]:
### Check for collinearity among the variables before training the model
# Correlation analysis is run on the numeric columns only
num_corr = training_data[cnames]

In [None]:
# Set the width and height of the plot (h is the figure, wd the axes)
h, wd = plt.subplots(figsize=(7,5))

# Generate the correlation matrix of the numeric columns
corr = num_corr.corr()

# Plot using the seaborn library.
# The mask is all False, so no cell of the matrix is hidden. The builtin `bool`
# is used as dtype because the `np.bool` alias was removed in NumPy 1.24 and
# raises AttributeError there.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=wd)

In [None]:
# passenger_count is not correlated with any of the other variables

In [None]:
# Chi-square test of independence between "fare_amount" and "pickup_datetime",
# run on their cross-tabulation; only the p-value is printed
chi2, p, dof, ex = chi2_contingency(
    observed=pd.crosstab(
        index=training_data["fare_amount"],
        columns=training_data["pickup_datetime"],
    )
)
print(p)

In [None]:
# Since the p-value is greater than 0.05 we fail to reject the null hypothesis, i.e. "pickup_datetime"
# is treated as independent of the dependent variable and is excluded

In [None]:
# Dimension reduction: remove the variables judged above to be unrelated to the
# target variable; the result is a new frame, training_data is left as it is
training_data_deleted = training_data.drop(columns=["pickup_datetime", "passenger_count"])

In [None]:
## Normality Check
%matplotlib inline
plt.hist(training_data["pickup_longitude"],bins = "auto")

In [None]:
# Based on the histogram the data is scaled by standardisation.
# Redefine the list of numeric columns to the four coordinates that get scaled.
cnames = [
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
]
cnames

In [None]:
# Standardise every listed column: subtract its mean, then divide by its standard deviation
for i in cnames:
    print(i)
    training_data_deleted[i] = (
        training_data_deleted[i]
        .sub(training_data_deleted[i].mean())
        .div(training_data_deleted[i].std())
    )

In [None]:
# Using a decision tree for training and testing data
# Divide into train and test data by 80-20.
# random_state is fixed so that the split - and therefore every error figure
# computed from it - is identical on each run of the notebook.
train, test = train_test_split(training_data_deleted, test_size = 0.2, random_state = 42)

In [None]:
# Fit a depth-2 decision tree regressor on the training split.
# NOTE(review): features are picked by position (columns 2 to 4) and the target is
# column 1 of `train` - confirm these positions are the intended predictors and target.
fit_DT = DecisionTreeRegressor(max_depth=2).fit(
    X=train.iloc[:, 2:5],
    y=train.iloc[:, 1],
)

In [None]:
# Predict on the test split, using the same column positions (2 to 4) as in training
predictions_DT = fit_DT.predict(X=test.iloc[:, 2:5])

In [None]:
# Calculation of error
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (not multiplied by 100).

    Both inputs are converted to float NumPy arrays before the arithmetic, so the
    values are compared element by element in positional order. This means pandas
    index labels are ignored (no label alignment, hence no spurious NaN when the
    two inputs carry different indices) and plain Python sequences are accepted.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / |y_true|``. Multiply by 100 for a percentage.
        A zero in ``y_true`` makes the result inf or nan because of the division.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    mape = np.mean(np.abs((actual - predicted) / actual))
    return mape

In [None]:
# Error of the tree's predictions against column 1 of the test split
MAPE(y_true=test.iloc[:, 1], y_pred=predictions_DT)

In [None]:
# Error rate = 4.09%
# Accuracy = 95.91%