In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import joblib
from geopy.distance import distance
from sklearn.model_selection import train_test_split
import datetime
import matplotlib

from sklearn.preprocessing import StandardScaler

from regression import Regression

In [2]:
# Location of the pre-sampled dataset, relative to the notebook's working directory.
RANDOM_DATASET_PATH = 'data/random_dataset.joblib'

# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading, so only point this at files from a trusted source.
random_dataset = joblib.load(RANDOM_DATASET_PATH)

## Data Processing & Analysis

In [3]:
# strptime pattern used by datetime2HoursOnly to parse a timestamp string;
# the trailing " UTC" is matched as literal text.
date_time_format = '%Y-%m-%d %H:%M:%S UTC'
# strftime pattern used by datetime2HoursOnly for its result:
# the zero-padded 24-hour clock hour ("00" to "23").
time_format = '%H'

def getDistance(pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude):
    """Return the distance between the pickup and dropoff points in kilometers.

    The arguments arrive in (longitude, latitude) order, but geopy expects
    each point as a (latitude, longitude) pair, so the coordinates are
    swapped when the two points are built.
    """
    pickup = (pickup_latitude, pickup_longitude)
    dropoff = (dropoff_latitude, dropoff_longitude)

    # `.kilometers` means the result is in km (the previous comment said meters).
    return distance(pickup, dropoff).kilometers
    
def datetime2HoursOnly(date_time):
    """Extract the hour of day from a timestamp string.

    `date_time` is parsed with `date_time_format`; the hour is returned as a
    string rendered with `time_format`.
    """
    return datetime.datetime.strptime(date_time, date_time_format).strftime(time_format)
    
def valid_row(longitude, latitude):
    """Tell whether a point lies strictly inside the accepted bounding box.

    The box spans longitudes -78 to -70 and latitudes 38 to 45; points on
    the border are rejected.
    """
    longitude_ok = -78 < longitude < -70
    latitude_ok = 38 < latitude < 45
    return longitude_ok and latitude_ok

In [4]:
# Cleaning up data. Common errors in the raw coordinates are:
# - Lat & long being switched
# - Decimal place of lat / long off by a factor of 10
# - 0000 as lat or long
# None of these are repaired: every ride whose pickup or dropoff point falls
# outside the bounding box below is removed.

def _inside_bounds(longitude, latitude):
    """Element-wise bounding-box test on two Series (same limits as valid_row)."""
    return (longitude < -70) & (longitude > -78) & (latitude > 38) & (latitude < 45)

# Column-wise comparisons replace the row-by-row apply(axis=1).
# The "valid" flag is still written to random_dataset, exactly as before.
random_dataset["valid"] = (
    _inside_bounds(random_dataset['pickup_longitude'], random_dataset['pickup_latitude'])
    & _inside_bounds(random_dataset['dropoff_longitude'], random_dataset['dropoff_latitude'])
)

# Keep the flagged rows; the helper column is not carried into `dataset`.
dataset = random_dataset[random_dataset["valid"]].drop(columns=["valid"])

In [5]:
# Trip length in kilometers. getDistance works on one pair of points at a
# time, hence the row-wise apply.
dataset['distance'] = dataset.apply(
    lambda row: getDistance(row['pickup_longitude'], row['pickup_latitude'],
                            row['dropoff_longitude'], row['dropoff_latitude']),
    axis=1,
)

# Pickup hour as a string. Only one column is involved, so map over that
# Series instead of applying a function to every row of the frame.
dataset['time'] = dataset['pickup_datetime'].map(datetime2HoursOnly)

# The raw timestamp is no longer needed. The new columns are added before the
# drop so the column order stays the same (features are later picked by position).
dataset = dataset.drop(columns=['pickup_datetime'])

# Discard zero-length trips and rides without passengers.
dataset = dataset[(dataset.distance > 0) & (dataset.passenger_count > 0)]

In [6]:
# Basic analysis: relationship of passenger count, distance and pickup time to the fare amount.
# Here: fare amount plotted against trip distance.
dataset.plot(kind="scatter", x="distance", y="fare_amount")

<matplotlib.axes._subplots.AxesSubplot at 0x1a17c285c0>

In [7]:
# Average fare for each passenger count, returned as a two-column frame.
dataset.groupby('passenger_count')['fare_amount'].mean().reset_index()

Unnamed: 0,passenger_count,fare_amount
0,1,11.279469
1,2,11.820995
2,3,10.932118
3,4,13.0875
4,5,12.086203
5,6,10.016277


## Model Selection

In [8]:
# Features are chosen by column position (2 through 8).
# NOTE(review): presumably these are the four coordinates, passenger count,
# distance and time -- confirm against dataset.columns.
feature_columns = slice(2, 9)
X = dataset.iloc[:, feature_columns]

# Target: the fare to predict.
y = dataset.loc[:, 'fare_amount']

__Linear Regression__

In [9]:
# Linear regression through the project's Regression wrapper, which exposes
# its train/test split as attributes.
# NOTE(review): normalize=True presumably enables feature scaling -- confirm in regression.py.
model = Regression(X=X, y=y, model_type="linear_regression", normalize=True)
model.fit(model.X_train, model.y_train)

# Same call order as before: held-out set first, then the training set.
y_pred = model.predict(model.X_test)
y_train_pred = model.predict(model.X_train)

train_result = model.validate_prediction(model.y_train, y_train_pred)
test_result = model.validate_prediction(model.y_test, y_pred)

for label, result in (("Training_Set", train_result), ("Test_Set", test_result)):
    print("{} {}".format(label, result))

Training_Set mse = 51.01912162938214 & mae = 3.4893914622145608 & rmse = 7.142767084917591
Test_Set mse = 31.195618639704144 & mae = 3.213952241687233 & rmse = 5.585303809078262


__Stochastic Gradient Descent__

In [10]:
# Stochastic gradient descent regressor through the project's Regression
# wrapper, which exposes its train/test split as attributes.
# NOTE(review): normalize=True presumably enables feature scaling -- confirm in regression.py.
model = Regression(X=X, y=y, model_type="stochastic_gradient_descent", normalize=True)
model.fit(model.X_train, model.y_train)

# Same call order as before: held-out set first, then the training set.
y_pred = model.predict(model.X_test)
y_train_pred = model.predict(model.X_train)

train_result = model.validate_prediction(model.y_train, y_train_pred)
test_result = model.validate_prediction(model.y_test, y_pred)

for label, result in (("Training_Set", train_result), ("Test_Set", test_result)):
    print("{} {}".format(label, result))

Training_Set mse = 58.31019264862153 & mae = 4.261747263935154 & rmse = 7.636111094570425
Test_Set mse = 47.020963029662916 & mae = 4.023059786064784 & rmse = 6.857183316031657


__SVR__

In [11]:
# Support vector regression through the project's Regression wrapper, which
# exposes its train/test split as attributes.
# NOTE(review): normalize=True presumably enables feature scaling -- confirm in regression.py.
model = Regression(X=X, y=y, model_type="svr", normalize=True)
model.fit(model.X_train, model.y_train)

# Same call order as before: held-out set first, then the training set.
y_pred = model.predict(model.X_test)
y_train_pred = model.predict(model.X_train)

train_result = model.validate_prediction(model.y_train, y_train_pred)
test_result = model.validate_prediction(model.y_test, y_pred)

for label, result in (("Training_Set", train_result), ("Test_Set", test_result)):
    print("{} {}".format(label, result))

Training_Set mse = 37.128493406794576 & mae = 2.6023862341407553 & rmse = 6.093315469167387
Test_Set mse = 32.03180733030998 & mae = 2.57193630050366 & rmse = 5.659664948591036


__Decision Tree Regression__

In [12]:
# Decision tree regression through the project's Regression wrapper, which
# exposes its train/test split as attributes.
# NOTE(review): normalize=True presumably enables feature scaling -- confirm in regression.py.
# NOTE(review): the recorded output shows a training error of practically zero
# against a test rmse of about 5.3, i.e. the tree reproduces the training set
# exactly; treat the training metrics as uninformative.
model = Regression(X=X, y=y, model_type="decision_tree", normalize=True)
model.fit(model.X_train, model.y_train)

# Same call order as before: held-out set first, then the training set.
y_pred = model.predict(model.X_test)
y_train_pred = model.predict(model.X_train)

train_result = model.validate_prediction(model.y_train, y_train_pred)
test_result = model.validate_prediction(model.y_test, y_pred)

for label, result in (("Training_Set", train_result), ("Test_Set", test_result)):
    print("{} {}".format(label, result))

Training_Set mse = 1.5778675718805345e-31 & mae = 2.9376189417280564e-17 & rmse = 3.972238124635197e-16
Test_Set mse = 27.93438234280792 & mae = 2.9468992248062014 & rmse = 5.285298699487846
