In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

### Dataset

In [50]:
# Load the trip export from the working directory and preview the first rows
rides = pd.read_csv("trips_data.csv")
rides.head()

Unnamed: 0,City,Product Type,Trip or Order Status,Request Time,Begin Trip Time,Begin Trip Lat,Begin Trip Lng,Dropoff Time,Dropoff Lat,Dropoff Lng,Distance (miles),Fare Amount,Fare Currency
0,458,Black,COMPLETED,2021-01-13 22:06:46 +0000 UTC,2021-01-13 22:11:10 +0000 UTC,-23.599752,-46.715954,2021-01-13 22:29:13 +0000 UTC,-23.559359,-46.666542,4.84,29.63,BRL
1,458,VIP,COMPLETED,2021-01-13 20:21:05 +0000 UTC,2021-01-13 20:27:29 +0000 UTC,-23.559298,-46.666454,2021-01-13 20:54:50 +0000 UTC,-23.599585,-46.715717,5.31,20.86,BRL
2,458,VIP,COMPLETED,2021-01-03 00:23:22 +0000 UTC,2021-01-03 00:33:00 +0000 UTC,-23.626593,-46.65941,2021-01-03 00:50:56 +0000 UTC,-23.559273,-46.666595,5.9,34.23,BRL
3,458,VIP,COMPLETED,2020-12-11 23:16:33 +0000 UTC,2020-12-11 23:27:32 +0000 UTC,-23.55941,-46.666435,2020-12-12 00:32:47 +0000 UTC,-23.425755,-46.48119,19.74,58.77,BRL
4,458,VIP,COMPLETED,2020-12-11 00:35:46 +0000 UTC,2020-12-11 00:46:46 +0000 UTC,-23.584846,-46.66359,2020-12-11 00:57:57 +0000 UTC,-23.559261,-46.66651,2.54,12.63,BRL


In [51]:
# Info: column dtypes and non-null counts, used to spot columns with missing values
rides.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   City                  554 non-null    int64  
 1   Product Type          551 non-null    object 
 2   Trip or Order Status  554 non-null    object 
 3   Request Time          554 non-null    object 
 4   Begin Trip Time       554 non-null    object 
 5   Begin Trip Lat        525 non-null    float64
 6   Begin Trip Lng        525 non-null    float64
 7   Dropoff Time          554 non-null    object 
 8   Dropoff Lat           525 non-null    float64
 9   Dropoff Lng           525 non-null    float64
 10  Distance (miles)      554 non-null    float64
 11  Fare Amount           554 non-null    float64
 12  Fare Currency         551 non-null    object 
dtypes: float64(6), int64(1), object(6)
memory usage: 56.4+ KB


In [52]:
# Checking categories in product_type column
print(rides['Product Type'].value_counts())
# Categories reclassification: collapse spelling / casing variants of the same
# product into one label. Labels that are not keys here are left unchanged.
product_mapping = {'UberX':'UberX','uberX':'UberX','uberX VIP':'UberX','VIP':'UberX','POOL':'Pool','POOL: MATCHED':'Pool','UberBLACK': 'Black',
                   'uberx':'UberX','uberPOOL':'Pool','uberPOOL: MATCHED':'Pool','Pool: MATCHED':'Pool'}
# New categories replacement.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form operates on a temporary Series
# and does not update `rides` when pandas Copy-on-Write is active.
rides['Product Type'] = rides['Product Type'].replace(product_mapping)
# Checking new categories in product_type column
print(rides['Product Type'].value_counts())

UberX                   169
uberX                   144
uberX VIP                81
VIP                      71
POOL                     36
Pool                     12
UberEATS Marketplace     10
POOL: MATCHED             8
uberPOOL: MATCHED         5
uberPOOL                  5
Pool: MATCHED             4
Black                     3
Juntos                    1
UberBLACK                 1
uberx                     1
Name: Product Type, dtype: int64
UberX                   466
Pool                     70
UberEATS Marketplace     10
Black                     4
Juntos                    1
Name: Product Type, dtype: int64


In [53]:
# This analysis is about Uber rides only, so the UberEATS records are removed.
# Rows with a missing Product Type are kept by this `!=` comparison.
# .copy() makes the filtered frame independent of the original, so later column
# assignments (e.g. the datetime conversion) do not raise SettingWithCopyWarning.
rides = rides[rides['Product Type']!='UberEATS Marketplace'].copy()

In [54]:
# Library for manipulating dates and times
from datetime import datetime
from datetime import timedelta
# Function to convert features to datetime
def date_convertion(df, cols):
    '''
    Convert the given columns of a DataFrame to pandas datetimes.

    The literal suffix ' +0000 UTC' is removed from string values before they
    are parsed. Values that are not strings (missing values, or timestamps left
    by a previous run of this cell) are handed to ``pd.to_datetime`` unchanged,
    so the function tolerates NaN/None and can safely be applied again to
    columns that are already converted.

    Parameters:
        df: DataFrame holding the columns to convert; it is modified in place.
        cols: iterable of column names to convert.
    Returns: the same ``df`` object, with the listed columns converted.
    '''
    utc_suffix = ' +0000 UTC'

    def strip_suffix(value):
        # Only strings carry the textual suffix; calling .replace on NaN or on
        # a Timestamp would raise, so everything else passes through untouched.
        return value.replace(utc_suffix, '') if isinstance(value, str) else value

    for col in cols:
        df[col] = pd.to_datetime(df[col].map(strip_suffix))
    return df
# Parse the three trip timestamp columns into datetimes
rides = date_convertion(
    rides,
    ['Request Time', 'Begin Trip Time', 'Dropoff Time'],
)

In [56]:
# Features: every column except the fare and its currency label.
# NOTE(review): X still contains the three datetime columns and the pickup /
# dropoff coordinates, which have missing values; the estimators used below are
# likely to reject such input -- confirm before calling evaluate_models.
X = rides.drop(columns=['Fare Amount', 'Fare Currency'])  # Independent
# Target: the 'Fare Amount' column
y = rides['Fare Amount']  # Dependent

In [59]:
# Categorical Encoding: replace each object-dtype column with integer labels
le = LabelEncoder()
categorical_features = [name for name, dtype in X.dtypes.items() if dtype == 'O']
for name in categorical_features:
    # The single encoder is re-fitted per column, so after the loop `le`
    # only holds the classes of the last encoded column.
    X[name] = le.fit_transform(X[name])

In [63]:
# metrics
def evaluate_clf(true, predicted):
    '''
    Score predictions against the true values with three regression metrics.

    Parameters:
        true: ground-truth target values.
        predicted: values predicted by a model, aligned with ``true``.
    Returns: tuple ``(mae, mse, r2)`` -- mean absolute error, mean squared
    error and R^2 score, in that order.
    '''
    return (
        mean_absolute_error(true, predicted),
        mean_squared_error(true, predicted),
        r2_score(true, predicted),
    )

In [64]:
# Create a function which can evaluate models and return a report 
def evaluate_models(X, y, models):
    '''
    Fit every model on one shared train/test split and report its metrics.

    The data is split 80/20 with a fixed ``random_state`` so that all models
    are compared on exactly the same rows. Each estimator in ``models`` is
    fitted in place, scored with ``evaluate_clf`` on both partitions, and its
    metrics are printed.

    Parameters:
        X: feature table accepted by ``train_test_split`` and by the
           estimators' ``fit`` / ``predict``.
        y: target values aligned with ``X``.
        models: dict mapping a display name to an unfitted estimator.
    Returns: DataFrame with one row per model (in dictionary order) and the
    columns 'Model', 'Train MAE', 'Train MSE', 'Train R2', 'Test MAE',
    'Test MSE', 'Test R2'.
    '''
    # separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

    report_columns = ['Model', 'Train MAE', 'Train MSE', 'Train R2',
                      'Test MAE', 'Test MSE', 'Test R2']
    report_rows = []

    for model_name, model in models.items():
        model.fit(X_train, y_train) # Train model

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        model_train_mae, model_train_mse, model_train_r2 = evaluate_clf(y_train, y_train_pred)

        # Test set performance
        model_test_mae, model_test_mse, model_test_r2 = evaluate_clf(y_test, y_test_pred)

        print(model_name)

        print('Model performance for Training set')
        print('- MAE : {:.4f}'.format(model_train_mae))
        print('- MSE: {:.4f}'.format(model_train_mse))
        print('- r2: {:.4f}'.format(model_train_r2))

        print('----------------------------------')

        print('Model performance for Test set')
        print('- MAE : {:.4f}'.format(model_test_mae))
        print('- MSE: {:.4f}'.format(model_test_mse))
        print('- r2: {:.4f}'.format(model_test_r2))
        print('='*35)
        print('\n')

        # Collect the numbers so callers get a comparable table, not just prints
        report_rows.append({
            'Model': model_name,
            'Train MAE': model_train_mae,
            'Train MSE': model_train_mse,
            'Train R2': model_train_r2,
            'Test MAE': model_test_mae,
            'Test MSE': model_test_mse,
            'Test R2': model_test_r2,
        })

    # Passing columns explicitly keeps the schema stable even for an empty dict
    return pd.DataFrame(report_rows, columns=report_columns)

In [65]:
# Candidate regressors compared by evaluate_models, keyed by display name.
# The tree-based estimators are randomised; they get the same seed as the
# train/test split (42) so that a re-run reproduces the same metrics.
models = {
        "DecisionTreeRegressor":DecisionTreeRegressor(random_state=42),
        "KNN": KNeighborsRegressor(),
        "Linear":LinearRegression(),
        "RandomForest":RandomForestRegressor(random_state=42),
        "SVR":SVR()
}

In [72]:
# results = evaluate_models(X,y,models)