In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pylab
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from math import radians, sin, cos, sqrt, asin
from IPython.display import Markdown, display
from tqdm import tqdm

from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge

plt.style.use('fivethirtyeight')

In [6]:
df = pd.read_csv('DS_Miniproject/UberDataset.csv')
# df.drop(['Unnamed: 0','key'], axis=1, inplace=True)
df.head()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
0,01-01-2016 21:11,01-01-2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,01-02-2016 01:25,01-02-2016 01:37,Business,Fort Pierce,Fort Pierce,5.0,
2,01-02-2016 20:25,01-02-2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,01-05-2016 17:31,01-05-2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,01-06-2016 14:42,01-06-2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


In [3]:
df.isnull().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [4]:
df.duplicated().sum()

0

In [5]:
df.dropna(inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199999 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   fare_amount        199999 non-null  float64            
 1   pickup_datetime    199999 non-null  datetime64[ns, UTC]
 2   pickup_longitude   199999 non-null  float64            
 3   pickup_latitude    199999 non-null  float64            
 4   dropoff_longitude  199999 non-null  float64            
 5   dropoff_latitude   199999 non-null  float64            
 6   passenger_count    199999 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(1)
memory usage: 12.2 MB


#### This part of the code filters the DataFrame df to retain only those rows where the pickup and dropoff latitude and longitude values    are within valid geographical coordinates on Earth's surface. The conditions specified in the code ensure that:

- pickup_latitude is less than 90 and greater than -90 (valid latitude range).\
- dropoff_latitude is less than 90 and greater than -90 (valid latitude range).\
- pickup_longitude is less than 180 and greater than -180 (valid longitude range).\
- dropoff_longitude is less than 180 and greater than -180 (valid longitude range).


In [8]:
df = df[(df.pickup_latitude<90) & (df.dropoff_latitude<90) &
        (df.pickup_latitude>-90) & (df.dropoff_latitude>-90) &
        (df.pickup_longitude<180) & (df.dropoff_longitude<180) &
        (df.pickup_longitude>-180) & (df.dropoff_longitude>-180)]

In [10]:
df['year'] = df.pickup_datetime.dt.year
df['month'] = df.pickup_datetime.dt.month
df['weekday'] = df.pickup_datetime.dt.weekday
df['hour'] = df.pickup_datetime.dt.hour

df['Monthly_Quarter'] = df.month.map({1:'Q1',2:'Q1',3:'Q1',4:'Q2',5:'Q2',6:'Q2',7:'Q3',
                                      8:'Q3',9:'Q3',10:'Q4',11:'Q4',12:'Q4'})

df['Hourly_Segments'] = df.hour.map({0:'H1',1:'H1',2:'H1',3:'H1',4:'H2',5:'H2',6:'H2',7:'H2',8:'H3',
                                     9:'H3',10:'H3',11:'H3',12:'H4',13:'H4',14:'H4',15:'H4',16:'H5',
                                     17:'H5',18:'H5',19:'H5',20:'H6',21:'H6',22:'H6',23:'H6'})

In [11]:
df.drop(['pickup_datetime','month', 'hour',], axis=1, inplace=True)

### Haversine formula :

a = sin²(Δφ/2) + cos(φ1) * cos(φ2) * sin²(Δλ/2)\
c = 2 * atan2(√a, √(1-a))\
distance = R * c

- φ1,φ2 : latitudes of the two points (in radians).\
- Δφ : difference in latitude between the two points.\
- Δλ : difference in longitude between the two points.\
- R : radius of the Earth (approximately 6,371 kilometers).\

distance is the great-circle distance between the two points (also known as the "as-the-crow-flies" distance).
The geopy library, which is used in the provided code, encapsulates this Haversine formula and provides a convenient way to calculate distances between geographic coordinates accurately.\

Using the Haversine formula ensures that the calculated distances between pickup and dropoff coordinates are more accurate for real-world distances on the Earth's surface, especially over long distances or near the poles where the Earth's curvature becomes more significant. This makes it suitable for various applications, such as calculating travel distances, optimizing routes, or geospatial analysis.




In [13]:
def distance_transform(longitude1, latitude1, longitude2, latitude2):
    travel_dist = []
    
    for pos in range(len(longitude1)):
        long1,lati1,long2,lati2 = map(radians,[longitude1[pos],latitude1[pos],longitude2[pos],latitude2[pos]])
        dist_long = long2 - long1
        dist_lati = lati2 - lati1
        a = sin(dist_lati/2)**2 + cos(lati1) * cos(lati2) * sin(dist_long/2)**2
        c = 2 * asin(sqrt(a))*6371
        travel_dist.append(c)
       
    return travel_dist

In [14]:
df['Distance']=distance_transform(df['pickup_longitude'].to_numpy(),
                                                df['pickup_latitude'].to_numpy(),
                                                df['dropoff_longitude'].to_numpy(),
                                                df['dropoff_latitude'].to_numpy())
df['Distance'] = df['Distance'].round(3)

In [15]:
cat = df.select_dtypes(include='object')
num = df.select_dtypes(exclude='object')

print('\n\033[1mInference:\033[0m The Datset has {} numerical & {} categorical features.'.format(len(num.columns),len(cat.columns)))

***Categorical Variables Dataframe:***

Unnamed: 0,Monthly_Quarter,Hourly_Segments
0,Q2,H5
1,Q3,H6
2,Q3,H6
3,Q2,H3
4,Q3,H5


***Numeric Variables Dataframe:***

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,Distance
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,3,1.683
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,2009,4,2.458
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,2009,0,5.036
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,4,1.662
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,3,4.475


## Encoding Categorical Features

In [17]:
Hour_encoder = LabelEncoder()
Month_encoder = LabelEncoder()

df.Monthly_Quarter = Month_encoder.fit_transform(df.Monthly_Quarter)
df.Hourly_Segments = Hour_encoder.fit_transform(df.Hourly_Segments)

Hour_Segments_decoded = Hour_encoder.inverse_transform(df.Hourly_Segments)
print("Hour Segments Decoded Categories:", Hour_Segments_decoded)

Month_Segments_decoded = Month_encoder.inverse_transform(df.Monthly_Quarter)
print("Decoded Categories:", Month_Segments_decoded)

Hour Segments Decoded Categories: ['H5' 'H6' 'H6' ... 'H1' 'H4' 'H2']
Decoded Categories: ['Q2' 'Q3' 'Q3' ... 'Q2' 'Q2' 'Q2']


In [20]:
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,Monthly_Quarter,Hourly_Segments,Distance
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,3,1,4,1.683
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,2009,4,2,5,2.458
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,2009,0,2,5,5.036
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,4,1,2,1.662
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,3,2,4,4.475


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199987 entries, 0 to 199999
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        199987 non-null  float64
 1   pickup_longitude   199987 non-null  float64
 2   pickup_latitude    199987 non-null  float64
 3   dropoff_longitude  199987 non-null  float64
 4   dropoff_latitude   199987 non-null  float64
 5   passenger_count    199987 non-null  int64  
 6   year               199987 non-null  int64  
 7   weekday            199987 non-null  int64  
 8   Monthly_Quarter    199987 non-null  int64  
 9   Hourly_Segments    199987 non-null  int64  
 10  Distance           199987 non-null  float64
dtypes: float64(6), int64(5)
memory usage: 18.3 MB


In [22]:
counter = 0
rs,cs = df.shape

df.drop_duplicates(inplace=True)
df.drop(['pickup_latitude','pickup_longitude',
         'dropoff_latitude','dropoff_longitude'],axis=1)

if df.shape==(rs,cs):
    print('\n\033[1mInference:\033[0m The dataset doesn\'t have any duplicates')
else:
    print(f'\n\033[1mInference:\033[0m Number of duplicates dropped/fixed ---> {rs-df.shape[0]}')


[1mInference:[0m Number of duplicates dropped/fixed ---> 109


## Outliers Removal 

In [25]:
df[df.Distance > 50]

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,Monthly_Quarter,Hourly_Segments,Distance
346,15.5,0.000000,0.000000,-73.979805,40.786030,1,2015,3,0,4,8666.398
1067,52.0,-73.781095,40.645015,0.000000,0.000000,1,2014,6,0,5,8647.036
1526,2.5,-74.001849,40.715156,0.000000,0.000000,3,2014,0,1,3,8666.772
1945,7.0,-0.131667,40.757063,-73.991593,40.749953,1,2013,6,0,4,6021.114
2167,5.7,-1.216667,40.748597,-74.004822,40.734670,1,2012,5,2,3,5941.060
...,...,...,...,...,...,...,...,...,...,...,...
197863,7.0,-73.962190,40.759158,0.000000,0.000000,1,2014,1,3,5,8664.389
198567,23.5,-73.968115,40.801455,0.000000,0.000000,2,2013,0,3,0,8665.747
198665,20.1,-0.116667,40.729775,0.000000,0.000000,5,2012,1,1,5,4528.960
199403,7.0,-67.370360,39.999790,-73.971058,40.753000,1,2013,6,0,4,565.235


In [26]:
df.Distance.max()

8782.899

In [27]:
df.fare_amount.max()

499.0

In [30]:
len(df[df.Distance < 0.1])

6694

## Observations 

- Fare Prices in negatives and zeroes doesn't exist and 
- Distance less than 100 meters is not often used for a uber ride
- also with distance greater than 50 km is preferred to be Travelled by other means

In [31]:
df.drop(df[df.Distance > 50].index, inplace=True)
df.drop(df[df.Distance < 0.1].index, inplace=True)
df.drop(df[df.fare_amount <= 0].index, inplace=True)

In [32]:
df.shape

(192683, 11)

In [33]:
original_df = df.copy()

In [36]:
features = ['fare_amount', 'Distance'] 

for i in features:
    Q1 = df[i].quantile(0.25)
    Q3 = df[i].quantile(0.75)
    IQR = Q3 - Q1
    df = df[df[i] <= (Q3+(1.5*IQR))]
    df = df[df[i] >= (Q1-(1.5*IQR))]
    df = df.reset_index(drop=True)
display(df.head())
print('\n\033[1mInference:\033[0m\nBefore removal of outliers, The dataset had {} samples.'.format(original_df.shape[0]))
print('After removal of outliers, The dataset now has {} samples.'.format(df.shape[0]))


Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,Monthly_Quarter,Hourly_Segments,Distance
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,3,1,4,1.683
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,2009,4,2,5,2.458
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,2009,0,2,5,5.036
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,4,1,2,1.662
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,3,2,4,4.475



[1mInference:[0m
Before removal of outliers, The dataset had 192683 samples.
After removal of outliers, The dataset now has 168614 samples.


## Feature Selection

#### 1. Pearson's Correaltion feature Selection

- for (Numerical Input, Numerical Output) we use Pearson's Correaltion feature Selection Regression Feature Selection


In [38]:
features =  ['passenger_count', 'year',
             'weekday', 'Monthly_Quarter', 
             'Hourly_Segments', 'Distance']

target = 'fare_amount'

In [39]:
X, y = df[features], df[target]

In [40]:
selector = SelectKBest(score_func=f_regression, k=6)
X_new = selector.fit_transform(X, y)

selected_feature_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]

print("Selected features:\n")
print('\n'.join(map(str, selected_features)))


Selected features:

passenger_count
year
weekday
Monthly_Quarter
Hourly_Segments
Distance


### 2. Spearman's Correlation 

In [41]:
spearman_matrix = df.corr(method='spearman')
spearman_matrix

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,Monthly_Quarter,Hourly_Segments,Distance
fare_amount,1.0,-0.046834,-0.071199,-0.004209,-0.084333,0.023038,0.161716,0.011581,0.029703,0.013326,0.821086
pickup_longitude,-0.046834,1.0,0.672371,0.468595,0.406693,-0.020396,0.00055,-0.032446,-0.003389,-0.00629,-0.030582
pickup_latitude,-0.071199,0.672371,1.0,0.378644,0.597646,-0.01626,-0.003977,-0.056809,0.001528,0.014831,-0.050278
dropoff_longitude,-0.004209,0.468595,0.378644,1.0,0.621389,-0.020572,0.005963,-0.030866,-0.004523,-0.019282,0.03406
dropoff_latitude,-0.084333,0.406693,0.597646,0.621389,1.0,-0.014256,-0.001901,-0.05042,-0.000327,0.006539,-0.039207
passenger_count,0.023038,-0.020396,-0.01626,-0.020572,-0.014256,1.0,-0.014826,0.059133,0.008727,0.038809,0.010734
year,0.161716,0.00055,-0.003977,0.005963,-0.001901,-0.014826,1.0,0.006063,-0.111685,0.002263,-0.002844
weekday,0.011581,-0.032446,-0.056809,-0.030866,-0.05042,0.059133,0.006063,1.0,-0.009851,-0.062549,0.036143
Monthly_Quarter,0.029703,-0.003389,0.001528,-0.004523,-0.000327,0.008727,-0.111685,-0.009851,1.0,-0.002566,0.009963
Hourly_Segments,0.013326,-0.00629,0.014831,-0.019282,0.006539,0.038809,0.002263,-0.062549,-0.002566,1.0,0.010823


In [42]:
abs_corr_values = spearman_matrix.abs()
top_features = abs_corr_values.mean().sort_values().head(6)
top_features

Hourly_Segments    0.107025
Monthly_Quarter    0.107478
passenger_count    0.111523
year               0.119253
weekday            0.123260
Distance           0.185975
dtype: float64

### 1. Data Splitting (train_test_split), Data Scaling and Model Training

In [44]:
train, test = df[df.year < 2015], df[df.year == 2015]

X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

In [45]:
scalar = StandardScaler()

X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

X_train.shape

(157063, 6)

In [46]:
y_train.shape

(157063,)

In [47]:
split_models = {
    'Random_Forest_1': RandomForestRegressor(),
    'Decision_tree_1': DecisionTreeRegressor(),
    'XGBoost_1': XGBRegressor(),
}

In [48]:
def eval_model(y_true, y_pred):
    
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    return mae, mse, r2

In [49]:
train_test_results = {}

for model_name, model in tqdm(split_models.items(), desc='Training Models'):
    model.fit(X_train, y_train.ravel())

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    trained_data_mae, trained_data_mse, trained_data_r2 = eval_model(y_train, y_train_pred)
    test_data_mae, test_data_mse, test_data_r2 = eval_model(y_test, y_test_pred)

    train_test_results[model_name] = {
        'Train': {
            'Train_MAE': trained_data_mae,
            'Train_R2': trained_data_r2,
            'Train_mse': trained_data_mse
        },
        'Test': {
            'Test_MAE': test_data_mae,
            'Test_R2': test_data_r2,
            'Test_mse': test_data_mse
        }
    }

Training Models: 100%|██████████| 7/7 [01:17<00:00, 11.08s/it]


In [50]:
print('='*80)
print('{:<25s} {:<8s} {:<8s} {:<8s} {:<8s}'.format('Model', 'Train_MSE', 'Test_MSE', 'Train_R2', 'Test_R2'))
print('='*80)
for model_name, model_results in train_test_results.items():
    print('{:<25s} {:<9.3f} {:<9.3f} {:<9.3f} {:<9.3f}'.format(
        model_name,
        model_results['Train']['Train_mse'],
        model_results['Test']['Test_mse'],
        model_results['Train']['Train_R2'],
        model_results['Test']['Test_R2'],
    ))
print('='*80)

Model                     Train_MSE Test_MSE Train_R2 Test_R2 
Random_Forest_1           0.652     5.298     0.952     0.686    
Decision_tree_1           0.037     9.575     0.997     0.432    
Linear_Regression_1       4.432     5.514     0.671     0.673    
XGBoost_1                 3.589     4.647     0.733     0.724    
Lasso_1                   5.856     8.601     0.565     0.490    
Gradient_Boost_1          3.908     4.712     0.710     0.721    
High_Gradient_Boost_1     3.800     4.594     0.718     0.728    
