## Library Imports


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from math import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor 
from sklearn.model_selection import GridSearchCV
from statsmodels.tools.eval_measures import rmse
from sklearn import preprocessing
import statsmodels.api as sm

## Dataset Load

In [None]:
data = pd.read_csv('../data/nyc.csv')

In [3]:
df = data.copy()
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [4]:
df.drop(['Unnamed: 0', 'key'], axis = 1, inplace=True)

In [5]:
df.shape

(200000, 7)

In [6]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        200000 non-null  float64
 1   pickup_datetime    200000 non-null  object 
 2   pickup_longitude   200000 non-null  float64
 3   pickup_latitude    200000 non-null  float64
 4   dropoff_longitude  199999 non-null  float64
 5   dropoff_latitude   199999 non-null  float64
 6   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 10.7+ MB


In [8]:
df.isnull().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [9]:
print(df[df['dropoff_longitude'].isna()])
df.dropna(inplace=True)

       fare_amount          pickup_datetime  pickup_longitude  \
87946         24.1  2013-07-02 03:51:57 UTC        -73.950581   

       pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  
87946        40.779692                NaN               NaN                0  


## Data Preprocessing

In [10]:
df['pickup_datetime']

0         2015-05-07 19:52:06 UTC
1         2009-07-17 20:04:56 UTC
2         2009-08-24 21:45:00 UTC
3         2009-06-26 08:22:21 UTC
4         2014-08-28 17:47:00 UTC
                   ...           
199995    2012-10-28 10:49:00 UTC
199996    2014-03-14 01:09:00 UTC
199997    2009-06-29 00:42:00 UTC
199998    2015-05-20 14:56:25 UTC
199999    2010-05-15 04:08:00 UTC
Name: pickup_datetime, Length: 199999, dtype: object

In [11]:
df.pickup_datetime = pd.to_datetime(df.pickup_datetime, errors='coerce') 
df['pickup_datetime']

0        2015-05-07 19:52:06+00:00
1        2009-07-17 20:04:56+00:00
2        2009-08-24 21:45:00+00:00
3        2009-06-26 08:22:21+00:00
4        2014-08-28 17:47:00+00:00
                    ...           
199995   2012-10-28 10:49:00+00:00
199996   2014-03-14 01:09:00+00:00
199997   2009-06-29 00:42:00+00:00
199998   2015-05-20 14:56:25+00:00
199999   2010-05-15 04:08:00+00:00
Name: pickup_datetime, Length: 199999, dtype: datetime64[ns, UTC]

In [12]:
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [13]:
df.isna().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199999 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   fare_amount        199999 non-null  float64            
 1   pickup_datetime    199999 non-null  datetime64[ns, UTC]
 2   pickup_longitude   199999 non-null  float64            
 3   pickup_latitude    199999 non-null  float64            
 4   dropoff_longitude  199999 non-null  float64            
 5   dropoff_latitude   199999 non-null  float64            
 6   passenger_count    199999 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(1)
memory usage: 12.2 MB


In [15]:
df['pickup_datetime'] =  pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')

df['year'] = df['pickup_datetime'].dt.year
df['Month'] = df['pickup_datetime'].dt.month
df['Date'] = df['pickup_datetime'].dt.day
df['Day'] = df['pickup_datetime'].dt.dayofweek
df['Hour'] = df['pickup_datetime'].dt.hour
df['Minute'] = df['pickup_datetime'].dt.minute

In [16]:
df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,Month,Date,Day,Hour,Minute
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,3,19,52
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1,2009,7,17,4,20,4
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1,2009,8,24,0,21,45
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,4,8,22
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,3,17,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,3.0,2012-10-28 10:49:00+00:00,-73.987042,40.739367,-73.986525,40.740297,1,2012,10,28,6,10,49
199996,7.5,2014-03-14 01:09:00+00:00,-73.984722,40.736837,-74.006672,40.739620,1,2014,3,14,4,1,9
199997,30.9,2009-06-29 00:42:00+00:00,-73.986017,40.756487,-73.858957,40.692588,2,2009,6,29,0,0,42
199998,14.5,2015-05-20 14:56:25+00:00,-73.997124,40.725452,-73.983215,40.695415,1,2015,5,20,2,14,56


### Heversine Formula


In [17]:
def distance_transform(longitude1, latitude1, longitude2, latitude2):
    travel_dist = []
    
    for pos in range(len(longitude1)):
        long1,lati1,long2,lati2 = map(radians,[longitude1[pos],latitude1[pos],longitude2[pos],latitude2[pos]])
        dist_long = long2 - long1
        dist_lati = lati2 - lati1
        a = sin(dist_lati/2)**2 + cos(lati1) * cos(lati2) * sin(dist_long/2)**2
        c = 2 * asin(sqrt(a))*6371
        travel_dist.append(c)
       
    return travel_dist

In [18]:
df['dist_travel_km'] = distance_transform(df['pickup_longitude'].to_numpy(),df['pickup_latitude'].to_numpy(),df['dropoff_longitude'].to_numpy(),df['dropoff_latitude'].to_numpy())

In [19]:
df

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,Month,Date,Day,Hour,Minute,dist_travel_km
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,3,19,52,1.683323
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1,2009,7,17,4,20,4,2.457590
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1,2009,8,24,0,21,45,5.036377
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,4,8,22,1.661683
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,3,17,47,4.475450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,3.0,2012-10-28 10:49:00+00:00,-73.987042,40.739367,-73.986525,40.740297,1,2012,10,28,6,10,49,0.112210
199996,7.5,2014-03-14 01:09:00+00:00,-73.984722,40.736837,-74.006672,40.739620,1,2014,3,14,4,1,9,1.875050
199997,30.9,2009-06-29 00:42:00+00:00,-73.986017,40.756487,-73.858957,40.692588,2,2009,6,29,0,0,42,12.850319
199998,14.5,2015-05-20 14:56:25+00:00,-73.997124,40.725452,-73.983215,40.695415,1,2015,5,20,2,14,56,3.539715


In [20]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fare_amount,199999.0,11.359892,9.90176,-52.0,6.0,8.5,12.5,499.0
pickup_longitude,199999.0,-72.527631,11.437815,-1340.64841,-73.992065,-73.981823,-73.967154,57.418457
pickup_latitude,199999.0,39.935881,7.720558,-74.015515,40.734796,40.752592,40.767158,1644.421482
dropoff_longitude,199999.0,-72.525292,13.117408,-3356.6663,-73.991407,-73.980093,-73.963658,1153.572603
dropoff_latitude,199999.0,39.92389,6.794829,-881.985513,40.733823,40.753042,40.768001,872.697628
passenger_count,199999.0,1.684543,1.385995,0.0,1.0,1.0,2.0,208.0
year,199999.0,2011.742434,1.8564,2009.0,2010.0,2012.0,2013.0,2015.0
Month,199999.0,6.281791,3.438933,1.0,3.0,6.0,9.0,12.0
Date,199999.0,15.704739,8.687377,1.0,8.0,16.0,23.0,31.0
Day,199999.0,3.048435,1.946946,0.0,1.0,3.0,5.0,6.0


In [21]:
df.columns[df.dtypes == 'object']

Index([], dtype='object')

In [22]:
df = df.loc[(df.fare_amount >= 0)]

In [23]:
df.shape

(199982, 14)

In [24]:

medianFiller = lambda x : x.fillna(x.median())
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()
# remove passenger counts, fare amount, 
cols_to_remove = ['Date', 'passenger_count', 'year', 'Month', 'Day', 'Hour', 'Minute']
numeric_columns = [col for col in numeric_columns if col not in cols_to_remove]
df[numeric_columns] = df[numeric_columns].apply(medianFiller, axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = df[numeric_columns].apply(medianFiller, axis = 0)


In [25]:
numeric_columns

['fare_amount',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'dist_travel_km']

In [26]:
# plt.figure(figsize =(20, 30))

# for i , variable in enumerate(df.select_dtypes(include=np.number).columns.tolist()):
#     plt.subplot(6 , 4, i + 1)
#     plt.boxplot(df[variable], whis=1.5)
#     plt.tight_layout()
#     plt.title(variable)
    
# plt.show()

In [27]:
df.loc[df['passenger_count'] > 6,'passenger_count'] = np.nan
df['passenger_count'] = df['passenger_count'].fillna(6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['passenger_count'] = df['passenger_count'].fillna(6)


In [28]:
def remove_outlier(df1 , col):
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_whisker = Q1-1.5*IQR
    upper_whisker = Q3+1.5*IQR
    df[col] = np.clip(df1[col] , lower_whisker , upper_whisker)
    return df1

def treat_outliers_all(df1 , col_list):
    for c in col_list:
        df1 = remove_outlier(df , c)
    return df1

In [29]:
df = treat_outliers_all(df, numeric_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = np.clip(df1[col] , lower_whisker , upper_whisker)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = np.clip(df1[col] , lower_whisker , upper_whisker)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = np.clip(df1[col] , lower_whisker , upper_whisker)
A value is trying to be s

In [30]:
df.shape

(199982, 14)

In [31]:
# plt.figure(figsize =(20, 30))

# for i , variable in enumerate(df.select_dtypes(include=np.number).columns.tolist()):
#     plt.subplot(6 , 4, i +1)
#     plt.boxplot(df[variable], whis=1.5)
#     plt.tight_layout()
#     plt.title(variable)
    
# plt.show()

In [32]:
df = df.loc[(df.dist_travel_km >= 1) | (df.dist_travel_km <= 130)]

In [33]:
df.shape

(199982, 14)

In [34]:
incorrect_coordinates = df.loc[(df.pickup_latitude > 90) |(df.pickup_latitude < -90) |
(df.dropoff_latitude > 90) |(df.dropoff_latitude < -90) | (df.pickup_longitude > 180) |(df.pickup_longitude < -180) | (df.dropoff_longitude > 90) |(df.dropoff_longitude < -90)]
     

In [35]:
df.drop(incorrect_coordinates, inplace = True, errors = "ignore")

In [36]:
df.drop(['pickup_datetime'], inplace = True, axis = 1)

In [37]:
# correlation matrix
# corr = df.corr()
# corr

In [38]:
df.columns

Index(['fare_amount', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year',
       'Month', 'Date', 'Day', 'Hour', 'Minute', 'dist_travel_km'],
      dtype='object')

In [39]:
# plt.figure(figsize=(40, 30)) 
# mask = np.zeros_like(corr)
# mask[np.tril_indices_from(mask, k = -1)] = True
# sns.heatmap(corr, cmap = 'RdYlGn', vmax = 1.0, vmin = -1.0, annot = True, annot_kws = {"size": 20}, mask = mask)
# plt.xticks(fontsize = 15)
# plt.yticks(fontsize = 18)
# plt.show()

In [40]:
y = pd.DataFrame(df['fare_amount'])
x = df.drop('fare_amount', axis =1)

In [41]:

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [42]:
x_train.shape

(149986, 12)

In [43]:

linreg_full = sm.OLS(y_train, x_train).fit()
print(linreg_full.summary())
linreg_full_prediction = linreg_full.predict(x_test)
linreg_full_prediction

                                 OLS Regression Results                                
Dep. Variable:            fare_amount   R-squared (uncentered):                   0.941
Model:                            OLS   Adj. R-squared (uncentered):              0.941
Method:                 Least Squares   F-statistic:                          1.988e+05
Date:                Thu, 10 Apr 2025   Prob (F-statistic):                        0.00
Time:                        15:47:14   Log-Likelihood:                     -3.6668e+05
No. Observations:              149986   AIC:                                  7.334e+05
Df Residuals:                  149974   BIC:                                  7.335e+05
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                        coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------

50556     19.853547
71417      9.756571
180273     7.326635
89727      7.064263
148308     7.554710
            ...    
186420    11.815688
42641      8.359770
110052     9.643809
107233     5.533075
140757     8.489772
Length: 49996, dtype: float64

In [44]:

linreg_full_rmse = rmse(y_test['fare_amount'], linreg_full_prediction)
linreg_full_rsquared = linreg_full.rsquared
linreg_full_rsquared_adj = linreg_full.rsquared_adj

In [45]:
data = {'Model': [], 'RMSE' : [], 'R-Squared': [], 'Adj. R-Squared' : []}
model = pd.DataFrame(data)
model.loc[len(model.index)] = ["Linreg full Model", linreg_full_rmse, linreg_full_rsquared, linreg_full_rsquared_adj]
model

Unnamed: 0,Model,RMSE,R-Squared,Adj. R-Squared
0,Linreg full Model,2.735173,0.940855,0.940851


In [46]:
def model_metrics(model_name, models, x_test, y_test):
    y_pred = models.predict(x_test)
    r_squared_DT = models.score(x_test, y_test)
    n = y_test.shape[0]
    p = x_test.shape[1]
    Adj_r_squared_DT = 1 - (1-r_squared_DT)*(n-1)/(n-p-1) 
    rmse_DT = sqrt(metrics.mean_squared_error(y_test, y_pred))
    model.loc[len(model.index)] = [model_name, rmse_DT, r_squared_DT, Adj_r_squared_DT]

In [47]:
decision_tree = DecisionTreeRegressor(criterion = 'friedman_mse', random_state = 10)  
decision_tree_model = decision_tree.fit(x_train, y_train)
model_metrics("Decision Tree", decision_tree_model, x_test, y_test)

In [48]:
prune = DecisionTreeRegressor(max_depth = 10, max_leaf_nodes = 32 , random_state = 10) 
decision_tree_prune = prune.fit(x_train, y_train)

model_metrics("Decision Tree Prune", decision_tree_prune, x_test, y_test)

In [49]:
gbr = GradientBoostingRegressor(loss='absolute_error', learning_rate=0.1, n_estimators=1000, max_depth = 1, random_state = 42, max_features = 8)
gbr.fit(x_train,  y_train.values.ravel())
model_metrics("Gradient Boosting", gbr, x_test, y_test)

In [50]:

def train_fare_model(x_train, y_train):
    # Base models
    lr = LinearRegression()
    rr = Ridge()
    mlp = MLPRegressor()
    gbr = GradientBoostingRegressor(random_state=42)

    # Grid search for each
    models = [
        ('lr', GridSearchCV(lr, {'fit_intercept': [True]}, scoring='neg_root_mean_squared_error', cv=5)),
        ('rr', GridSearchCV(rr, {'alpha': [0.2], 'fit_intercept': [True]}, scoring='neg_root_mean_squared_error', cv=5)),
        ('mlp', GridSearchCV(mlp, {'hidden_layer_sizes': [(30,)], 'max_iter': [300]}, scoring='neg_root_mean_squared_error', cv=5)),
        # ('gbr', GridSearchCV(
        #     gbr,
        #     {'n_estimators': [1000], 'learning_rate': [0.1], 'max_depth': [1], 'max_features': [8]},
        #     scoring='neg_root_mean_squared_error',
        #     cv=3
        # ))
    ]
    # Train each model
    for name, model in models:
        model.fit(x_train, y_train)

    # Create ensemble from best estimators
    ensemble = VotingRegressor(estimators=[(name, model.best_estimator_) for name, model in models])
    ensemble.fit(x_train, y_train)

    return ensemble

In [51]:
ensemble = train_fare_model(x_train, y_train)
model_metrics("Ensemble Model", ensemble, x_test, y_test)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [52]:
model

Unnamed: 0,Model,RMSE,R-Squared,Adj. R-Squared
0,Linreg full Model,2.735173,0.940855,0.940851
1,Decision Tree,3.431066,0.598862,0.598766
2,Decision Tree Prune,2.577265,0.773664,0.77361
3,Gradient Boosting,2.57764,0.773598,0.773544
4,Ensemble Model,2.713494,0.749104,0.749044


In [53]:
import pickle 
pickle.dump(linreg_full, open('linreg.pkl', 'wb')) 
pickle.dump(decision_tree_model, open('decision_tree.pkl', 'wb'))
pickle.dump(decision_tree_prune, open('decision_tree_prune.pkl', 'wb')) 
pickle.dump(gbr, open('gbr.pkl', 'wb'))
pickle.dump(ensemble, open('ensemble.pkl', 'wb'))