In [55]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import joblib

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer #For creating Full Pipelines

# Train and Test Models on the Training Set
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("../data/train.csv")

In [4]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


<h1>Getting distance from Cordinates</h1>

In [6]:
locs = pd.DataFrame({
    'lat1':df['pickup_longitude'],
    'lon1':df['pickup_latitude'],
    'lat2':df['dropoff_longitude'],
    'lon2':df['pickup_latitude']
})

In [7]:

def haversine_vector(lat1, lon1, lat2, lon2, radius=6371):
    """
    Calculate the great-circle distance between two sets of coordinates
    using the Haversine formula (vectorized).

    Parameters
    ----------
    lat1, lon1 : array-like
        Latitudes & longitudes of the first set of points (in degrees).
    lat2, lon2 : array-like
        Latitudes & longitudes of the second set of points (in degrees).
    radius : float
        Earth radius in kilometers (default 6371 km). Use 3956 for miles.

    Returns
    -------
    numpy.ndarray
        Distances in kilometers (same shape as input arrays).
    """
    # Convert degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    # Compute differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

In [8]:
df['distance_km'] = haversine_vector(locs.lat1, locs.lon1, locs.lat2, locs.lon2)

<h1>For pickup date and time</h1>

In [10]:
def add_day_and_duration(data,pic_time):
    data[pic_time] = pd.to_datetime(data[pic_time])
    data['week_day'] = data[pic_time].dt.day_name()
    return data

In [11]:
df = add_day_and_duration(df, 'pickup_datetime')

In [24]:
# Basic features for tree models
df['hour'] = df['pickup_datetime'].dt.hour
df['minute'] = df['pickup_datetime'].dt.minute

In [25]:

df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)


In [26]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance_km,week_day,hour_sin,hour_cos,minute_sin,minute_cos,hour,minute
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.94866,Monday,-0.965926,-0.258819,0.5877853,-0.809017,17,24
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,2.120027,Sunday,0.0,1.0,-0.9781476,-0.207912,0,43
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,2.925111,Tuesday,0.258819,-0.965926,-0.5,-0.866025,11,35
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,0.247718,Wednesday,-0.965926,0.258819,-0.2079117,-0.978148,19,32
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,0.014422,Saturday,-0.258819,-0.965926,5.665539e-16,-1.0,13,30


<h1>Extracting the id from id column and convert it in int</h1>

In [27]:
df['id'] = df['id'].str.extract("(\d+)").astype(int)

  df['id'] = df['id'].str.extract("(\d+)").astype(int)


In [28]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance_km,week_day,hour_sin,hour_cos,minute_sin,minute_cos,hour,minute
0,2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.94866,Monday,-0.965926,-0.258819,0.5877853,-0.809017,17,24
1,2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,2.120027,Sunday,0.0,1.0,-0.9781476,-0.207912,0,43
2,3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,2.925111,Tuesday,0.258819,-0.965926,-0.5,-0.866025,11,35
3,3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,0.247718,Wednesday,-0.965926,0.258819,-0.2079117,-0.978148,19,32
4,2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,0.014422,Saturday,-0.258819,-0.965926,5.665539e-16,-1.0,13,30


In [29]:
df.drop(["pickup_datetime","dropoff_datetime", "store_and_fwd_flag" ],axis=1,inplace=True)
df.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,distance_km,week_day,hour_sin,hour_cos,minute_sin,minute_cos,hour,minute
0,2875421,2,1,-73.982155,40.767937,-73.96463,40.765602,455,1.94866,Monday,-0.965926,-0.258819,0.5877853,-0.809017,17,24
1,2377394,1,1,-73.980415,40.738564,-73.999481,40.731152,663,2.120027,Sunday,0.0,1.0,-0.9781476,-0.207912,0,43
2,3858529,2,1,-73.979027,40.763939,-74.005333,40.710087,2124,2.925111,Tuesday,0.258819,-0.965926,-0.5,-0.866025,11,35
3,3504673,2,1,-74.01004,40.719971,-74.012268,40.706718,429,0.247718,Wednesday,-0.965926,0.258819,-0.2079117,-0.978148,19,32
4,2181028,2,1,-73.973053,40.793209,-73.972923,40.78252,435,0.014422,Saturday,-0.258819,-0.965926,5.665539e-16,-1.0,13,30


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 16 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   id                 1458644 non-null  int64  
 1   vendor_id          1458644 non-null  int64  
 2   passenger_count    1458644 non-null  int64  
 3   pickup_longitude   1458644 non-null  float64
 4   pickup_latitude    1458644 non-null  float64
 5   dropoff_longitude  1458644 non-null  float64
 6   dropoff_latitude   1458644 non-null  float64
 7   trip_duration      1458644 non-null  int64  
 8   distance_km        1458644 non-null  float64
 9   week_day           1458644 non-null  object 
 10  hour_sin           1458644 non-null  float64
 11  hour_cos           1458644 non-null  float64
 12  minute_sin         1458644 non-null  float64
 13  minute_cos         1458644 non-null  float64
 14  hour               1458644 non-null  int32  
 15  minute             1458644 non-n

<h1>Spliting the Dataset into Train Test </h1>

In [41]:

X = df.drop(columns=['trip_duration'])  
y = df['trip_duration']

# Split into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.4,       # 20% test set
    random_state=42,     # ensures reproducibility
    shuffle=True         # shuffles before splitting (recommended)
)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (875186, 15)
X_test shape: (583458, 15)
y_train shape: (875186,)
y_test shape: (583458,)


<h1>Handling the Categorica; data for both test and train </h1>

In [43]:
# Seperating Numerical and categorical Attributes from training data
train_num_attribs = X_train.drop("week_day", axis=1)
train_cat_attribs = X_train[["week_day"]]


# Seperating Numerical and categorical Attributes from testing  data
test_num_attribs = X_test.drop("week_day", axis=1)
test_cat_attribs = X_test[["week_day"]]

In [None]:
def build_pipeline(num_attribs, cat_attribs):
    # NUmerical pipeline
    num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    #Categorical pipeline
    cat_pipeline = Pipeline([
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    # Full Pipeline 
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs.columns),
        ("cat", cat_pipeline, cat_attribs.columns)
    ])

    return full_pipeline

# Applying the full pipeline in train and test 
train_pipeline = build_pipeline(train_num_attribs,train_cat_attribs)
# test_pipeline = build_pipeline(test_num_attribs,test_cat_attribs)  ❌ Not fitted! It’s a new pipeline

In [51]:
# Example: Fit and transform training data
X_train_prepared = train_pipeline.fit_transform(X_train)
X_test_prepared = train_pipeline.transform(X_test)

X_test_prepared.shape
X_train_prepared.shape

(583458, 21)

<h1>Training the models</h1>

In [52]:
#Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)


In [53]:
# Decission Tree
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train_prepared, y_train)

In [54]:
# Random Forest
forest_reg = RandomForestRegressor(
    n_estimators=50,      # default 100
    max_depth=15,         # limit depth
    max_features='sqrt',  # use fewer features per split
    n_jobs=-1,
    random_state=42)
forest_reg.fit(X_train_prepared, y_train)

In [56]:
# Calculate RMSE
lin_rmse = -cross_val_score(
    lin_reg,
    X_train_prepared,
    y_train,
    scoring="neg_root_mean_squared_error",
    cv=10)

print("Linear Regrosser CV RMSEs:", lin_rmse)
print("\nCross-Validation Performance (Linear Regression):")
print(pd.Series(lin_rmse).describe())

Linear Regrosser CV RMSEs: [ 3195.15024263  3329.13075843  8188.98871792  3159.76217151
  3197.76262471  3229.36014552  3242.7444462  14123.08486114
  2960.61984701  7207.28324273]

Cross-Validation Performance (Linear Regression):
count       10.000000
mean      5183.388706
std       3666.698743
min       2960.619847
25%       3195.803338
50%       3236.052296
75%       6237.745122
max      14123.084861
dtype: float64


In [57]:
tree_rmse = -cross_val_score(
    tree_reg,
    X_train_prepared,
    y_train,
    scoring="neg_root_mean_squared_error",
    cv=10)
print("Decision Tree CV RMSEs:", tree_rmse)
print("\nCross-Validation Performance (Decision Tree):")
print(pd.Series(tree_rmse).describe())

Decision Tree CV RMSEs: [ 8060.54787714  8468.1026306   8923.83708995  4895.75538359
  4756.36536256  4876.73507417  4912.18693882 14623.52716061
  8016.59357744 17875.09778805]

Cross-Validation Performance (Decision Tree):
count       10.000000
mean      8540.874888
std       4459.529397
min       4756.365363
25%       4899.863272
50%       8038.570727
75%       8809.903475
max      17875.097788
dtype: float64


In [58]:
forest_rmse = -cross_val_score(
    forest_reg,
    X_train_prepared,
    y_train,
    scoring="neg_root_mean_squared_error",
    cv=10)

print("Random Forest CV RMSEs:", forest_rmse)
print("\nCross-Validation Performance (Random Forest):")
print(pd.Series(forest_rmse).describe())

Random Forest CV RMSEs: [ 3677.84133967  3404.8446617   8212.05576524  3294.20777106
  3377.47438809  3304.55586784  3298.1671851  14139.300223
  3112.59096871  7249.45735897]

Cross-Validation Performance (Random Forest):
count       10.000000
mean      5307.049553
std       3607.544472
min       3112.590969
25%       3299.764356
50%       3391.159525
75%       6356.553354
max      14139.300223
dtype: float64


In [60]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²: {r2:.2f}")
    return mae, rmse, r2

print("Linear Regression Performance:")
evaluate_model(lin_reg,X_test_prepared, y_test)

print("\nRandom Forest Performance:")
evaluate_model(forest_reg, X_test_prepared, y_test)


Linear Regression Performance:
MAE: 488.08
RMSE: 3089.13
R²: 0.02

Random Forest Performance:
MAE: 443.62
RMSE: 3188.57
R²: -0.04


(443.6205939634009, np.float64(3188.569979787243), -0.04334147586411485)