<a href="https://colab.research.google.com/github/VasanthPrakasam/Project--TripFare-Predicting-Urban-Taxi-Fare-with-Machine-Learning/blob/main/Feature_Engineering_TripFare_Predicting_Urban_TF_with_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Package**

In [None]:
pip install kagglehub

Note: you may need to restart the kernel to use updated packages.


## Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import SelectKBest, f_regression, chi2

# Statistical analysis
from scipy import stats
from scipy.stats import zscore

# Distance calculation
from math import radians, cos, sin, asin, sqrt

# Model persistence
import pickle

# Streamlit (for deployment)
import streamlit as st

print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


# Load and clean

In [None]:
import kagglehub
import warnings
warnings.filterwarnings("ignore")

# Download latest version
path = kagglehub.dataset_download("ivasanthp/tripfare-predicting-urban-taxi-fare")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\ivasa\.cache\kagglehub\datasets\ivasanthp\tripfare-predicting-urban-taxi-fare\versions\1


In [None]:
import os
os.listdir(path)

['taxi_fare.csv']

In [None]:
import pandas as pd
data = pd.read_csv(os.path.join(path, 'taxi_fare.csv'))
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-03-01 00:00:00,2016-03-01 00:07:55,1,-73.976746,40.765152,1,N,-74.004265,40.746128,1,9.0,0.5,0.5,2.05,0.0,0.3,12.35
1,1,2016-03-01 00:00:00,2016-03-01 00:11:06,1,-73.983482,40.767925,1,N,-74.005943,40.733166,1,11.0,0.5,0.5,3.05,0.0,0.3,15.35
2,2,2016-03-01 00:00:00,2016-03-01 00:31:06,2,-73.782021,40.64481,1,N,-73.974541,40.67577,1,54.5,0.5,0.5,8.0,0.0,0.3,63.8
3,2,2016-03-01 00:00:00,2016-03-01 00:00:00,3,-73.863419,40.769814,1,N,-73.96965,40.757767,1,31.5,0.0,0.5,3.78,5.54,0.3,41.62
4,2,2016-03-01 00:00:00,2016-03-01 00:00:00,5,-73.971741,40.792183,3,N,-74.17717,40.695053,1,98.0,0.0,0.0,0.0,15.5,0.3,113.8


## Let's look at the basic information of the dataset

#### Basic statistics

In [None]:
print("📊 BASIC STATISTICS:")
print(data.describe().T)

📊 BASIC STATISTICS:
                          count       mean        std         min        25%  \
VendorID               212345.0   1.708338   0.454529    1.000000   1.000000   
passenger_count        212345.0   1.766493   1.469647    0.000000   1.000000   
pickup_longitude       212345.0 -73.112122   7.936633 -121.933327 -73.990891   
pickup_latitude        212345.0  40.279405   4.370714    0.000000  40.740292   
RatecodeID             212345.0   1.034152   0.494699    1.000000   1.000000   
dropoff_longitude      212345.0 -73.144178   7.794599 -121.933327 -73.990570   
dropoff_latitude       212345.0  40.295648   4.293031    0.000000  40.740089   
payment_type           212345.0   1.324943   0.478255    1.000000   1.000000   
fare_amount            212345.0  12.665588  10.875584  -52.000000   6.500000   
extra                  212345.0   0.048077   0.148103   -0.500000   0.000000   
mta_tax                212345.0   0.497662   0.037436   -0.500000   0.500000   
tip_amount          

In [None]:
# Displaying first few rows for a quick look
data_head = data.head()
data_head

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-03-01 00:00:00,2016-03-01 00:07:55,1,-73.976746,40.765152,1,N,-74.004265,40.746128,1,9.0,0.5,0.5,2.05,0.0,0.3,12.35
1,1,2016-03-01 00:00:00,2016-03-01 00:11:06,1,-73.983482,40.767925,1,N,-74.005943,40.733166,1,11.0,0.5,0.5,3.05,0.0,0.3,15.35
2,2,2016-03-01 00:00:00,2016-03-01 00:31:06,2,-73.782021,40.64481,1,N,-73.974541,40.67577,1,54.5,0.5,0.5,8.0,0.0,0.3,63.8
3,2,2016-03-01 00:00:00,2016-03-01 00:00:00,3,-73.863419,40.769814,1,N,-73.96965,40.757767,1,31.5,0.0,0.5,3.78,5.54,0.3,41.62
4,2,2016-03-01 00:00:00,2016-03-01 00:00:00,5,-73.971741,40.792183,3,N,-74.17717,40.695053,1,98.0,0.0,0.0,0.0,15.5,0.3,113.8


In [None]:
# DataFrame.info() prints its summary directly and returns None, so there is
# nothing useful to store in a variable or to display as the cell's last
# expression; just call it.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212345 entries, 0 to 212344
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               212345 non-null  int64  
 1   tpep_pickup_datetime   212345 non-null  object 
 2   tpep_dropoff_datetime  212345 non-null  object 
 3   passenger_count        212345 non-null  int64  
 4   pickup_longitude       212345 non-null  float64
 5   pickup_latitude        212345 non-null  float64
 6   RatecodeID             212345 non-null  int64  
 7   store_and_fwd_flag     212345 non-null  object 
 8   dropoff_longitude      212345 non-null  float64
 9   dropoff_latitude       212345 non-null  float64
 10  payment_type           212345 non-null  int64  
 11  fare_amount            212345 non-null  float64
 12  extra                  212345 non-null  float64
 13  mta_tax                212345 non-null  float64
 14  tip_amount             212345 non-nu

# Data Preprocessing

In [None]:
# Parse both raw timestamp columns as UTC and re-express them in New York
# local time ('America/New_York' applies EST or EDT depending on the date).
# NOTE(review): this assumes the CSV timestamps are UTC — confirm against the
# dataset documentation, since every hour-of-day feature depends on it.
print("⏰ Converting pickup_datetime from UTC to EDT...")
for timestamp_column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    as_utc = pd.to_datetime(data[timestamp_column], utc=True)
    data[timestamp_column] = as_utc.dt.tz_convert('America/New_York')

⏰ Converting pickup_datetime from UTC to EDT...


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212345 entries, 0 to 212344
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype                           
---  ------                 --------------   -----                           
 0   VendorID               212345 non-null  int64                           
 1   tpep_pickup_datetime   212345 non-null  datetime64[ns, America/New_York]
 2   tpep_dropoff_datetime  212345 non-null  datetime64[ns, America/New_York]
 3   passenger_count        212345 non-null  int64                           
 4   pickup_longitude       212345 non-null  float64                         
 5   pickup_latitude        212345 non-null  float64                         
 6   RatecodeID             212345 non-null  int64                           
 7   store_and_fwd_flag     212345 non-null  object                          
 8   dropoff_longitude      212345 non-null  float64                         
 9   dropoff_latitude       212

In [None]:
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,2016-02-29 19:00:00-05:00,2016-02-29 19:07:55-05:00,1,-73.976746,40.765152,1,N,-74.004265,40.746128,1,9.0,0.5,0.5,2.05,0.0,0.3,12.35
1,1,2016-02-29 19:00:00-05:00,2016-02-29 19:11:06-05:00,1,-73.983482,40.767925,1,N,-74.005943,40.733166,1,11.0,0.5,0.5,3.05,0.0,0.3,15.35
2,2,2016-02-29 19:00:00-05:00,2016-02-29 19:31:06-05:00,2,-73.782021,40.64481,1,N,-73.974541,40.67577,1,54.5,0.5,0.5,8.0,0.0,0.3,63.8
3,2,2016-02-29 19:00:00-05:00,2016-02-29 19:00:00-05:00,3,-73.863419,40.769814,1,N,-73.96965,40.757767,1,31.5,0.0,0.5,3.78,5.54,0.3,41.62
4,2,2016-02-29 19:00:00-05:00,2016-02-29 19:00:00-05:00,5,-73.971741,40.792183,3,N,-74.17717,40.695053,1,98.0,0.0,0.0,0.0,15.5,0.3,113.8


## Feature Engineering
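* trip_distance_calculated: great-circle distance in miles, computed with the Haversine formula from the pickup & dropoff coordinates
* trip_duration_calculated: dropoff time minus pickup time, expressed in minutes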

#### Radius of the Earth
* in miles = 3,963.1 mi
* in km = 6,378 km

In [None]:
import math

In [None]:
from math import radians, sin, cos, asin, sqrt
import pandas as pd

# Haversine distance function
def haversine_distance(lon1, lat1, lon2, lat2, radius=3963):
    """
    Calculate the great-circle distance between two points on Earth using
    the Haversine formula.

    Works element-wise, so each coordinate may be a scalar, a NumPy array or
    a pandas Series (Series inputs are combined by index label, as with any
    pandas arithmetic).

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, default 3963
        Sphere radius; the result is expressed in the same unit. The default
        is the Earth's radius in miles used throughout this notebook.

    Returns
    -------
    float or array-like
        Distance(s) along the sphere's surface, in the unit of ``radius``.
    """
    # Convert decimal degrees to radians (NumPy ufuncs accept scalars,
    # arrays and Series alike).
    lon1, lat1, lon2, lat2 = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin invalid; clamp it.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arcsin(np.sqrt(a))

    return c * radius

In [None]:
# Feature engineering function
def feature_engineering(data):
    """
    Return a copy of ``data`` with the derived column
    ``trip_distance_calculated`` appended.

    The distance is obtained by calling ``haversine_distance`` once per row
    with the pickup and dropoff longitude/latitude columns. The input frame
    is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude' and 'dropoff_latitude'.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns plus
        'trip_distance_calculated'.
    """
    print("🔧 Starting Feature Engineering...")
    data_clean = data.copy()

    # Calculate trip distance using Haversine formula.
    # Iterating the four coordinate columns in lockstep avoids
    # DataFrame.apply(axis=1), which materialises a Series for every row and
    # is far slower on a frame of this size.
    print("📏 Calculating trip distances...")
    coordinate_columns = (
        data_clean['pickup_longitude'], data_clean['pickup_latitude'],
        data_clean['dropoff_longitude'], data_clean['dropoff_latitude'],
    )
    # Building an explicit float Series keeps the dtype stable even when the
    # frame is empty.
    data_clean['trip_distance_calculated'] = pd.Series(
        [haversine_distance(*coords) for coords in zip(*coordinate_columns)],
        index=data_clean.index,
        dtype='float64',
    )

    print("✅ Feature engineering completed!")
    print(f"📊 New dataset shape: {data_clean.shape}")

    return data_clean

# Run the feature-engineering step: data_engineered holds every original
# column plus the derived trip_distance_calculated column.
data_engineered = feature_engineering(data)

# Show the resulting column names to confirm the new feature is present.
print(data_engineered.columns)

# Rebind `data` to an independent copy of the engineered frame so the
# following cells keep working on `data`.
data = data_engineered.copy()


🔧 Starting Feature Engineering...
📏 Calculating trip distances...
✅ Feature engineering completed!
📊 New dataset shape: (212345, 19)
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude', 'RatecodeID',
       'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude',
       'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'total_amount',
       'trip_distance_calculated'],
      dtype='object')


In [None]:
# Trip duration in minutes: elapsed time between dropoff and pickup,
# converted from seconds.
trip_elapsed = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']
data['trip_duration_calculated'] = trip_elapsed.dt.total_seconds() / 60

In [None]:
# Displaying the first few rows to check the conversion and the new column
data[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance_calculated','trip_duration_calculated']].head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance_calculated,trip_duration_calculated
0,2016-02-29 19:00:00-05:00,2016-02-29 19:07:55-05:00,1.952013,7.916667
1,2016-02-29 19:00:00-05:00,2016-02-29 19:11:06-05:00,2.676833,11.1
2,2016-02-29 19:00:00-05:00,2016-02-29 19:31:06-05:00,10.325908,31.1
3,2016-02-29 19:00:00-05:00,2016-02-29 19:00:00-05:00,5.62729,0.0
4,2016-02-29 19:00:00-05:00,2016-02-29 19:00:00-05:00,12.689576,0.0


In [None]:
print(f"Columns: {list(data.columns)}")

Columns: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'trip_distance_calculated', 'trip_duration_calculated']


In [None]:
# Calendar and time-of-day components of the pickup timestamp
pickup_dt = data['tpep_pickup_datetime'].dt
data['pickup_year'] = pickup_dt.year
data['pickup_month'] = pickup_dt.month
data['pickup_weekday_name'] = pickup_dt.day_name()
data['pickup_hour'] = pickup_dt.hour

In [None]:
# Calendar and time-of-day components of the dropoff timestamp
dropoff_dt = data['tpep_dropoff_datetime'].dt
data['dropoff_year'] = dropoff_dt.year
data['dropoff_month'] = dropoff_dt.month
data['dropoff_weekday_name'] = dropoff_dt.day_name()
data['dropoff_hour'] = dropoff_dt.hour

In [None]:
print(f"Columns: {list(data.columns)}")

Columns: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'trip_distance_calculated', 'trip_duration_calculated', 'pickup_year', 'pickup_month', 'pickup_weekday_name', 'pickup_hour', 'dropoff_year', 'dropoff_month', 'dropoff_weekday_name', 'dropoff_hour']


In [None]:
data['pickup_weekday_name'].unique()

array(['Monday', 'Thursday', 'Tuesday'], dtype=object)

In [None]:
data['dropoff_weekday_name'].nunique()

5

### pickup_day: Extract weekday/weekend

In [None]:
# Weekend indicator
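# data['is_weekend'] = (data['pickup_weekday'] >= 5).astype(int)
# Not applied: this dataset only contains pickups on Monday, Tuesday and Thursday,
# which are all weekdays, so a weekend flag would be constant. (A numeric
# 'pickup_weekday' column would also have to be created first; only
# 'pickup_weekday_name' exists.)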

### Extract am/pm

In [None]:
# AM/PM indicators (1 = hour is 12 or later, 0 = before noon).
# Pickup and dropoff each get their own column: writing both to 'is_pm'
# made the dropoff value silently overwrite the pickup value.
data['is_pm'] = (data['pickup_hour'] >= 12).astype(int)
data['dropoff_is_pm'] = (data['dropoff_hour'] >= 12).astype(int)

### is_night: Binary flag for late-night/early-morning trips

In [None]:
# Night ride indicators: 1 when the hour is >= 22 or <= 6 (22:00 through 06:59).
# Pickup and dropoff each get their own column: writing both to 'is_night'
# made the dropoff value silently overwrite the pickup value.
data['is_night'] = ((data['pickup_hour'] >= 22) | (data['pickup_hour'] <= 6)).astype(int)
data['dropoff_is_night'] = ((data['dropoff_hour'] >= 22) | (data['dropoff_hour'] <= 6)).astype(int)

In [None]:
# Rush hour indicators: morning = hours 7-9, evening = hours 17-19 (inclusive).
# Pickup and dropoff each get their own columns: reusing the same two names
# made the dropoff values silently overwrite the pickup values.
data['is_morning_rush'] = ((data['pickup_hour'] >= 7) & (data['pickup_hour'] <= 9)).astype(int)
data['is_evening_rush'] = ((data['pickup_hour'] >= 17) & (data['pickup_hour'] <= 19)).astype(int)
data['dropoff_is_morning_rush'] = ((data['dropoff_hour'] >= 7) & (data['dropoff_hour'] <= 9)).astype(int)
data['dropoff_is_evening_rush'] = ((data['dropoff_hour'] >= 17) & (data['dropoff_hour'] <= 19)).astype(int)

In [None]:
# Fare per mile and fare per minute
# data['fare_per_mile'] = data['fare_amount'] / (data['trip_distance_calculated'] + 1e-8)
# data['fare_per_minute'] = data['fare_amount'] / (data['trip_duration_calculated'] + 1e-8)

In [None]:
# Speed (miles per hour)
# data['speed_mph'] = (data['trip_distance_calculated'] * 60) / (data['trip_duration_calculated'] + 1e-8)

In [None]:
# Passenger density feature
# data['fare_per_passenger'] = data['fare_amount'] / (data['passenger_count'] + 1e-8)

In [None]:
print(f"Columns: {list(data.columns)}")

Columns: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'trip_distance_calculated', 'trip_duration_calculated', 'pickup_year', 'pickup_month', 'pickup_weekday_name', 'pickup_hour', 'dropoff_year', 'dropoff_month', 'dropoff_weekday_name', 'dropoff_hour', 'is_pm', 'is_night', 'is_morning_rush', 'is_evening_rush']


In [None]:
data.shape

(212345, 32)

In [None]:
# Remove the raw timestamps (already decomposed into year/month/weekday/hour
# columns), the store_and_fwd_flag, and the individual charge columns.
# NOTE(review): the charge columns appear to be components of total_amount —
# confirm that total_amount is the intended prediction target.
# A non-inplace drop with errors='ignore' keeps this cell re-runnable: running
# it a second time no longer raises KeyError for already-removed columns.
columns_to_drop = [
    'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'store_and_fwd_flag',
    'fare_amount', 'extra', 'mta_tax', 'tip_amount',
    'tolls_amount', 'improvement_surcharge',
]
data = data.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Data types
print("🔍 DATA TYPES:")
print(data.dtypes)
print("-" * 40)  # separator line
print(data.shape)

🔍 DATA TYPES:
VendorID                      int64
passenger_count               int64
pickup_longitude            float64
pickup_latitude             float64
RatecodeID                    int64
dropoff_longitude           float64
dropoff_latitude            float64
payment_type                  int64
total_amount                float64
trip_distance_calculated    float64
trip_duration_calculated    float64
pickup_year                   int32
pickup_month                  int32
pickup_weekday_name          object
pickup_hour                   int32
dropoff_year                  int32
dropoff_month                 int32
dropoff_weekday_name         object
dropoff_hour                  int32
is_pm                         int64
is_night                      int64
is_morning_rush               int64
is_evening_rush               int64
dtype: object
----------------------------------------
(212345, 23)


In [None]:
print(data.isnull().sum())
print("-" * 40)  # separator line
print(data.isna().sum())

VendorID                    0
passenger_count             0
pickup_longitude            0
pickup_latitude             0
RatecodeID                  0
dropoff_longitude           0
dropoff_latitude            0
payment_type                0
total_amount                0
trip_distance_calculated    0
trip_duration_calculated    0
pickup_year                 0
pickup_month                0
pickup_weekday_name         0
pickup_hour                 0
dropoff_year                0
dropoff_month               0
dropoff_weekday_name        0
dropoff_hour                0
is_pm                       0
is_night                    0
is_morning_rush             0
is_evening_rush             0
dtype: int64
----------------------------------------
VendorID                    0
passenger_count             0
pickup_longitude            0
pickup_latitude             0
RatecodeID                  0
dropoff_longitude           0
dropoff_latitude            0
payment_type                0
total_amount    

In [None]:
data.duplicated().sum()

np.int64(6)