In [10]:
import pandas as pd
import numpy as np

# Load the taxi trip pricing dataset from a CSV file resolved relative to the
# current working directory.
df = pd.read_csv("taxi_trip_pricing.csv")

# Preview the first rows (at most 20) to inspect the columns and spot missing values.
df.head(20)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618
5,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.2028
6,3.85,Afternoon,Weekday,4.0,High,Rain,3.51,1.66,,5.05,11.2645
7,43.44,Evening,Weekend,3.0,,Clear,2.97,1.87,0.23,,101.1216
8,30.45,Morning,Weekday,3.0,High,Clear,2.77,1.78,0.34,110.33,
9,35.7,Afternoon,Weekday,2.0,Low,Rain,3.39,1.52,0.47,,75.5657


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
count,950.0,950.0,950.0,950.0,950.0,950.0,951.0
mean,27.070547,2.476842,3.502989,1.233316,0.292916,62.118116,56.874773
std,19.9053,1.102249,0.870162,0.429816,0.115592,32.154406,40.469791
min,1.23,1.0,2.01,0.5,0.1,5.01,6.1269
25%,12.6325,1.25,2.73,0.86,0.19,35.8825,33.74265
50%,25.83,2.0,3.52,1.22,0.29,61.86,50.0745
75%,38.405,3.0,4.26,1.61,0.39,89.055,69.09935
max,146.067047,4.0,5.0,2.0,0.5,119.84,332.043689


In [12]:
# Column dtypes and non-null counts: shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Time_of_Day            950 non-null    object 
 2   Day_of_Week            950 non-null    object 
 3   Passenger_Count        950 non-null    float64
 4   Traffic_Conditions     950 non-null    object 
 5   Weather                950 non-null    object 
 6   Base_Fare              950 non-null    float64
 7   Per_Km_Rate            950 non-null    float64
 8   Per_Minute_Rate        950 non-null    float64
 9   Trip_Duration_Minutes  950 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


In [13]:
# Number of missing values in each column.
df.isnull().sum()

Trip_Distance_km         50
Time_of_Day              50
Day_of_Week              50
Passenger_Count          50
Traffic_Conditions       50
Weather                  50
Base_Fare                50
Per_Km_Rate              50
Per_Minute_Rate          50
Trip_Duration_Minutes    50
Trip_Price               49
dtype: int64

## Handling missing values

In [17]:
# --- Define Constants ---
# Column we want to predict; it is excluded from feature imputation.
TARGET_COLUMN = 'Trip_Price'
# Numeric columns that should be filled with the mode instead of the mean.
MODE_IMPUTE_FEATURES = ['Passenger_Count']

# Get all columns that are NOT the target
feature_columns = df.columns.drop(TARGET_COLUMN)
features_to_impute = df[feature_columns]

# Define Groups
all_numerical = features_to_impute.select_dtypes(include=[np.number]).columns
all_categorical = features_to_impute.select_dtypes(include=['object']).columns

# Numerical features that will be filled with the MEAN (Excludes Passenger_Count)
numerical_mean_impute = [col for col in all_numerical if col not in MODE_IMPUTE_FEATURES]

# Features that will be filled with the MODE (Passenger_Count + all Categorical)
mode_impute_features = MODE_IMPUTE_FEATURES + list(all_categorical)

print(f"Features (Mean Imputation): {numerical_mean_impute}")
print(f"Features (Mode Imputation): {mode_impute_features}\n")


# Impute numerical features with the MEAN
for feature in numerical_mean_impute:
    mean_value = df[feature].mean()
    # fillna() returns a NEW Series and leaves the original untouched, so the
    # result must be assigned back to the column for the imputation to stick.
    df[feature] = df[feature].fillna(mean_value)

# Impute Passenger_Count and categorical features with the MODE
for feature in mode_impute_features:
    modes = df[feature].mode()
    if modes.empty:
        # Column has no non-null values, so there is no mode to fill with;
        # leave it as-is (it will show up in the missing-value report below).
        continue
    # mode() may return several values on ties; take the first one.
    mode_value = modes.iloc[0]
    df[feature] = df[feature].fillna(mode_value)


print("Missing Value Counts After Imputation:")
print(df.isnull().sum())

Features (Mean Imputation): ['Trip_Distance_km', 'Base_Fare', 'Per_Km_Rate', 'Per_Minute_Rate', 'Trip_Duration_Minutes']
Features (Mode Imputation): ['Passenger_Count', 'Time_of_Day', 'Day_of_Week', 'Traffic_Conditions', 'Weather']

Missing Value Counts After Imputation:
Trip_Distance_km          0
Time_of_Day               0
Day_of_Week               0
Passenger_Count           0
Traffic_Conditions        0
Weather                   0
Base_Fare                 0
Per_Km_Rate               0
Per_Minute_Rate           0
Trip_Duration_Minutes     0
Trip_Price               49
dtype: int64


In [19]:
# Next, handle the missing values in the target column 'Trip_Price':
# every row where Trip_Price is missing gets it recomputed from the fare components.

def calculate_trip_price(row):
    """Compute a trip price from the fare components found in ``row``.

    Args:
        row: Any object indexable by the keys 'Trip_Distance_km',
            'Per_Km_Rate', 'Per_Minute_Rate', 'Trip_Duration_Minutes'
            and 'Base_Fare' (for example a DataFrame row).

    Returns:
        distance * per-km rate + per-minute rate * duration + base fare.
    """
    distance_charge = row['Trip_Distance_km'] * row['Per_Km_Rate']
    time_charge = row['Per_Minute_Rate'] * row['Trip_Duration_Minutes']
    # Keep the summation order (distance, then time, then base fare) so the
    # floating-point result is unchanged.
    return distance_charge + time_charge + row['Base_Fare']

# Rows whose target value is missing and has to be reconstructed.
missing_target_mask = df['Trip_Price'].isnull()

# calculate_trip_price only performs column lookups and arithmetic, so it can be
# evaluated on the whole sub-frame in one vectorised call instead of row by row
# through DataFrame.apply(axis=1). The resulting Series is aligned on the index
# when assigned through .loc, so only the masked rows are written.
df.loc[missing_target_mask, 'Trip_Price'] = calculate_trip_price(df.loc[missing_target_mask])

print("Missing Value Counts After Imputing Target:")
print(df.isnull().sum())

Missing Value Counts After Imputing Target:
Trip_Distance_km         0
Time_of_Day              0
Day_of_Week              0
Passenger_Count          0
Traffic_Conditions       0
Weather                  0
Base_Fare                0
Per_Km_Rate              0
Per_Minute_Rate          0
Trip_Duration_Minutes    0
Trip_Price               0
dtype: int64
