# Taxi Price Prediction Model

## 1. Loading the dataset and Exploratory Data Analysis

In [67]:
# importing essential libaries
import pandas as pd

In [68]:
# Loading the dataset and printing the first 5 rows to ensure its loaded correctly
dataset_path = '../Dataset/taxi_trip_pricing.csv'
data = pd.read_csv(dataset_path)
df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


In [69]:
# Checking the number of rows and columns in the dataset
df.shape

(1000, 11)

In [70]:
# Checking what are the columns in the df
df.columns

Index(['Trip_Distance_km', 'Time_of_Day', 'Day_of_Week', 'Passenger_Count',
       'Traffic_Conditions', 'Weather', 'Base_Fare', 'Per_Km_Rate',
       'Per_Minute_Rate', 'Trip_Duration_Minutes', 'Trip_Price'],
      dtype='object')

In [71]:
# Checking the data types of the dataset
df.dtypes

Trip_Distance_km         float64
Time_of_Day               object
Day_of_Week               object
Passenger_Count          float64
Traffic_Conditions        object
Weather                   object
Base_Fare                float64
Per_Km_Rate              float64
Per_Minute_Rate          float64
Trip_Duration_Minutes    float64
Trip_Price               float64
dtype: object

In [72]:
# Getting 5 numbers summary of the dataset
df.describe()

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
count,950.0,950.0,950.0,950.0,950.0,950.0,951.0
mean,27.070547,2.476842,3.502989,1.233316,0.292916,62.118116,56.874773
std,19.9053,1.102249,0.870162,0.429816,0.115592,32.154406,40.469791
min,1.23,1.0,2.01,0.5,0.1,5.01,6.1269
25%,12.6325,1.25,2.73,0.86,0.19,35.8825,33.74265
50%,25.83,2.0,3.52,1.22,0.29,61.86,50.0745
75%,38.405,3.0,4.26,1.61,0.39,89.055,69.09935
max,146.067047,4.0,5.0,2.0,0.5,119.84,332.043689


## 2. Handling Missing Values

In [73]:
# Checking the missing values in the dataset
Missing_Values = df.isnull()
Missing_Values

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,True,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,True,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,True,False,False
997,False,False,False,False,False,False,False,False,True,False,False
998,False,False,False,False,False,False,False,False,False,False,False


In [74]:
for col in Missing_Values:
    print(f"{col} has {df[col].isnull().sum()} missing values and Data type: {df[col].dtype}")

Trip_Distance_km has 50 missing values and Data type: float64
Time_of_Day has 50 missing values and Data type: object
Day_of_Week has 50 missing values and Data type: object
Passenger_Count has 50 missing values and Data type: float64
Traffic_Conditions has 50 missing values and Data type: object
Weather has 50 missing values and Data type: object
Base_Fare has 50 missing values and Data type: float64
Per_Km_Rate has 50 missing values and Data type: float64
Per_Minute_Rate has 50 missing values and Data type: float64
Trip_Duration_Minutes has 50 missing values and Data type: float64
Trip_Price has 49 missing values and Data type: float64


### 2.1 Handling missing values for numerical features using mean

In [75]:
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    # Using mean to fill the missing values in numerical features
    df[col] = df[col].fillna(df[col].mean())
    # Checking whether there's still missing values after filling with mean
    print(f"Missing values in {col} after handling {df[col].isnull().sum()}")

Missing values in Trip_Distance_km after handling 0
Missing values in Passenger_Count after handling 0
Missing values in Base_Fare after handling 0
Missing values in Per_Km_Rate after handling 0
Missing values in Per_Minute_Rate after handling 0
Missing values in Trip_Duration_Minutes after handling 0
Missing values in Trip_Price after handling 0


### 2.2 Handling missing values for categorical features using mode

In [76]:
for col in df.select_dtypes(include=['object']).columns:
    # Using mode to fill the missing values in numerical features
    df[col] = df[col].fillna(df[col].mode()[0])
    # Checking whether there's still missing values after filling with mean
    print(f"Missing values in {col} after handling {df[col].isnull().sum()}")

Missing values in Time_of_Day after handling 0
Missing values in Day_of_Week after handling 0
Missing values in Traffic_Conditions after handling 0
Missing values in Weather after handling 0


## 3. Encoding and Feature Engineering

In [77]:
for col in df.select_dtypes(include=['object']).columns:
    print(f"Unique values in {col} : {df[col].unique()}")

Unique values in Time_of_Day : ['Morning' 'Afternoon' 'Evening' 'Night']
Unique values in Day_of_Week : ['Weekday' 'Weekend']
Unique values in Traffic_Conditions : ['Low' 'High' 'Medium']
Unique values in Weather : ['Clear' 'Rain' 'Snow']


In [78]:
# Using label encoding technique to transform categorical values into numerical values

df['Time_of_Day'] = pd.to_numeric(df['Time_of_Day'].replace({'Morning':0, 'Afternoon': 1, 'Evening': 2, 'Night': 3}, regex=True), errors='coerce')

df['Day_of_Week'] = pd.to_numeric(df['Day_of_Week'].replace({'Weekday':0, 'Weekend': 1}, regex=True), errors='coerce')

df['Traffic_Conditions'] = pd.to_numeric(df['Traffic_Conditions'].replace({'Low':0, 'Medium': 1, 'High': 2}, regex=True), errors='coerce')

df['Weather'] = pd.to_numeric(df['Day_of_Week'].replace({'Clear':0, 'Rain': 1, 'Snow': 2}, regex=True), errors='coerce')

  df['Time_of_Day'] = pd.to_numeric(df['Time_of_Day'].replace({'Morning':0, 'Afternoon': 1, 'Evening': 2, 'Night': 3}, regex=True), errors='coerce')
  df['Day_of_Week'] = pd.to_numeric(df['Day_of_Week'].replace({'Weekday':0, 'Weekend': 1}, regex=True), errors='coerce')
  df['Traffic_Conditions'] = pd.to_numeric(df['Traffic_Conditions'].replace({'Low':0, 'Medium': 1, 'High': 2}, regex=True), errors='coerce')


In [79]:
df.dtypes

Trip_Distance_km         float64
Time_of_Day                int64
Day_of_Week                int64
Passenger_Count          float64
Traffic_Conditions         int64
Weather                    int64
Base_Fare                float64
Per_Km_Rate              float64
Per_Minute_Rate          float64
Trip_Duration_Minutes    float64
Trip_Price               float64
dtype: object

## 4. Training the model and exporting

In [80]:
# Selecting independent and target features for model training
features = ['Trip_Distance_km', 'Time_of_Day', 'Day_of_Week', 'Passenger_Count', 'Traffic_Conditions', 'Weather', 'Base_Fare', 'Per_Km_Rate', 'Per_Minute_Rate', 'Trip_Duration_Minutes']
X = df[features]
y = df['Trip_Price']

In [81]:
# Splitting the dataset into training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

In [82]:
# Importing regression model to train our own ML model
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [83]:
RF_regressor = RandomForestRegressor(random_state=1)
DT_regressor = DecisionTreeRegressor(random_state=1)

In [84]:
# Train the two models
RF_regressor.fit(X_train, y_train)
DT_regressor.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Model Evaluation using Mean Absolute error

In [87]:
from sklearn.metrics import mean_absolute_error

RF_predictions = RF_regressor.predict(X_test)
DT_predictions = DT_regressor.predict(X_test)

mae_DT = mean_absolute_error(y_test, DT_predictions)
mae_RF = mean_absolute_error(y_test, RF_predictions)

print(f"Mean Absolute Error of Decision Tree Model = {mae_DT}")
print(f"Mean Absolute Error of Random Forest Model = {mae_RF}")

Mean Absolute Error of Decision Tree Model = 10.173477217649381
Mean Absolute Error of Random Forest Model = 6.632285762115304


As seen from the MAE values the Random Forest Model has higher accuracy therefore it will be saved for our application use

In [90]:
import joblib

model = "../Model/Taxi_Price_Prediction_Model.pkl"

joblib.dump(RF_regressor, model)

['../Model/Taxi_Price_Prediction_Model.pkl']