In [60]:
#Installing gradio for the User interface
!pip install gradio



In [109]:
#Import needed libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
import gradio as gr
import joblib
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/dynamic-pricing/dynamic_pricing.csv


In [62]:
#Loading the ride‑hailing dataset
data= pd.read_csv('/kaggle/input/dynamic-pricing/dynamic_pricing.csv')

In [63]:
data.head()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422


In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Number_of_Riders         1000 non-null   int64  
 1   Number_of_Drivers        1000 non-null   int64  
 2   Location_Category        1000 non-null   object 
 3   Customer_Loyalty_Status  1000 non-null   object 
 4   Number_of_Past_Rides     1000 non-null   int64  
 5   Average_Ratings          1000 non-null   float64
 6   Time_of_Booking          1000 non-null   object 
 7   Vehicle_Type             1000 non-null   object 
 8   Expected_Ride_Duration   1000 non-null   int64  
 9   Historical_Cost_of_Ride  1000 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 78.3+ KB


In [65]:
data.describe()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Number_of_Past_Rides,Average_Ratings,Expected_Ride_Duration,Historical_Cost_of_Ride
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,60.372,27.076,50.031,4.25722,99.588,372.502623
std,23.701506,19.068346,29.313774,0.435781,49.16545,187.158756
min,20.0,5.0,0.0,3.5,10.0,25.993449
25%,40.0,11.0,25.0,3.87,59.75,221.365202
50%,60.0,22.0,51.0,4.27,102.0,362.019426
75%,81.0,38.0,75.0,4.6325,143.0,510.497504
max,100.0,89.0,100.0,5.0,180.0,836.116419


In [66]:
#Getting the correlation between expected ride duration and historical cost of ride
(data['Expected_Ride_Duration'].corr(data['Historical_Cost_of_Ride']))

0.9275471833882493

The correlation of about 0.93 indicates a strong positive linear relationship: historically, ride prices have been driven largely by ride duration.

In [67]:
fig = px.scatter(data, x='Expected_Ride_Duration',
                 y='Historical_Cost_of_Ride', title= 'Expected Ride Duration vs. Historical Cost of Ride',
                trendline= 'ols')
fig.show()

The visualization above plots expected ride duration against the historical cost of the ride. The relationship is clearly positive: longer rides cost more.

In [68]:
fig = px.box(data, x= 'Vehicle_Type', y='Historical_Cost_of_Ride',
            title= 'Historical Cost of Ride Distribution by Vehicle Type')
fig.show()

The boxplot shows that ride cost differs by vehicle type; as expected, customers pay more for a Premium vehicle.

In [69]:
#Extracting the numerical features from the dataset
data_numeric = data.select_dtypes(include= ['float64','int64'])

In [70]:
cor_matrix = data_numeric.corr()

In [71]:
#Plotting the correlation matrix
fig = go.Figure(data =go.Heatmap(z=cor_matrix.values,
                                x=cor_matrix.columns,
                                y= cor_matrix.columns, colorscale='Viridis'))
fig.update_layout(title='Correlation Matrix')
fig.show()

In [72]:
#setting high and low demand percentile
high_demand_percentile = 75
low_demand_percentile = 25

In [73]:
#Demand multiplier: each row's rider count relative to a percentile of the rider distribution
riders = data['Number_of_Riders']
riders_high_cutoff = np.percentile(riders, high_demand_percentile)
riders_low_cutoff = np.percentile(riders, low_demand_percentile)
# Rows above the high-demand cutoff are divided by that cutoff; every other row by the low-demand cutoff
data['demand_multiplier'] = np.where(riders > riders_high_cutoff,
                                    riders / riders_high_cutoff,
                                    riders / riders_low_cutoff)

In [74]:
high_supply_percentile = 75
low_supply_percentile = 25

In [75]:
#Supply multiplier: a percentile of the driver distribution relative to each row's driver count
drivers = data['Number_of_Drivers']
drivers_high_cutoff = np.percentile(drivers, high_supply_percentile)
drivers_low_cutoff = np.percentile(drivers, low_supply_percentile)
# Rows above the low-supply cutoff use the high cutoff as numerator; the remaining rows use the low cutoff
data['supply_multiplier'] = np.where(drivers > drivers_low_cutoff,
                                    drivers_high_cutoff / drivers,
                                    drivers_low_cutoff / drivers)

In [76]:
#Setting the thresholds for the demand and supply multipliers
# demand_threshold_low and supply_threshold_high are the two values read by the
# ride-cost adjustment cell, where each acts as a lower bound on its multiplier.
# NOTE(review): demand_threshold_high and supply_threshold_low are not referenced
# by any cell visible in this notebook - confirm whether they are still needed.
demand_threshold_high = 1.2
demand_threshold_low = 0.8
supply_threshold_high = 0.8
supply_threshold_low = 1.2

In [77]:
#Adjusting ride cost based on demand and supply dynamics
# Each multiplier is floored at its threshold before being applied to the historical cost
demand_factor = np.maximum(data['demand_multiplier'], demand_threshold_low)
supply_factor = np.maximum(data['supply_multiplier'], supply_threshold_high)
data['adjusted_ride_cost'] = data['Historical_Cost_of_Ride'] * (demand_factor * supply_factor)

In [78]:
#Calculating the change in profit between static and dynamic pricing
data['profit_percentage'] = ((data['adjusted_ride_cost']- data['Historical_Cost_of_Ride'])/ data['Historical_Cost_of_Ride'])* 100

In [79]:
#Getting profit and loss due to dynamic pricing
profitable_rides = data[data['profit_percentage'] > 0]

loss_rides = data[data['profit_percentage'] < 0]

In [80]:
profitable_count = len(profitable_rides)

loss_count = len(loss_rides)

In [81]:
loss_count

173

In [82]:
profitable_count

826

In [83]:
labels = ['Profitable Rides', 'Loss Rides']
values = [profitable_count,loss_count]

In [84]:
fig = go.Figure(data= [go.Pie(labels=labels,values=values, hole=0.4)])
fig.update_layout(title='Profitability of Rides (Dynamic Pricing vs. Historical Pricing)')
fig.show()

The plot above shows that a dynamic pricing strategy based on rider demand and driver supply raises the ride price above its historical cost for about 82.7% of rides (826 profitable rides versus 173 loss-making rides).

In [85]:
fig = px.scatter(data, x='Expected_Ride_Duration', y ='adjusted_ride_cost',
                title= 'Expected Ride Duration vs. Cost of Ride', trendline='ols')
fig.show()

The plot above shows that, after the demand and supply adjustment, ride duration is no longer the sole determinant of the ride price.

# Training a Predictive Model

In [86]:
#Defining a data preprocessing function to be applied before training a model on the dataset
def data_preprocessing_pipeline(data):
    """Fill missing values and damp numeric outliers in ``data``.

    Steps, in order:
      1. Missing values in numeric columns are replaced with the column mean.
      2. Numeric values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are replaced
         with the column mean (the mean is taken after step 1 and still
         includes the outlying values).
      3. Missing values in categorical (object) columns are replaced with the
         column's most frequent value.

    Args:
        data: pandas DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, after cleaning.
    """
    #Identify numeric and categorical features
    numeric_features = data.select_dtypes(include=['int', 'float']).columns
    categorical_features = data.select_dtypes(include=['object']).columns

    #Handling missing data in numeric features (skipped when there are none)
    if len(numeric_features) > 0:
        data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())

    #Detect and handle outliers in numeric features using IQR
    for feature in numeric_features:
        q1 = data[feature].quantile(0.25)
        q3 = data[feature].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        is_outlier = (data[feature] < lower_bound) | (data[feature] > upper_bound)
        data[feature] = np.where(is_outlier, data[feature].mean(), data[feature])

    #Handling missing data in categorical features.
    # This runs once, outside the numeric loop, so it also applies to frames
    # that have no numeric columns at all.
    if len(categorical_features) > 0:
        modes = data[categorical_features].mode()
        # mode() has no rows when every categorical value is missing; nothing to fill with then
        if not modes.empty:
            data[categorical_features] = data[categorical_features].fillna(modes.iloc[0])

    return data

In [87]:
#Changing the vehicle type column to numeric
# Series.map with a dict yields NaN for any value that is not a key, so the
# already-encoded values 1 and 0 are mapped to themselves. This keeps the cell
# safe to re-run: a second execution no longer turns the whole column into NaN.
vehicle_type_codes = {'Premium': 1, 'Economy': 0, 1: 1, 0: 0}
data['Vehicle_Type'] = data['Vehicle_Type'].map(vehicle_type_codes)

In [88]:
#Defining the predictor variables
x = np.array(data[["Number_of_Riders", "Number_of_Drivers", "Vehicle_Type", "Expected_Ride_Duration"]])

In [89]:
#Defining the target variable
y = np.array(data[["adjusted_ride_cost"]])

In [90]:
np.isnan(x).any()

False

In [91]:
# Splitting the dataset into test and training set
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size= 0.2, random_state = 42)

In [92]:
np.isnan(X_train).any()

False

In [93]:
#Reshape y to 1D array
y_train = y_train.ravel()
y_test = y_test.ravel()

In [94]:
#Defining the random forest model
# A fixed random_state makes the fitted forest, and therefore every prediction
# and plot below, reproducible across kernel restarts (same seed as the train/test split).
model = RandomForestRegressor(random_state=42)

In [95]:
#Fitting the training data on the model
model.fit(X_train,y_train)

In [96]:
#Function to convert vehicle type to numeric
def get_vehicle_type(vehicle_type):
    """Return 1 for "Premium", 0 for "Economy", and None for any other value."""
    return {"Premium": 1, 'Economy': 0}.get(vehicle_type)

In [105]:
#Function to predict ride price with the random forest regressor
def predict_price(number_of_riders, number_of_drivers, vehicle_type, Expected_Ride_Duration):
    """Predict the ride price for one ride using the fitted ``model``.

    The vehicle type label is converted with ``get_vehicle_type``; a label it
    does not recognise raises ``ValueError("Invalid Vehicle Type")``.

    Returns:
        Whatever ``model.predict`` returns for the single-row input.
    """
    encoded_type = get_vehicle_type(vehicle_type)
    if encoded_type is None:
        raise ValueError("Invalid Vehicle Type")
    # Feature order matches the columns used to build the training matrix
    feature_row = [number_of_riders, number_of_drivers, encoded_type, Expected_Ride_Duration]
    return model.predict(np.array([feature_row]))

In [106]:
# Example prediction using user input values
user_number_of_riders = 50
user_number_of_drivers = 25
user_vehicle_type = "Economy"
Expected_Ride_Duration = 30
predicted_price = predict_price(user_number_of_riders, user_number_of_drivers, user_vehicle_type, Expected_Ride_Duration)
print("Predicted price:", predicted_price)

Predicted price: [245.87407338]


Plotting the actual prices and model predicted prices

In [115]:
# Predict on the held-out test set here, so the cell runs on a fresh kernel
# instead of relying on a `y_pred` left over from a cell that no longer exists.
y_pred = model.predict(X_test)

fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test.flatten(), y=y_pred, mode='markers', name='Actual vs. Predicted'))
# Add a line representing the ideal case (predicted == actual)
actual_range = [min(y_test.flatten()), max(y_test.flatten())]
fig.add_trace(go.Scatter(
    x=actual_range,
    y=actual_range,
    mode='lines',
    name='Ideal',
    line=dict(color='red', dash='dash')
))

fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Actual Values',
    yaxis_title='Predicted Values',
    showlegend=True,
)

fig.show()

In [101]:
#preparing the model for download
joblib.dump(model, 'ride_model.pkl')

['ride_model.pkl']

In [116]:
# loading the model
ride_model= joblib.load('ride_model.pkl')

In [103]:
#Function to predict ride price dynamically
def predict_ride_price(user_number_of_riders,user_number_of_drivers,user_vehicle_type,Expected_Ride_Duration):
    """Predict a ride price with the reloaded ``ride_model`` and format it for display.

    Args:
        user_number_of_riders: number of riders.
        user_number_of_drivers: number of drivers.
        user_vehicle_type: vehicle type label understood by ``get_vehicle_type``.
        Expected_Ride_Duration: expected ride duration.

    Returns:
        A string with the predicted price rounded to two decimals, or a string
        starting with " Error: " describing why no prediction could be made.
        Errors are returned rather than raised so the UI can show them as text.
    """
    try:
        vehicle_type_numeric = get_vehicle_type(user_vehicle_type)
        # Reject unknown or missing vehicle types up front instead of letting a
        # None value reach the estimator and surface as an opaque error.
        if vehicle_type_numeric is None:
            raise ValueError("Invalid Vehicle Type")
        numeric_inputs = (user_number_of_riders, user_number_of_drivers, Expected_Ride_Duration)
        if any(value is None for value in numeric_inputs):
            raise ValueError("Number of riders, number of drivers and ride duration are required")
        # Plain array in training column order, as in predict_price: the model
        # was fitted on an array without column names.
        features = np.array([[user_number_of_riders, user_number_of_drivers,
                              vehicle_type_numeric, Expected_Ride_Duration]], dtype=float)
        prediction = ride_model.predict(features)[0]
        return f" Predicted Ride Price for input details: {prediction:.2f}"
    except Exception as e:
        return f" Error: {str(e)}"

# Building an interface to test the model with gradio

In [117]:
if __name__ == "__main__":
    # Four inputs, in the same order as predict_ride_price's parameters
    demo = gr.Interface(
        fn=predict_ride_price,
        inputs=[
            gr.Number(label="Number of Riders (Integer)"),
            gr.Number(label="Number of Drivers (Integer)"),
            gr.Dropdown(
                choices=["Economy", "Premium"],
                label="Vehicle Type"
            ),
            gr.Number(label="Expected Ride Duration (mins)")
        ],
        outputs="text",
        title="Ride Price Prediction",
        description="Enter ride details to get the predicted price."
    )

    # Launch inside the guard: `demo` only exists when this branch runs, so a
    # launch call outside it would raise NameError if the code were imported.
    demo.launch()


* Running on local URL:  http://127.0.0.1:7861
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://0c3323b53ef620a557.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


