In [1]:
# --- Standard library ---
import logging
import os
import time
import warnings

# --- Third-party ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from IPython.display import clear_output, display
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

# --- Notebook-wide configuration ---
# Seed NumPy's global random number generator and set the default figure size.
np.random.seed(42)
plt.rcParams['figure.figsize'] = (16, 8)

# Suppress scikit-learn convergence warnings (one filter is enough; it was
# previously registered twice).
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Only let ERROR-level (and above) records through the "lightgbm" logger.
logging.getLogger("lightgbm").setLevel(logging.ERROR)

In [2]:
# Resolve the CSV location relative to the current working directory,
# then read it into a DataFrame.
file_path = os.path.join(
    os.getcwd(),
    "data",
    "dynamic_pricing.csv",
)
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Show the loaded DataFrame as this cell's output for a quick visual check.
data

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422
...,...,...,...,...,...,...,...,...,...,...
995,33,23,Urban,Gold,24,4.21,Morning,Premium,11,91.389526
996,84,29,Urban,Regular,92,4.55,Morning,Premium,94,424.155987
997,44,6,Suburban,Gold,80,4.13,Night,Premium,40,157.364830
998,53,27,Suburban,Regular,78,3.63,Night,Premium,58,279.095048


In [4]:
# Report the dimensions of the frame before any transformation.
print("Dataset shape:", data.shape)

# Columns stored with the generic object dtype are the ones treated as categorical.
categorical_columns = data.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_columns.tolist())

# Replace every categorical column with indicator columns. drop_first=True
# omits the first level of each category from the generated columns.
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)

Dataset shape: (1000, 10)
Categorical columns: ['Location_Category', 'Customer_Loyalty_Status', 'Time_of_Booking', 'Vehicle_Type']


In [5]:
# Collect, in column order, the names of all boolean-typed columns.
new_categorical_columns = [
    column_name
    for column_name, column_dtype in data.dtypes.items()
    if column_dtype == 'bool'
]
print("Categorical columns to convert:", new_categorical_columns)

# Cast only those boolean columns to integers; every other column is untouched.
boolean_block = data[new_categorical_columns]
data[new_categorical_columns] = boolean_block.astype(int)

# Show the resulting dtype of every column as a sanity check.
print(data.dtypes)

# Shuffle all rows (frac=1 keeps every row). No random_state is passed here.
data = data.sample(frac=1)

Categorical columns to convert: ['Location_Category_Suburban', 'Location_Category_Urban', 'Customer_Loyalty_Status_Regular', 'Customer_Loyalty_Status_Silver', 'Time_of_Booking_Evening', 'Time_of_Booking_Morning', 'Time_of_Booking_Night', 'Vehicle_Type_Premium']
Number_of_Riders                     int64
Number_of_Drivers                    int64
Number_of_Past_Rides                 int64
Average_Ratings                    float64
Expected_Ride_Duration               int64
Historical_Cost_of_Ride            float64
Location_Category_Suburban           int64
Location_Category_Urban              int64
Customer_Loyalty_Status_Regular      int64
Customer_Loyalty_Status_Silver       int64
Time_of_Booking_Evening              int64
Time_of_Booking_Morning              int64
Time_of_Booking_Night                int64
Vehicle_Type_Premium                 int64
dtype: object


In [6]:

# Prepare data: every column except the target is used as a feature.
X = data.drop('Historical_Cost_of_Ride', axis=1)
y = data['Historical_Cost_of_Ride']

# Hold out 10% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Candidate alphas: a log-spaced range for small values plus a linear range up
# to 1.0. np.unique drops the grid point 0.1 that both ranges contain, so it is
# not cross-validated twice.
# NOTE(review): the recorded run selected alpha=1.0, the upper edge of this
# grid - consider widening the range.
lasso_alphas = np.unique(np.concatenate([np.logspace(-4, -1, 10), np.linspace(0.1, 1.0, 5)]))

# Models to tune, each paired with its hyperparameter grid.
models_and_params = {
    "Lasso": (Lasso(fit_intercept=False), {"alpha": lasso_alphas}),
}

# Tune every model with 5-fold CV on the training split (selection metric: MAE),
# then score the refitted best estimator on the held-out test split.
results = []
for name, (model, params) in models_and_params.items():
    try:
        grid_search = GridSearchCV(model, params, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        best_model = grid_search.best_estimator_
        # The scorer returns negated MAE; flip the sign to report a positive error.
        best_score = -grid_search.best_score_
        y_test_pred = best_model.predict(X_test)

        # Test-set metrics
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        # Mean CV MAE for every hyperparameter combination that was tried
        cv_mae_all = -grid_search.cv_results_["mean_test_score"]
        param_combinations = grid_search.cv_results_["params"]

        results.append({
            "Model": name,
            "Mean_CV_MAE": best_score,
            "Test_MAE": test_mae,
            "Test_MSE": test_mse,
            "Test_R2": test_r2,
            "Best_Params": grid_search.best_params_,
            "CV_MAE_All": list(zip(param_combinations, cv_mae_all)),
            "Best_Model": best_model
        })
    except Exception as e:
        # Best effort: report the failure and continue with the remaining models.
        print(f"Model {name} failed with error: {e}")

# Without this guard an empty result list surfaces later as an opaque
# KeyError('Test_MAE') from sort_values.
if not results:
    raise RuntimeError("No model was fitted successfully; see the errors printed above.")

# One row per model, best (lowest) test MAE first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="Test_MAE")

# Display the summary (the per-combination CV scores and the fitted estimator are omitted).
print(results_df[["Model", "Mean_CV_MAE", "Test_MAE", "Test_MSE", "Test_R2","Best_Params"]])

   Model  Mean_CV_MAE  Test_MAE     Test_MSE   Test_R2     Best_Params
0  Lasso    51.941255  49.49284  4218.253551  0.888173  {'alpha': 1.0}


In [7]:
# Extract the top-performing model: the first row of results_df.
top_model_row = results_df.iloc[0]
top_model_name = top_model_row["Model"]
top_model = top_model_row["Best_Model"]

# Only models exposing coef_ and intercept_ (e.g. LinearRegression, Lasso, Ridge,
# ElasticNet, HuberRegressor) can be written out as an explicit equation.
if hasattr(top_model, "coef_") and hasattr(top_model, "intercept_"):
    coefficients = top_model.coef_  # Coefficients of the features
    intercept = top_model.intercept_  # Intercept of the equation

    # Feature names, in the column order the model was trained on
    feature_names = X_train.columns

    # One "coef * feature" term per feature, skipping zero coefficients
    terms = [f"{coef:.4f} * {feature}" for coef, feature in zip(coefficients, feature_names) if coef != 0]

    # The intercept is appended as a final term only when it is non-zero
    equation_parts = list(terms)
    if intercept != 0:
        equation_parts.append(f"{intercept:.4f}")

    # If every coefficient and the intercept are zero there is nothing to join;
    # print an explicit zero instead of a dangling "y = ".
    if equation_parts:
        equation = "y = " + " + ".join(equation_parts)
    else:
        equation = "y = 0.0000"

    # Print the best model name and equation
    print(f"Best Model: {top_model_name}")
    print("Best Equation:")
    print(equation)

else:
    print(f"The best model '{top_model_name}' does not provide coefficients. It may not be a linear model.")

Best Model: Lasso
Best Equation:
y = -0.1629 * Number_of_Riders + 0.4952 * Number_of_Drivers + 0.0186 * Number_of_Past_Rides + 3.5039 * Expected_Ride_Duration + -0.9645 * Location_Category_Urban + -1.3161 * Time_of_Booking_Evening + 38.8146 * Vehicle_Type_Premium


In [8]:
# Read coefficients and feature names straight from the fitted model and the
# training frame instead of the `coefficients` / `feature_names` globals: those
# names are rebound to different objects by the training-size cell further down,
# which would break this cell when it is re-run afterwards.
model_coefficients = top_model.coef_
model_feature_names = list(X_train.columns)

# Features the model kept (non-zero coefficient)
print("\nUseful Features (Non-zero Coefficients):")
non_zero_coefficients = [
    (coef, feature)
    for coef, feature in zip(model_coefficients, model_feature_names)
    if coef != 0
]
for coef, feature in non_zero_coefficients:
    print(f"\t{feature}: {coef:.4f}")

# Features the model dropped (coefficient exactly zero)
print("\nNot Useful Features (Zero Coefficients):")
zero_coefficients = [
    (coef, feature)
    for coef, feature in zip(model_coefficients, model_feature_names)
    if coef == 0
]
for _, feature in zero_coefficients:
    print(f"\t{feature}")

# Summary
print("\nSummary:")
print(f"\tTotal Features: {len(model_feature_names)}")
print(f"\tUseful Features: {len(non_zero_coefficients)}")
print(f"\tNot Useful Features: {len(zero_coefficients)}")


Useful Features (Non-zero Coefficients):
	Number_of_Riders: -0.1629
	Number_of_Drivers: 0.4952
	Number_of_Past_Rides: 0.0186
	Expected_Ride_Duration: 3.5039
	Location_Category_Urban: -0.9645
	Time_of_Booking_Evening: -1.3161
	Vehicle_Type_Premium: 38.8146

Not Useful Features (Zero Coefficients):
	Average_Ratings
	Location_Category_Suburban
	Customer_Loyalty_Status_Regular
	Customer_Loyalty_Status_Silver
	Time_of_Booking_Morning
	Time_of_Booking_Night

Summary:
	Total Features: 13
	Useful Features: 7
	Not Useful Features: 6


In [9]:
def _build_line_figure(x_values, traces, title, yaxis_title, legend_title):
    """Return a plotly figure with one lines+markers trace per entry in `traces`.

    Parameters
    ----------
    x_values : sequence shared by every trace as its x data.
    traces : iterable of (name, y_values, color) tuples; pass color=None to
        let plotly choose the line color.
    title, yaxis_title, legend_title : layout labels. The x axis is always
        labelled 'Training Size'.
    """
    fig = go.Figure()
    for trace_name, y_values, color in traces:
        fig.add_trace(go.Scatter(
            x=x_values,
            y=y_values,
            mode='lines+markers',
            name=trace_name,
            line=dict(color=color) if color is not None else None
        ))
    fig.update_layout(
        title=title,
        xaxis_title='Training Size',
        yaxis_title=yaxis_title,
        legend_title=legend_title
    )
    return fig


# Ten training-set sizes from 50 rows up to the full training split
train_sizes = np.linspace(50, len(X_train), 10, dtype=int)
mae_scores = []
r2_scores = []
coefficients = []  # one coef_ vector per training size

# Feature names from the DataFrame
feature_names = X.columns.tolist()

# Re-create the tuned estimator's full configuration (alpha, fit_intercept, ...).
# The previous `Lasso(*dict(results_df["Best_Params"]))` unpacked the Series'
# index labels, i.e. it called Lasso(0): alpha=0 with the default intercept,
# not the model selected by the grid search.
# Assumes the top-ranked row of results_df holds a Lasso estimator.
best_lasso = results_df.iloc[0]["Best_Model"]
lasso_params = best_lasso.get_params()

# Dynamic updating loop: refit on a growing prefix of the training split and
# redraw the three figures after every fit.
for train_size in train_sizes:
    # Subsample training data (positional slicing for both X and y)
    X_train_sub = X_train.iloc[:train_size]
    y_train_sub = y_train.iloc[:train_size]

    # Train a fresh Lasso regressor with the tuned hyperparameters
    lasso = Lasso(**lasso_params)
    lasso.fit(X_train_sub, y_train_sub)

    # Predict on the fixed test split and record the metrics
    y_pred = lasso.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae_scores.append(mae)
    r2_scores.append(r2)
    coefficients.append(lasso.coef_)

    # Clear previous output so the new figures replace the old ones
    clear_output(wait=True)

    # Training sizes evaluated so far (x axis of every figure)
    sizes_so_far = train_sizes[:len(mae_scores)]

    fig_mae = _build_line_figure(
        sizes_so_far,
        [('MAE', mae_scores, 'blue')],
        title='Dynamic Effect of Training Size on MAE',
        yaxis_title='Mean Absolute Error (MAE)',
        legend_title='Metric'
    )

    fig_r2 = _build_line_figure(
        sizes_so_far,
        [('R² Score', r2_scores, 'green')],
        title='Dynamic Effect of Training Size on R²',
        yaxis_title='R² Score',
        legend_title='Metric'
    )

    # Rows = training sizes so far, columns = features
    coefficients_array = np.array(coefficients)
    fig_coeff = _build_line_figure(
        sizes_so_far,
        [(feature_name, coefficients_array[:, i], None) for i, feature_name in enumerate(feature_names)],
        title='Dynamic Effect of Training Size on Coefficients',
        yaxis_title='Coefficient Values',
        legend_title='Feature Names'
    )

    # Display updated figures
    fig_mae.show()
    fig_r2.show()
    fig_coeff.show()

    # Pause between frames so each update is visible
    time.sleep(1)