In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the CSV produced by the previous step into a DataFrame.
# Point `file_path` at the real location if the file is not in the
# notebook's working directory.
file_path = 'bookings_corrected_datatypes.csv'
data_corrected = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Work on an independent (deep) copy so the frame loaded above stays
# untouched by the capping steps that follow.
data_cleaned = data_corrected.copy(deep=True)

In [None]:
# Step 1: Inspect numeric columns
# These are the columns summarised here and again after outlier handling.
numeric_columns = [
    'V_TAT',
    'C_TAT',
    'Booking_Value',
    'Ride_Distance',
    'Driver_Ratings',
    'Customer_Rating',
]
print("Summary Statistics Before Handling Outliers:")
display(data_cleaned.loc[:, numeric_columns].describe())

In [None]:
# Step 2: Handle outliers for numeric columns
#
# Two capping rules are applied, column by column:
#   * rating columns are clipped to the closed range [0, 5];
#   * every other column in `numeric_columns` is clipped to its own 5th and
#     95th percentiles, i.e. values outside that band are replaced by the
#     nearest cap.
#
# Earlier revisions also computed Q1/Q3/IQR fences here but never applied
# them; that dead computation has been removed so the code matches the rule
# that is actually in effect (percentile capping).
for col in numeric_columns:
    if col in ('Driver_Ratings', 'Customer_Rating'):
        # Cap ratings between 0 and 5
        data_cleaned[col] = data_cleaned[col].clip(lower=0, upper=5)
        print(f"\nRatings capped between 0 and 5 for {col}")
    else:
        # Both caps are taken from the column *before* it is modified.
        lower_cap, upper_cap = data_cleaned[col].quantile([0.05, 0.95])
        data_cleaned[col] = data_cleaned[col].clip(lower=lower_cap, upper=upper_cap)
        print(f"\nOutliers capped for {col}:")
        print(f"Lower cap (5th percentile): {lower_cap:.2f}")
        print(f"Upper cap (95th percentile): {upper_cap:.2f}")

In [None]:
# Step 3: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver if they exist and are numeric
#
# A column is capped at its 5th/95th percentiles when it holds numeric data.
# The check uses pandas' dtype helpers rather than comparing against
# 'float64' only, so integer, float32 and nullable numeric columns are no
# longer misreported as strings and skipped. Boolean columns are still
# skipped, as before. Columns that are absent from the frame are ignored
# silently.
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col not in data_cleaned.columns:
        continue

    col_dtype = data_cleaned[col].dtype
    is_cappable = (
        pd.api.types.is_numeric_dtype(col_dtype)
        and not pd.api.types.is_bool_dtype(col_dtype)
    )
    if not is_cappable:
        print(f"\nSkipping {col}: Contains non-numeric data (string).")
        continue

    # Both caps are taken from the column *before* it is modified.
    lower_cap, upper_cap = data_cleaned[col].quantile([0.05, 0.95])
    data_cleaned[col] = data_cleaned[col].clip(lower=lower_cap, upper=upper_cap)
    print(f"\nOutliers capped for {col}:")
    print(f"Lower cap (5th percentile): {lower_cap:.2f}")
    print(f"Upper cap (95th percentile): {upper_cap:.2f}")

In [None]:
# Step 4: Validate outliers
# Re-display the same summary as Step 1 so the effect of capping can be
# compared side by side.
print("\nSummary Statistics After Handling Outliers:")
display(data_cleaned.loc[:, numeric_columns].describe())

In [None]:
# Step 5: Save the dataset with handled outliers
# The row index is not written to the file.
data_cleaned.to_csv(path_or_buf='bookings_outliers_handled.csv', index=False)
print("Dataset with handled outliers saved as 'bookings_outliers_handled.csv'")