In [1]:
#step 1: installing all the recommended library
!pip install pandas numpy matplotlib seaborn openpyxl

Defaulting to user installation because normal site-packages is not writeable


In [3]:
# Step 2: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Step 3: Load the dataset (July sheet from Bookings.xlsx)
file_path = 'Bookings.xlsx'
data = pd.read_excel(file_path, sheet_name='July', engine='openpyxl')

In [5]:
# Step 4: Inspect the dataset
# Display the first 5 rows
print("First 5 rows of the dataset:")
display(data.head())

# Display basic information about the dataset (data types, non-null counts)
print("\nDataset Info:")
data.info()

# Display summary statistics for numeric columns
print("\nSummary Statistics:")
display(data.describe())

# Check for missing values
print("\nMissing Values:")
display(data.isnull().sum())

# Check for duplicate rows
print("\nNumber of Duplicate Rows:")
display(data.duplicated().sum())

First 5 rows of the dataset:


Unnamed: 0,Date,Time,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver,Incomplete_Rides,Incomplete_Rides_Reason,Booking_Value,Payment_Method,Ride_Distance,Driver_Ratings,Customer_Rating,Vehicle Images
0,2024-07-26 14:00:00,14:00:00,CNR7153255142,Canceled by Driver,CID713523,Prime Sedan,Tumkur Road,RT Nagar,,,,Personal & Car related issue,,,444,,0,,,https://cdn-icons-png.flaticon.com/128/14183/1...
1,2024-07-25 22:20:00,22:20:00,CNR2940424040,Success,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,,,No,,158,Cash,13,4.1,4.0,https://cdn-icons-png.flaticon.com/128/9983/99...
2,2024-07-30 19:59:00,19:59:00,CNR2982357879,Success,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,,,No,,386,UPI,40,4.2,4.8,https://cdn-icons-png.flaticon.com/128/9983/99...
3,2024-07-22 03:15:00,03:15:00,CNR2395710036,Canceled by Customer,CID581320,eBike,HSR Layout,Vijayanagar,,,Driver is not moving towards pickup location,,,,384,,0,,,https://cdn-icons-png.flaticon.com/128/6839/68...
4,2024-07-02 09:02:00,09:02:00,CNR1797421769,Success,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,,,No,,822,Credit Card,45,4.0,3.0,https://cdn-icons-png.flaticon.com/128/3202/32...



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103024 entries, 0 to 103023
Data columns (total 20 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   Date                        103024 non-null  datetime64[ns]
 1   Time                        103024 non-null  object        
 2   Booking_ID                  103024 non-null  object        
 3   Booking_Status              103024 non-null  object        
 4   Customer_ID                 103024 non-null  object        
 5   Vehicle_Type                103024 non-null  object        
 6   Pickup_Location             103024 non-null  object        
 7   Drop_Location               103024 non-null  object        
 8   V_TAT                       63967 non-null   float64       
 9   C_TAT                       63967 non-null   float64       
 10  Canceled_Rides_by_Customer  10499 non-null   object        
 11  Canceled_Rides_by_Driver

Unnamed: 0,Date,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating
count,103024,63967.0,63967.0,103024.0,103024.0,63967.0,63967.0
mean,2024-07-16 11:31:38.879678720,170.876952,84.873372,548.751883,14.189927,3.997457,3.998313
min,2024-07-01 00:00:00,35.0,25.0,100.0,0.0,3.0,3.0
25%,2024-07-08 18:41:00,98.0,55.0,242.0,0.0,3.5,3.5
50%,2024-07-16 11:23:00,168.0,85.0,386.0,8.0,4.0,4.0
75%,2024-07-24 05:18:00,238.0,115.0,621.0,26.0,4.5,4.5
max,2024-07-31 23:58:00,308.0,145.0,2999.0,49.0,5.0,5.0
std,,80.80364,36.0051,536.541221,15.77627,0.576834,0.578957



Missing Values:


Date                              0
Time                              0
Booking_ID                        0
Booking_Status                    0
Customer_ID                       0
Vehicle_Type                      0
Pickup_Location                   0
Drop_Location                     0
V_TAT                         39057
C_TAT                         39057
Canceled_Rides_by_Customer    92525
Canceled_Rides_by_Driver      84590
Incomplete_Rides              39057
Incomplete_Rides_Reason       99098
Booking_Value                     0
Payment_Method                39057
Ride_Distance                     0
Driver_Ratings                39057
Customer_Rating               39057
Vehicle Images                    0
dtype: int64


Number of Duplicate Rows:


np.int64(0)

In [6]:
# Create a copy of the dataset to preserve the original
data_cleaned = data.copy()

In [7]:
# Step 5: Handle missing values for numeric columns
# For non-successful rides (Canceled by Customer, Canceled by Driver, Driver Not Found), impute with 0
non_successful_status = ['Canceled by Customer', 'Canceled by Driver', 'Driver Not Found']
numeric_columns = ['V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating']

for col in numeric_columns:
    # Impute 0 for non-successful rides
    data_cleaned.loc[data_cleaned['Booking_Status'].isin(non_successful_status), col] = data_cleaned.loc[data_cleaned['Booking_Status'].isin(non_successful_status), col].fillna(0)
    # For successful rides, impute with median (if any missing values remain)
    if data_cleaned[col].isnull().sum() > 0:
        median_value = data_cleaned.loc[data_cleaned['Booking_Status'] == 'Success', col].median()
        data_cleaned[col].fillna(median_value, inplace=True)

In [8]:
# Step 6: Handle missing values for categorical columns
# Impute 'None' for Payment_Method in non-successful rides
data_cleaned.loc[data_cleaned['Booking_Status'].isin(non_successful_status), 'Payment_Method'] = data_cleaned.loc[data_cleaned['Booking_Status'].isin(non_successful_status), 'Payment_Method'].fillna('None')

# Impute 'None' for Incomplete_Rides_Reason in successful rides
data_cleaned.loc[data_cleaned['Booking_Status'] == 'Success', 'Incomplete_Rides_Reason'] = data_cleaned.loc[data_cleaned['Booking_Status'] == 'Success', 'Incomplete_Rides_Reason'].fillna('None')

In [9]:
# Step 7: Handle Incomplete_Rides (boolean-like)
# Impute 'No' for successful rides, 'Yes' for non-successful rides with a reason
data_cleaned.loc[data_cleaned['Booking_Status'] == 'Success', 'Incomplete_Rides'] = data_cleaned.loc[data_cleaned['Booking_Status'] == 'Success', 'Incomplete_Rides'].fillna('No')
data_cleaned.loc[data_cleaned['Incomplete_Rides_Reason'].notnull(), 'Incomplete_Rides'] = data_cleaned.loc[data_cleaned['Incomplete_Rides_Reason'].notnull(), 'Incomplete_Rides'].fillna('Yes')

In [10]:
# Step 8: Drop redundant columns (if entirely null or not needed)
# Check if Canceled_Rides_by_Customer and Canceled_Rides_by_Driver are entirely null
if data_cleaned['Canceled_Rides_by_Customer'].isnull().all() and data_cleaned['Canceled_Rides_by_Driver'].isnull().all():
    data_cleaned.drop(columns=['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver'], inplace=True)

In [11]:
# Step 9: Validate missing values
print("Missing Values After Handling:")
display(data_cleaned.isnull().sum())

Missing Values After Handling:


Date                              0
Time                              0
Booking_ID                        0
Booking_Status                    0
Customer_ID                       0
Vehicle_Type                      0
Pickup_Location                   0
Drop_Location                     0
V_TAT                             0
C_TAT                             0
Canceled_Rides_by_Customer    92525
Canceled_Rides_by_Driver      84590
Incomplete_Rides              39057
Incomplete_Rides_Reason       39057
Booking_Value                     0
Payment_Method                    0
Ride_Distance                     0
Driver_Ratings                    0
Customer_Rating                   0
Vehicle Images                    0
dtype: int64

In [12]:
# Step 10: Save the cleaned dataset
data_cleaned.to_csv('cleaned_bookings.csv', index=False)
print("Cleaned dataset saved as 'cleaned_bookings.csv'")

Cleaned dataset saved as 'cleaned_bookings.csv'


In [13]:
# Load the cleaned dataset from the previous step
# Replace 'cleaned_bookings.csv' with the actual file path if not in the same directory
file_path = 'cleaned_bookings.csv'
data_cleaned = pd.read_csv(file_path)

# Create a copy of the dataset to preserve the loaded data
data_no_duplicates = data_cleaned.copy()

In [14]:
# Step 11: Check for duplicates based on all columns
print("Number of Duplicate Rows (All Columns):")
display(data_no_duplicates.duplicated().sum())

Number of Duplicate Rows (All Columns):


np.int64(0)

In [15]:
# Step 12: Check for duplicates based on Booking_ID (should be unique)
print("\nNumber of Duplicate Booking_IDs:")
display(data_no_duplicates.duplicated(subset=['Booking_ID']).sum())


Number of Duplicate Booking_IDs:


np.int64(0)

In [17]:
# Step 13: Remove duplicates based on all columns, keeping the first occurrence
data_no_duplicates.drop_duplicates(keep='first', inplace=True)

In [18]:
# Step 14: Validate duplicates after removal
print("\nNumber of Duplicate Rows After Removal (All Columns):")
display(data_no_duplicates.duplicated().sum())

print("\nNumber of Duplicate Booking_IDs After Removal:")
display(data_no_duplicates.duplicated(subset=['Booking_ID']).sum())


Number of Duplicate Rows After Removal (All Columns):


np.int64(0)


Number of Duplicate Booking_IDs After Removal:


np.int64(0)

In [19]:
# Step 15: Display dataset size before and after
print("\nDataset Size Before Duplicate Removal:", len(data_cleaned))
print("Dataset Size After Duplicate Removal:", len(data_no_duplicates))


Dataset Size Before Duplicate Removal: 103024
Dataset Size After Duplicate Removal: 103024


In [20]:
# Step 16: Save the dataset without duplicates
data_no_duplicates.to_csv('bookings_no_duplicates.csv', index=False)
print("Dataset without duplicates saved as 'bookings_no_duplicates.csv'")

Dataset without duplicates saved as 'bookings_no_duplicates.csv'


In [21]:
# Load the dataset from the previous step
# Replace 'bookings_no_duplicates.csv' with the actual file path if not in the same directory
file_path = 'bookings_no_duplicates.csv'
data_no_duplicates = pd.read_csv(file_path)

# Create a copy of the dataset to preserve the loaded data
data_corrected = data_no_duplicates.copy()

In [27]:
# Step 17: Inspect current data types
print("Current Data Types:")
display(data_corrected.dtypes)
print("\nSample Values for All Columns:")
display(data_corrected.head())
print("\nSample Values for Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:")
display(data_corrected[['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']].head(10))

Current Data Types:


Booking_ID                    string[python]
Booking_Status                      category
Customer_ID                   string[python]
Vehicle_Type                        category
Pickup_Location               string[python]
Drop_Location                 string[python]
V_TAT                                float64
C_TAT                                float64
Canceled_Rides_by_Customer            object
Canceled_Rides_by_Driver              object
Incomplete_Rides                    category
Incomplete_Rides_Reason       string[python]
Booking_Value                        float64
Payment_Method                      category
Ride_Distance                        float64
Driver_Ratings                       float64
Customer_Rating                      float64
Vehicle Images                string[python]
Datetime                      datetime64[ns]
dtype: object


Sample Values for All Columns:


Unnamed: 0,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver,Incomplete_Rides,Incomplete_Rides_Reason,Booking_Value,Payment_Method,Ride_Distance,Driver_Ratings,Customer_Rating,Vehicle Images,Datetime
0,CNR7153255142,Canceled by Driver,CID713523,Prime Sedan,Tumkur Road,RT Nagar,0.0,0.0,,Personal & Car related issue,,,444.0,,0.0,0.0,0.0,https://cdn-icons-png.flaticon.com/128/14183/1...,2024-07-26 14:00:00
1,CNR2940424040,Success,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,,,No,,158.0,Cash,13.0,4.1,4.0,https://cdn-icons-png.flaticon.com/128/9983/99...,2024-07-25 22:20:00
2,CNR2982357879,Success,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,,,No,,386.0,UPI,40.0,4.2,4.8,https://cdn-icons-png.flaticon.com/128/9983/99...,2024-07-30 19:59:00
3,CNR2395710036,Canceled by Customer,CID581320,eBike,HSR Layout,Vijayanagar,0.0,0.0,Driver is not moving towards pickup location,,,,384.0,,0.0,0.0,0.0,https://cdn-icons-png.flaticon.com/128/6839/68...,2024-07-22 03:15:00
4,CNR1797421769,Success,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,,,No,,822.0,Credit Card,45.0,4.0,3.0,https://cdn-icons-png.flaticon.com/128/3202/32...,2024-07-02 09:02:00



Sample Values for Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:


Unnamed: 0,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver
0,,Personal & Car related issue
1,,
2,,
3,Driver is not moving towards pickup location,
4,,
5,,
6,,
7,,Personal & Car related issue
8,,
9,,Personal & Car related issue


In [29]:
# Step 18: Handle Date and Time columns
if 'Datetime' in data_corrected.columns:
    # If Datetime exists, ensure it's datetime64
    data_corrected['Datetime'] = pd.to_datetime(data_corrected['Datetime'], errors='coerce')
    print("\nDatetime column already exists and is converted to datetime64.")
elif 'Date' in data_corrected.columns and 'Time' in data_corrected.columns:
    # If Date and Time exist, convert to Datetime
    if data_corrected['Date'].dtype == 'object' or data_corrected['Date'].dtype == 'datetime64[ns]':
        data_corrected['Datetime'] = pd.to_datetime(data_corrected['Date'], errors='coerce')
    else:
        # Fallback for Excel numeric format
        data_corrected['Datetime'] = pd.to_datetime(data_corrected['Date'], unit='d', origin='1899-12-30') + pd.to_timedelta(data_corrected['Time'], unit='d')
    data_corrected.drop(columns=['Date', 'Time'], inplace=True)
    print("\nDate and Time columns combined into Datetime.")
else:
    print("\nWarning: Neither 'Datetime' nor 'Date'/'Time' columns found. Please check the dataset.")


Datetime column already exists and is converted to datetime64.


In [30]:
# Step 19: Convert columns to appropriate data types
# String columns
string_columns = ['Booking_ID', 'Customer_ID', 'Pickup_Location', 'Drop_Location', 'Incomplete_Rides_Reason', 'Vehicle Images']
for col in string_columns:
    if col in data_corrected.columns:
        data_corrected[col] = data_corrected[col].astype('string')

# Categorical columns
categorical_columns = ['Booking_Status', 'Vehicle_Type', 'Payment_Method', 'Incomplete_Rides']
for col in categorical_columns:
    if col in data_corrected.columns:
        data_corrected[col] = data_corrected[col].astype('category')

# Numeric columns
numeric_columns = ['V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating']
for col in numeric_columns:
    if col in data_corrected.columns:
        data_corrected[col] = pd.to_numeric(data_corrected[col], errors='coerce').astype('float64')

# Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver if they exist
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col in data_corrected.columns:
        # Check if the column contains non-numeric data
        if data_corrected[col].dtype == 'object' and data_corrected[col].str.contains(r'[a-zA-Z]', na=False).any():
            print(f"\nWarning: {col} contains text data. Converting to string instead of float.")
            data_corrected[col] = data_corrected[col].astype('string')
        else:
            data_corrected[col] = pd.to_numeric(data_corrected[col], errors='coerce').astype('float64')





In [31]:
# Step 19: Validate data types
print("\nData Types After Correction:")
display(data_corrected.dtypes)


Data Types After Correction:


Booking_ID                    string[python]
Booking_Status                      category
Customer_ID                   string[python]
Vehicle_Type                        category
Pickup_Location               string[python]
Drop_Location                 string[python]
V_TAT                                float64
C_TAT                                float64
Canceled_Rides_by_Customer    string[python]
Canceled_Rides_by_Driver      string[python]
Incomplete_Rides                    category
Incomplete_Rides_Reason       string[python]
Booking_Value                        float64
Payment_Method                      category
Ride_Distance                        float64
Driver_Ratings                       float64
Customer_Rating                      float64
Vehicle Images                string[python]
Datetime                      datetime64[ns]
dtype: object

In [32]:
# Step 21: Save the dataset with corrected data types
data_corrected.to_csv('bookings_corrected_datatypes.csv', index=False)
print("Dataset with corrected data types saved as 'bookings_corrected_datatypes.csv'")

Dataset with corrected data types saved as 'bookings_corrected_datatypes.csv'


In [33]:
# Load the dataset from the previous step
# Replace 'bookings_corrected_datatypes.csv' with the actual file path if not in the same directory
file_path = 'bookings_corrected_datatypes.csv'
data_corrected = pd.read_csv(file_path)

In [34]:
# Create a copy of the dataset to preserve the loaded data
data_cleaned = data_corrected.copy()

In [35]:
# Step 22: Inspect numeric columns
numeric_columns = ['V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating']
print("Summary Statistics Before Handling Outliers:")
display(data_cleaned[numeric_columns].describe())

Summary Statistics Before Handling Outliers:


Unnamed: 0,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,106.096502,52.697381,548.751883,14.189927,2.481997,2.482529
std,104.532203,50.00509,536.541221,15.77627,1.991983,1.99277
min,0.0,0.0,100.0,0.0,0.0,0.0
25%,0.0,0.0,242.0,0.0,0.0,0.0
50%,84.0,45.0,386.0,8.0,3.4,3.4
75%,196.0,95.0,621.0,26.0,4.2,4.2
max,308.0,145.0,2999.0,49.0,5.0,5.0


In [36]:
# Step 23: Handle outliers for numeric columns
for col in numeric_columns:
    if col in ['Driver_Ratings', 'Customer_Rating']:
        # Cap ratings between 0 and 5
        data_cleaned[col] = data_cleaned[col].clip(lower=0, upper=5)
        print(f"\nRatings capped between 0 and 5 for {col}")
    else:
        # Calculate Q1, Q3, and IQR for other numeric columns
        Q1 = data_cleaned[col].quantile(0.25)
        Q3 = data_cleaned[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Cap outliers at 5th and 95th percentiles
        lower_cap = data_cleaned[col].quantile(0.05)
        upper_cap = data_cleaned[col].quantile(0.95)
        data_cleaned[col] = data_cleaned[col].clip(lower=lower_cap, upper=upper_cap)
        print(f"\nOutliers capped for {col}:")
        print(f"Lower cap (5th percentile): {lower_cap:.2f}")
        print(f"Upper cap (95th percentile): {upper_cap:.2f}")


Outliers capped for V_TAT:
Lower cap (5th percentile): 0.00
Upper cap (95th percentile): 287.00

Outliers capped for C_TAT:
Lower cap (5th percentile): 0.00
Upper cap (95th percentile): 135.00

Outliers capped for Booking_Value:
Lower cap (5th percentile): 128.00
Upper cap (95th percentile): 1899.00

Outliers capped for Ride_Distance:
Lower cap (5th percentile): 0.00
Upper cap (95th percentile): 45.00

Ratings capped between 0 and 5 for Driver_Ratings

Ratings capped between 0 and 5 for Customer_Rating


In [37]:
# Step 24: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver if they exist and are numeric
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col in data_cleaned.columns:
        if data_cleaned[col].dtype == 'float64':
            Q1 = data_cleaned[col].quantile(0.25)
            Q3 = data_cleaned[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_cap = data_cleaned[col].quantile(0.05)
            upper_cap = data_cleaned[col].quantile(0.95)
            data_cleaned[col] = data_cleaned[col].clip(lower=lower_cap, upper=upper_cap)
            print(f"\nOutliers capped for {col}:")
            print(f"Lower cap (5th percentile): {lower_cap:.2f}")
            print(f"Upper cap (95th percentile): {upper_cap:.2f}")
        else:
            print(f"\nSkipping {col}: Contains non-numeric data (string).")


Skipping Canceled_Rides_by_Customer: Contains non-numeric data (string).

Skipping Canceled_Rides_by_Driver: Contains non-numeric data (string).


In [38]:
# Step 25: Validate outliers
print("\nSummary Statistics After Handling Outliers:")
display(data_cleaned[numeric_columns].describe())


Summary Statistics After Handling Outliers:


Unnamed: 0,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,105.441984,52.323876,521.891093,14.083777,2.481997,2.482529
std,103.33921,49.353533,437.960673,15.557015,1.991983,1.99277
min,0.0,0.0,128.0,0.0,0.0,0.0
25%,0.0,0.0,242.0,0.0,0.0,0.0
50%,84.0,45.0,386.0,8.0,3.4,3.4
75%,196.0,95.0,621.0,26.0,4.2,4.2
max,287.0,135.0,1899.0,45.0,5.0,5.0


In [39]:
# Step 26: Save the dataset with handled outliers
data_cleaned.to_csv('bookings_outliers_handled.csv', index=False)
print("Dataset with handled outliers saved as 'bookings_outliers_handled.csv'")

Dataset with handled outliers saved as 'bookings_outliers_handled.csv'


In [41]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl (10.7 MB)
   ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
   - -------------------------------------- 0.5/10.7 MB 3.6 MB/s eta 0:00:03
   ---- ----------------------------------- 1.3/10.7 MB 3.6 MB/s eta 0:00:03
   ------- -------------------------------- 2.1/10.7 MB 3.6 MB/s eta 0:00:03
   ---------- ----------------------------- 2.9/10.7 MB 3.5 MB/s eta 0:00:03
   ------------- -------------------

In [42]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the dataset from the previous step
# Replace 'bookings_outliers_handled.csv' with the actual file path if not in the same directory
file_path = 'bookings_outliers_handled.csv'
data_cleaned = pd.read_csv(file_path)

# Create a copy of the dataset to preserve the loaded data
data_processed = data_cleaned.copy()

In [43]:
# Step 27: Inspect numeric columns
numeric_columns = ['V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating']
print("Summary Statistics Before Standardization/Normalization:")
display(data_processed[numeric_columns].describe())

Summary Statistics Before Standardization/Normalization:


Unnamed: 0,V_TAT,C_TAT,Booking_Value,Ride_Distance,Driver_Ratings,Customer_Rating
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,105.441984,52.323876,521.891093,14.083777,2.481997,2.482529
std,103.33921,49.353533,437.960673,15.557015,1.991983,1.99277
min,0.0,0.0,128.0,0.0,0.0,0.0
25%,0.0,0.0,242.0,0.0,0.0,0.0
50%,84.0,45.0,386.0,8.0,3.4,3.4
75%,196.0,95.0,621.0,26.0,4.2,4.2
max,287.0,135.0,1899.0,45.0,5.0,5.0


In [44]:
# Step 28: Initialize scalers
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

In [45]:
# Step 29: Standardize and normalize numeric columns
for col in numeric_columns:
    # Standardize: Create new column with _std suffix
    data_processed[f'{col}_std'] = standard_scaler.fit_transform(data_processed[[col]])
    # Normalize: Create new column with _norm suffix
    data_processed[f'{col}_norm'] = minmax_scaler.fit_transform(data_processed[[col]])

In [46]:
# Step 30: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver if they exist and are numeric
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col in data_processed.columns and data_processed[col].dtype == 'float64':
        data_processed[f'{col}_std'] = standard_scaler.fit_transform(data_processed[[col]])
        data_processed[f'{col}_norm'] = minmax_scaler.fit_transform(data_processed[[col]])
        print(f"\nStandardized and normalized {col}")
    elif col in data_processed.columns:
        print(f"\nSkipping {col}: Contains non-numeric data (string).")


Skipping Canceled_Rides_by_Customer: Contains non-numeric data (string).

Skipping Canceled_Rides_by_Driver: Contains non-numeric data (string).


In [49]:
# Step 31: Validate transformations
print("\nSummary Statistics for Standardized Columns:")
std_columns = [f'{col}_std' for col in numeric_columns]
display(data_processed[std_columns].describe())

print("\nSummary Statistics for Normalized Columns:")
norm_columns = [f'{col}_norm' for col in numeric_columns]
display(data_processed[norm_columns].describe())


Summary Statistics for Standardized Columns:


Unnamed: 0,V_TAT_std,C_TAT_std,Booking_Value_std,Ride_Distance_std,Driver_Ratings_std,Customer_Rating_std
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,-6.910660000000001e-17,-1.834566e-17,-5.903717000000001e-17,-1.006942e-17,3.0828990000000003e-17,6.627888e-17
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005
min,-1.020353,-1.06019,-0.8993799,-0.9053051,-1.245999,-1.245774
25%,-1.020353,-1.06019,-0.6390812,-0.9053051,-1.245999,-1.245774
50%,-0.2074923,-0.1483969,-0.310283,-0.3910651,0.4608509,0.460402
75%,0.8763223,0.8647067,0.2262975,0.7659748,0.8624627,0.8618552
max,1.756922,1.67519,3.144382,1.987295,1.264075,1.263308



Summary Statistics for Normalized Columns:


Unnamed: 0,V_TAT_norm,C_TAT_norm,Booking_Value_norm,Ride_Distance_norm,Driver_Ratings_norm,Customer_Rating_norm
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,0.367394,0.387584,0.222412,0.312973,0.496399,0.496506
std,0.360067,0.365582,0.247296,0.345711,0.398397,0.398554
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.06437,0.0,0.0,0.0
50%,0.292683,0.333333,0.14568,0.177778,0.68,0.68
75%,0.682927,0.703704,0.278374,0.577778,0.84,0.84
max,1.0,1.0,1.0,1.0,1.0,1.0


In [50]:
# Step 32: Save the dataset with standardized and normalized columns
data_processed.to_csv('bookings_standardized_normalized.csv', index=False)
print("Dataset with standardized and normalized columns saved as 'bookings_standardized_normalized.csv'")

Dataset with standardized and normalized columns saved as 'bookings_standardized_normalized.csv'


In [51]:
from sklearn.preprocessing import LabelEncoder

# Load the dataset from the previous step
# Replace 'bookings_standardized_normalized.csv' with the actual file path if not in the same directory
file_path = 'bookings_standardized_normalized.csv'
data_processed = pd.read_csv(file_path)

# Create a copy of the dataset to preserve the loaded data
data_fe = data_processed.copy()

In [52]:
# Step 33: Temporal Features from Datetime
# Ensure Datetime is in datetime64 format
data_fe['Datetime'] = pd.to_datetime(data_fe['Datetime'], errors='coerce')

# Extract Hour, Day of Week, and Is_Weekend
data_fe['Hour'] = data_fe['Datetime'].dt.hour
data_fe['Day_of_Week'] = data_fe['Datetime'].dt.day_name()
data_fe['Is_Weekend'] = data_fe['Datetime'].dt.dayofweek >= 5  # Saturday (5) or Sunday (6)

# Create Time_of_Day (Morning, Afternoon, Evening, Night)
bins = [0, 6, 12, 18, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening']
data_fe['Time_of_Day'] = pd.cut(data_fe['Hour'], bins=bins, labels=labels, right=False, include_lowest=True)

# Convert Day_of_Week and Time_of_Day to category
data_fe['Day_of_Week'] = data_fe['Day_of_Week'].astype('category')
data_fe['Time_of_Day'] = data_fe['Time_of_Day'].astype('category')

In [53]:
# Step 34: Location-Based Features
# Create Route by combining Pickup_Location and Drop_Location
data_fe['Route'] = data_fe['Pickup_Location'] + ' to ' + data_fe['Drop_Location']
data_fe['Route'] = data_fe['Route'].astype('string')

# Create Is_Same_Location (True if Pickup_Location equals Drop_Location)
data_fe['Is_Same_Location'] = data_fe['Pickup_Location'] == data_fe['Drop_Location']
data_fe['Is_Same_Location'] = data_fe['Is_Same_Location'].astype('bool')

In [54]:
# Step 35: Ride Metrics
# Calculate Value_per_Distance (handle division by zero)
data_fe['Value_per_Distance'] = data_fe['Booking_Value'] / data_fe['Ride_Distance'].replace(0, np.nan)
data_fe['Value_per_Distance'] = data_fe['Value_per_Distance'].fillna(0).astype('float64')

# Create High_Value_Ride (True if Booking_Value is above 75th percentile)
booking_value_threshold = data_fe['Booking_Value'].quantile(0.75)
data_fe['High_Value_Ride'] = data_fe['Booking_Value'] > booking_value_threshold
data_fe['High_Value_Ride'] = data_fe['High_Value_Ride'].astype('bool')

In [55]:
# Step 36: Categorical Encoding
# One-hot encode Booking_Status, Vehicle_Type, and Payment_Method
categorical_columns = ['Booking_Status', 'Vehicle_Type', 'Payment_Method']
for col in categorical_columns:
    if col in data_fe.columns:
        dummies = pd.get_dummies(data_fe[col], prefix=col, drop_first=True)
        data_fe = pd.concat([data_fe, dummies], axis=1)

# Label encode Incomplete_Rides_Reason (optional, as it’s text-heavy)
if 'Incomplete_Rides_Reason' in data_fe.columns:
    le = LabelEncoder()
    data_fe['Incomplete_Rides_Reason_Encoded'] = le.fit_transform(data_fe['Incomplete_Rides_Reason'].astype(str))
    data_fe['Incomplete_Rides_Reason_Encoded'] = data_fe['Incomplete_Rides_Reason_Encoded'].astype('int32')

In [56]:
# Step 37: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col in data_fe.columns:
        if data_fe[col].dtype == 'string':
            print(f"\nSkipping {col}: Contains non-numeric data (string).")
        elif data_fe[col].dtype == 'float64':
            # Create binary feature for cancellations (True if value > 0)
            data_fe[f'{col}_Binary'] = data_fe[col] > 0
            data_fe[f'{col}_Binary'] = data_fe[f'{col}_Binary'].astype('bool')
            print(f"\nCreated binary feature for {col}")

In [57]:
# Step 38: Validate new features
print("\nNew Columns Added:")
display(data_fe.columns.tolist())
print("\nSample Data with New Features:")
display(data_fe[['Datetime', 'Hour', 'Day_of_Week', 'Is_Weekend', 'Time_of_Day', 
                 'Route', 'Is_Same_Location', 'Value_per_Distance', 'High_Value_Ride']].head())

print("\nData Types of New Features:")
display(data_fe[['Hour', 'Day_of_Week', 'Is_Weekend', 'Time_of_Day', 
                 'Route', 'Is_Same_Location', 'Value_per_Distance', 'High_Value_Ride']].dtypes)


New Columns Added:


['Booking_ID',
 'Booking_Status',
 'Customer_ID',
 'Vehicle_Type',
 'Pickup_Location',
 'Drop_Location',
 'V_TAT',
 'C_TAT',
 'Canceled_Rides_by_Customer',
 'Canceled_Rides_by_Driver',
 'Incomplete_Rides',
 'Incomplete_Rides_Reason',
 'Booking_Value',
 'Payment_Method',
 'Ride_Distance',
 'Driver_Ratings',
 'Customer_Rating',
 'Vehicle Images',
 'Datetime',
 'V_TAT_std',
 'V_TAT_norm',
 'C_TAT_std',
 'C_TAT_norm',
 'Booking_Value_std',
 'Booking_Value_norm',
 'Ride_Distance_std',
 'Ride_Distance_norm',
 'Driver_Ratings_std',
 'Driver_Ratings_norm',
 'Customer_Rating_std',
 'Customer_Rating_norm',
 'Hour',
 'Day_of_Week',
 'Is_Weekend',
 'Time_of_Day',
 'Route',
 'Is_Same_Location',
 'Value_per_Distance',
 'High_Value_Ride',
 'Booking_Status_Canceled by Driver',
 'Booking_Status_Driver Not Found',
 'Booking_Status_Success',
 'Vehicle_Type_Bike',
 'Vehicle_Type_Mini',
 'Vehicle_Type_Prime Plus',
 'Vehicle_Type_Prime SUV',
 'Vehicle_Type_Prime Sedan',
 'Vehicle_Type_eBike',
 'Payment_Meth


Sample Data with New Features:


Unnamed: 0,Datetime,Hour,Day_of_Week,Is_Weekend,Time_of_Day,Route,Is_Same_Location,Value_per_Distance,High_Value_Ride
0,2024-07-26 14:00:00,14,Friday,False,Afternoon,Tumkur Road to RT Nagar,False,0.0,False
1,2024-07-25 22:20:00,22,Thursday,False,Evening,Magadi Road to Varthur,False,12.153846,False
2,2024-07-30 19:59:00,19,Tuesday,False,Evening,Sahakar Nagar to Varthur,False,9.65,False
3,2024-07-22 03:15:00,3,Monday,False,Night,HSR Layout to Vijayanagar,False,0.0,False
4,2024-07-02 09:02:00,9,Tuesday,False,Morning,Rajajinagar to Chamarajpet,False,18.266667,True



Data Types of New Features:


Hour                           int32
Day_of_Week                 category
Is_Weekend                      bool
Time_of_Day                 category
Route                 string[python]
Is_Same_Location                bool
Value_per_Distance           float64
High_Value_Ride                 bool
dtype: object

In [58]:
# Step 39: Save the dataset with new features
data_fe.to_csv('bookings_feature_engineered.csv', index=False)
print("Dataset with feature engineering saved as 'bookings_feature_engineered.csv'")

Dataset with feature engineering saved as 'bookings_feature_engineered.csv'


In [59]:
# Load the dataset from the previous step
# Replace 'bookings_feature_engineered.csv' with the actual file path if not in the same directory
file_path = 'bookings_feature_engineered.csv'
data_fe = pd.read_csv(file_path)

# Create a copy of the dataset to preserve the loaded data
data_validated = data_fe.copy()

In [63]:
# Step 40: Convert Datetime to datetime64[ns]
if 'Datetime' in data_validated.columns:
    data_validated['Datetime'] = pd.to_datetime(data_validated['Datetime'], errors='coerce')
    print("\nConverted Datetime to datetime64[ns].")
else:
    print("\nWarning: Datetime column not found.")


Converted Datetime to datetime64[ns].


In [64]:
# Step 41: Inspect columns and data types
print("\nCurrent Columns:")
display(data_validated.columns.tolist())
print("\nCurrent Data Types:")
display(data_validated.dtypes)
print("\nSample Data:")
display(data_validated.head())


Current Columns:


['Booking_ID',
 'Booking_Status',
 'Customer_ID',
 'Vehicle_Type',
 'Pickup_Location',
 'Drop_Location',
 'V_TAT',
 'C_TAT',
 'Canceled_Rides_by_Customer',
 'Canceled_Rides_by_Driver',
 'Incomplete_Rides',
 'Incomplete_Rides_Reason',
 'Booking_Value',
 'Payment_Method',
 'Ride_Distance',
 'Driver_Ratings',
 'Customer_Rating',
 'Vehicle Images',
 'Datetime',
 'V_TAT_std',
 'V_TAT_norm',
 'C_TAT_std',
 'C_TAT_norm',
 'Booking_Value_std',
 'Booking_Value_norm',
 'Ride_Distance_std',
 'Ride_Distance_norm',
 'Driver_Ratings_std',
 'Driver_Ratings_norm',
 'Customer_Rating_std',
 'Customer_Rating_norm',
 'Hour',
 'Day_of_Week',
 'Is_Weekend',
 'Time_of_Day',
 'Route',
 'Is_Same_Location',
 'Value_per_Distance',
 'High_Value_Ride',
 'Booking_Status_Canceled by Driver',
 'Booking_Status_Driver Not Found',
 'Booking_Status_Success',
 'Vehicle_Type_Bike',
 'Vehicle_Type_Mini',
 'Vehicle_Type_Prime Plus',
 'Vehicle_Type_Prime SUV',
 'Vehicle_Type_Prime Sedan',
 'Vehicle_Type_eBike',
 'Payment_Meth


Current Data Types:


Booking_ID                                   object
Booking_Status                               object
Customer_ID                                  object
Vehicle_Type                                 object
Pickup_Location                              object
Drop_Location                                object
V_TAT                                       float64
C_TAT                                       float64
Canceled_Rides_by_Customer                   object
Canceled_Rides_by_Driver                     object
Incomplete_Rides                             object
Incomplete_Rides_Reason                      object
Booking_Value                               float64
Payment_Method                               object
Ride_Distance                               float64
Driver_Ratings                              float64
Customer_Rating                             float64
Vehicle Images                               object
Datetime                             datetime64[ns]
V_TAT_std   


Sample Data:


Unnamed: 0,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver,...,Vehicle_Type_Bike,Vehicle_Type_Mini,Vehicle_Type_Prime Plus,Vehicle_Type_Prime SUV,Vehicle_Type_Prime Sedan,Vehicle_Type_eBike,Payment_Method_Credit Card,Payment_Method_Debit Card,Payment_Method_UPI,Incomplete_Rides_Reason_Encoded
0,CNR7153255142,Canceled by Driver,CID713523,Prime Sedan,Tumkur Road,RT Nagar,0.0,0.0,,Personal & Car related issue,...,False,False,False,False,True,False,False,False,False,3
1,CNR2940424040,Success,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,,,...,True,False,False,False,False,False,False,False,False,3
2,CNR2982357879,Success,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,,,...,False,False,False,True,False,False,False,False,True,3
3,CNR2395710036,Canceled by Customer,CID581320,eBike,HSR Layout,Vijayanagar,0.0,0.0,Driver is not moving towards pickup location,,...,False,False,False,False,False,True,False,False,False,3
4,CNR1797421769,Success,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,,,...,False,True,False,False,False,False,True,False,False,3


In [77]:
# Step 42: Inspect and fix Incomplete_Rides and Incomplete_Rides_Reason
print("\nInspecting Incomplete_Rides and Incomplete_Rides_Reason:")
if 'Incomplete_Rides' in data_validated.columns and 'Incomplete_Rides_Reason' in data_validated.columns:
    # Display unique values in Incomplete_Rides_Reason to identify invalid entries
    print("Unique values in Incomplete_Rides_Reason:")
    display(data_validated['Incomplete_Rides_Reason'].value_counts(dropna=False))
    
    # Identify problematic rows
    problematic_rows = data_validated[
        (data_validated['Incomplete_Rides'] == 'Yes') & 
        (data_validated['Incomplete_Rides_Reason'].isna() | 
         data_validated['Incomplete_Rides_Reason'].isin(['None', '']) | 
         data_validated['Incomplete_Rides_Reason'].str.strip().eq(''))
    ]
    print(f"Rows with Incomplete_Rides = 'Yes' and invalid Incomplete_Rides_Reason:")
    display(problematic_rows[['Incomplete_Rides', 'Incomplete_Rides_Reason']])
    
    # Impute 'Unknown' for invalid reasons
    data_validated.loc[
        (data_validated['Incomplete_Rides'] == 'Yes') & 
        (data_validated['Incomplete_Rides_Reason'].isna() | 
         data_validated['Incomplete_Rides_Reason'].isin(['None', '']) | 
         data_validated['Incomplete_Rides_Reason'].str.strip().eq('')),
        'Incomplete_Rides_Reason'
    ] = 'Unknown'
    print("Imputed 'Unknown' for invalid Incomplete_Rides_Reason where Incomplete_Rides = 'Yes'.")



Inspecting Incomplete_Rides and Incomplete_Rides_Reason:
Unique values in Incomplete_Rides_Reason:


Incomplete_Rides_Reason
NaN                  99098
Customer Demand       1601
Vehicle Breakdown     1591
Other Issue            734
Name: count, dtype: int64

Rows with Incomplete_Rides = 'Yes' and invalid Incomplete_Rides_Reason:


Unnamed: 0,Incomplete_Rides,Incomplete_Rides_Reason


Imputed 'Unknown' for invalid Incomplete_Rides_Reason where Incomplete_Rides = 'Yes'.


In [85]:
# Step 43: Inspect and clean Canceled_Rides_by_Customer and Canceled_Rides_by_Driver
print("\nInspecting Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:")
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col in data_validated.columns:
        print(f"\nUnique values in {col}:")
        display(data_validated[col].value_counts(dropna=False).head())
        # Check if column can be converted to numeric
        try:
            data_validated[col] = pd.to_numeric(data_validated[col], errors='coerce')
            if data_validated[col].isna().any():
                print(f"\nWarning: {col} contains non-numeric values. Moving text to Incomplete_Rides_Reason and imputing 1 for non-numeric, 0 for NaN.")
                # Move text to Incomplete_Rides_Reason if not already set
                text_mask = data_validated[col].isna() & ~data_validated[col].isna()
                if 'Incomplete_Rides_Reason' in data_validated.columns:
                    data_validated.loc[text_mask & data_validated['Incomplete_Rides_Reason'].isin(['None', None, '']), 'Incomplete_Rides_Reason'] = data_validated[col]
                # Impute 1 for non-numeric, 0 for NaN
                data_validated[col] = data_validated[col].isna().astype(float)
            else:
                print(f"\n{col} is numeric.")
        except:
            print(f"\n{col} contains non-numeric data. Skipping numeric conversion.")



Inspecting Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:

Unique values in Canceled_Rides_by_Customer:


Canceled_Rides_by_Customer
NaN                                             92525
Driver is not moving towards pickup location     3175
Driver asked to cancel                           2670
Change of plans                                  2081
AC is Not working                                1568
Name: count, dtype: int64



Unique values in Canceled_Rides_by_Driver:


Canceled_Rides_by_Driver
NaN                                    84590
Personal & Car related issue            6542
Customer related issue                  5413
Customer was coughing/sick              3654
More than permitted people in there     2825
Name: count, dtype: int64




In [86]:
# Step 44: Check for missing values
print("\nMissing Values Check:")
missing_values = data_validated.isnull().sum()
display(missing_values[missing_values > 0])
if missing_values.sum() == 0:
    print("No missing values found.")


Missing Values Check:


Incomplete_Rides           39057
Incomplete_Rides_Reason    99098
Payment_Method             39057
dtype: int64

In [87]:
# Step 45: Check for duplicates
print("\nDuplicate Rows Check:")
print(f"Number of duplicate rows: {data_validated.duplicated().sum()}")
print(f"Number of duplicate Booking_IDs: {data_validated.duplicated(subset=['Booking_ID']).sum()}")


Duplicate Rows Check:
Number of duplicate rows: 0
Number of duplicate Booking_IDs: 0


In [88]:
# Step 46:Validate data types
expected_dtypes = {
    'Datetime': 'datetime64[ns]',
    'Booking_ID': 'string',
    'Booking_Status': 'category',
    'Customer_ID': 'string',
    'Vehicle_Type': 'category',
    'Pickup_Location': 'string',
    'Drop_Location': 'string',
    'V_TAT': 'float64',
    'C_TAT': 'float64',
    'Booking_Value': 'float64',
    'Payment_Method': 'category',
    'Ride_Distance': 'float64',
    'Driver_Ratings': 'float64',
    'Customer_Rating': 'float64',
    'Incomplete_Rides': 'category',
    'Incomplete_Rides_Reason': 'string',
    'Vehicle Images': 'string',
    'Canceled_Rides_by_Customer': 'float64',
    'Canceled_Rides_by_Driver': 'float64',
    'V_TAT_std': 'float64', 'V_TAT_norm': 'float64',
    'C_TAT_std': 'float64', 'C_TAT_norm': 'float64',
    'Booking_Value_std': 'float64', 'Booking_Value_norm': 'float64',
    'Ride_Distance_std': 'float64', 'Ride_Distance_norm': 'float64',
    'Driver_Ratings_std': 'float64', 'Driver_Ratings_norm': 'float64',
    'Customer_Rating_std': 'float64', 'Customer_Rating_norm': 'float64',
    'Hour': 'int32',
    'Day_of_Week': 'category',
    'Is_Weekend': 'bool',
    'Time_of_Day': 'category',
    'Route': 'string',
    'Is_Same_Location': 'bool',
    'Value_per_Distance': 'float64',
    'High_Value_Ride': 'bool',
    'Incomplete_Rides_Reason_Encoded': 'int32'
}
print("\nData Types Validation:")
for col, dtype in expected_dtypes.items():
    if col in data_validated.columns:
        actual_dtype = str(data_validated[col].dtype)
        if actual_dtype != dtype:
            print(f"Warning: {col} has type {actual_dtype}, expected {dtype}")
        else:
            print(f"{col}: Type {actual_dtype} matches expected {dtype}")
    else:
        print(f"Warning: {col} not found in dataset.")



Data Types Validation:
Datetime: Type datetime64[ns] matches expected datetime64[ns]
V_TAT: Type float64 matches expected float64
C_TAT: Type float64 matches expected float64
Booking_Value: Type float64 matches expected float64
Ride_Distance: Type float64 matches expected float64
Driver_Ratings: Type float64 matches expected float64
Customer_Rating: Type float64 matches expected float64
Canceled_Rides_by_Customer: Type float64 matches expected float64
Canceled_Rides_by_Driver: Type float64 matches expected float64
V_TAT_std: Type float64 matches expected float64
V_TAT_norm: Type float64 matches expected float64
C_TAT_std: Type float64 matches expected float64
C_TAT_norm: Type float64 matches expected float64
Booking_Value_std: Type float64 matches expected float64
Booking_Value_norm: Type float64 matches expected float64
Ride_Distance_std: Type float64 matches expected float64
Ride_Distance_norm: Type float64 matches expected float64
Driver_Ratings_std: Type float64 matches expected f

In [89]:
# Step 47: Validate numeric ranges
numeric_columns = ['V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating']
print("\nNumeric Ranges Check:")
for col in numeric_columns:
    if col in data_validated.columns:
        min_val = data_validated[col].min()
        max_val = data_validated[col].max()
        print(f"{col}: Min = {min_val:.2f}, Max = {max_val:.2f}")
        if col in ['Driver_Ratings', 'Customer_Rating']:
            assert 0 <= min_val <= max_val <= 5, f"{col} out of range (0–5)"
        else:
            assert min_val >= 0, f"{col} has negative values"

# Validate standardized and normalized columns
std_columns = [f'{col}_std' for col in numeric_columns if f'{col}_std' in data_validated.columns]
norm_columns = [f'{col}_norm' for col in numeric_columns if f'{col}_norm' in data_validated.columns]
print("\nStandardized Columns Check (Mean ≈ 0, Std ≈ 1):")
display(data_validated[std_columns].describe())
print("\nNormalized Columns Check (Range [0, 1]):")
display(data_validated[norm_columns].describe())


Numeric Ranges Check:
V_TAT: Min = 0.00, Max = 287.00
C_TAT: Min = 0.00, Max = 135.00
Booking_Value: Min = 128.00, Max = 1899.00
Ride_Distance: Min = 0.00, Max = 45.00
Driver_Ratings: Min = 0.00, Max = 5.00
Customer_Rating: Min = 0.00, Max = 5.00

Standardized Columns Check (Mean ≈ 0, Std ≈ 1):


Unnamed: 0,V_TAT_std,C_TAT_std,Booking_Value_std,Ride_Distance_std,Driver_Ratings_std,Customer_Rating_std
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,-6.400292000000001e-17,-1.5173110000000003e-17,-5.1312680000000006e-17,-3.351877e-17,3.400155e-17,6.455467e-17
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005
min,-1.020353,-1.06019,-0.8993799,-0.9053051,-1.245999,-1.245774
25%,-1.020353,-1.06019,-0.6390812,-0.9053051,-1.245999,-1.245774
50%,-0.2074923,-0.1483969,-0.310283,-0.3910651,0.4608509,0.460402
75%,0.8763223,0.8647067,0.2262975,0.7659748,0.8624627,0.8618552
max,1.756922,1.67519,3.144382,1.987295,1.264075,1.263308



Normalized Columns Check (Range [0, 1]):


Unnamed: 0,V_TAT_norm,C_TAT_norm,Booking_Value_norm,Ride_Distance_norm,Driver_Ratings_norm,Customer_Rating_norm
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,0.367394,0.387584,0.222412,0.312973,0.496399,0.496506
std,0.360067,0.365582,0.247296,0.345711,0.398397,0.398554
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.06437,0.0,0.0,0.0
50%,0.292683,0.333333,0.14568,0.177778,0.68,0.68
75%,0.682927,0.703704,0.278374,0.577778,0.84,0.84
max,1.0,1.0,1.0,1.0,1.0,1.0


In [90]:
# Step 48: Validate engineered features
print("\nEngineered Features Validation:")
# Check Hour (0–23)
if 'Hour' in data_validated.columns:
    assert data_validated['Hour'].between(0, 23).all(), "Hour values out of range (0–23)"

# Check Is_Weekend aligns with Day_of_Week
if 'Is_Weekend' in data_validated.columns and 'Day_of_Week' in data_validated.columns:
    weekend_days = ['Saturday', 'Sunday']
    assert (data_validated[data_validated['Is_Weekend']]['Day_of_Week'].isin(weekend_days)).all(), "Is_Weekend mismatch with Day_of_Week"

# Check Time_of_Day aligns with Hour
if 'Time_of_Day' in data_validated.columns and 'Hour' in data_validated.columns:
    time_check = (
        ((data_validated['Hour'] < 6) & (data_validated['Time_of_Day'] == 'Night')) |
        ((data_validated['Hour'].between(6, 11)) & (data_validated['Time_of_Day'] == 'Morning')) |
        ((data_validated['Hour'].between(12, 17)) & (data_validated['Time_of_Day'] == 'Afternoon')) |
        ((data_validated['Hour'].between(18, 23)) & (data_validated['Time_of_Day'] == 'Evening'))
    )
    assert time_check.all(), "Time_of_Day mismatch with Hour"

# Check Route matches Pickup_Location and Drop_Location
if 'Route' in data_validated.columns and 'Pickup_Location' in data_validated.columns and 'Drop_Location' in data_validated.columns:
    route_check = (data_validated['Route'] == data_validated['Pickup_Location'] + ' to ' + data_validated['Drop_Location'])
    assert route_check.all(), "Route does not match Pickup_Location and Drop_Location"

# Check Is_Same_Location
if 'Is_Same_Location' in data_validated.columns:
    same_location_check = (data_validated['Is_Same_Location'] == (data_validated['Pickup_Location'] == data_validated['Drop_Location']))
    assert same_location_check.all(), "Is_Same_Location mismatch"

# Check Value_per_Distance
if 'Value_per_Distance' in data_validated.columns:
    assert data_validated['Value_per_Distance'].min() >= 0, "Value_per_Distance has negative values"

# Check High_Value_Ride
if 'High_Value_Ride' in data_validated.columns and 'Booking_Value' in data_validated.columns:
    threshold = data_validated['Booking_Value'].quantile(0.75)
    assert (data_validated['High_Value_Ride'] == (data_validated['Booking_Value'] > threshold)).all(), "High_Value_Ride mismatch"


Engineered Features Validation:


In [91]:
# Step 49: Logical consistency checks
print("\nLogical Consistency Checks:")
# Modified check for Incomplete_Rides and Incomplete_Rides_Reason
if 'Incomplete_Rides' in data_validated.columns and 'Incomplete_Rides_Reason' in data_validated.columns:
    problematic_rows_after = data_validated[
        (data_validated['Incomplete_Rides'] == 'Yes') & 
        (data_validated['Incomplete_Rides_Reason'].isna() | 
         data_validated['Incomplete_Rides_Reason'].isin(['None', '']) | 
         data_validated['Incomplete_Rides_Reason'].str.strip().eq(''))
    ]
    if not problematic_rows_after.empty:
        print("Warning: Some rows still have Incomplete_Rides = 'Yes' with invalid Incomplete_Rides_Reason:")
        display(problematic_rows_after[['Incomplete_Rides', 'Incomplete_Rides_Reason']])
    else:
        print("Incomplete_Rides and Incomplete_Rides_Reason are consistent.")

# Success rides should generally have non-zero Booking_Value and Ride_Distance
if 'Booking_Status' in data_validated.columns:
    success_rides = data_validated[data_validated['Booking_Status'] == 'Success']
    value_check = (success_rides['Booking_Value'] > 0).all()
    distance_check = (success_rides['Ride_Distance'] > 0).all()
    print(f"Success rides with non-zero Booking_Value: {value_check}")
    print(f"Success rides with non-zero Ride_Distance: {distance_check}")




Logical Consistency Checks:
Incomplete_Rides and Incomplete_Rides_Reason are consistent.
Success rides with non-zero Booking_Value: True
Success rides with non-zero Ride_Distance: True


In [92]:
# Step 50: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col in data_validated.columns:
        if data_validated[col].dtype in ['string', 'object'] or data_validated[col].isna().any():
            print(f"\n{col}: Contains text or mixed data. Sample values:")
            display(data_validated[col].value_counts(dropna=False).head())
        else:
            print(f"\n{col}: Numeric. Range:")
            print(f"Min = {data_validated[col].min():.2f}, Max = {data_validated[col].max():.2f}")


Canceled_Rides_by_Customer: Numeric. Range:
Min = 1.00, Max = 1.00

Canceled_Rides_by_Driver: Numeric. Range:
Min = 1.00, Max = 1.00


In [93]:
# Step 51: Save the validated dataset
data_validated.to_csv('bookings_validated.csv', index=False)
print("\nDataset validated and saved as 'bookings_validated.csv'")


Dataset validated and saved as 'bookings_validated.csv'
