In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset from the previous step
# Replace 'bookings_feature_engineered.csv' with the actual file path if not in the same directory
file_path = 'bookings_feature_engineered.csv'
data_fe = pd.read_csv(file_path)

# Create a copy of the dataset to preserve the loaded data
data_validated = data_fe.copy()

In [3]:
# Step 1: Convert Datetime to datetime64[ns]
if 'Datetime' in data_validated.columns:
    data_validated['Datetime'] = pd.to_datetime(data_validated['Datetime'], errors='coerce')
    print("\nConverted Datetime to datetime64[ns].")
else:
    print("\nWarning: Datetime column not found.")


Converted Datetime to datetime64[ns].


In [4]:
# Step 2: Inspect columns and data types
print("\nCurrent Columns:")
display(data_validated.columns.tolist())
print("\nCurrent Data Types:")
display(data_validated.dtypes)
print("\nSample Data:")
display(data_validated.head())


Current Columns:


['Booking_ID',
 'Booking_Status',
 'Customer_ID',
 'Vehicle_Type',
 'Pickup_Location',
 'Drop_Location',
 'V_TAT',
 'C_TAT',
 'Canceled_Rides_by_Customer',
 'Canceled_Rides_by_Driver',
 'Incomplete_Rides',
 'Incomplete_Rides_Reason',
 'Booking_Value',
 'Payment_Method',
 'Ride_Distance',
 'Driver_Ratings',
 'Customer_Rating',
 'Vehicle Images',
 'Datetime',
 'V_TAT_std',
 'V_TAT_norm',
 'C_TAT_std',
 'C_TAT_norm',
 'Booking_Value_std',
 'Booking_Value_norm',
 'Ride_Distance_std',
 'Ride_Distance_norm',
 'Driver_Ratings_std',
 'Driver_Ratings_norm',
 'Customer_Rating_std',
 'Customer_Rating_norm',
 'Hour',
 'Day_of_Week',
 'Is_Weekend',
 'Time_of_Day',
 'Route',
 'Is_Same_Location',
 'Value_per_Distance',
 'High_Value_Ride',
 'Booking_Status_Canceled by Driver',
 'Booking_Status_Driver Not Found',
 'Booking_Status_Success',
 'Vehicle_Type_Bike',
 'Vehicle_Type_Mini',
 'Vehicle_Type_Prime Plus',
 'Vehicle_Type_Prime SUV',
 'Vehicle_Type_Prime Sedan',
 'Vehicle_Type_eBike',
 'Payment_Meth


Current Data Types:


Booking_ID                                   object
Booking_Status                               object
Customer_ID                                  object
Vehicle_Type                                 object
Pickup_Location                              object
Drop_Location                                object
V_TAT                                       float64
C_TAT                                       float64
Canceled_Rides_by_Customer                   object
Canceled_Rides_by_Driver                     object
Incomplete_Rides                             object
Incomplete_Rides_Reason                      object
Booking_Value                               float64
Payment_Method                               object
Ride_Distance                               float64
Driver_Ratings                              float64
Customer_Rating                             float64
Vehicle Images                               object
Datetime                             datetime64[ns]
V_TAT_std   


Sample Data:


Unnamed: 0,Booking_ID,Booking_Status,Customer_ID,Vehicle_Type,Pickup_Location,Drop_Location,V_TAT,C_TAT,Canceled_Rides_by_Customer,Canceled_Rides_by_Driver,...,Vehicle_Type_Bike,Vehicle_Type_Mini,Vehicle_Type_Prime Plus,Vehicle_Type_Prime SUV,Vehicle_Type_Prime Sedan,Vehicle_Type_eBike,Payment_Method_Credit Card,Payment_Method_Debit Card,Payment_Method_UPI,Incomplete_Rides_Reason_Encoded
0,CNR7153255142,Canceled by Driver,CID713523,Prime Sedan,Tumkur Road,RT Nagar,0.0,0.0,,Personal & Car related issue,...,False,False,False,False,True,False,False,False,False,3
1,CNR2940424040,Success,CID225428,Bike,Magadi Road,Varthur,203.0,30.0,,,...,True,False,False,False,False,False,False,False,False,3
2,CNR2982357879,Success,CID270156,Prime SUV,Sahakar Nagar,Varthur,238.0,130.0,,,...,False,False,False,True,False,False,False,False,True,3
3,CNR2395710036,Canceled by Customer,CID581320,eBike,HSR Layout,Vijayanagar,0.0,0.0,Driver is not moving towards pickup location,,...,False,False,False,False,False,True,False,False,False,3
4,CNR1797421769,Success,CID939555,Mini,Rajajinagar,Chamarajpet,252.0,80.0,,,...,False,True,False,False,False,False,True,False,False,3


In [5]:
# Step 3: Inspect and fix Incomplete_Rides and Incomplete_Rides_Reason
print("\nInspecting Incomplete_Rides and Incomplete_Rides_Reason:")
if 'Incomplete_Rides' in data_validated.columns and 'Incomplete_Rides_Reason' in data_validated.columns:
    # Display unique values in Incomplete_Rides_Reason to identify invalid entries
    print("Unique values in Incomplete_Rides_Reason:")
    display(data_validated['Incomplete_Rides_Reason'].value_counts(dropna=False))
    
    # Identify problematic rows
    problematic_rows = data_validated[
        (data_validated['Incomplete_Rides'] == 'Yes') & 
        (data_validated['Incomplete_Rides_Reason'].isna() | 
         data_validated['Incomplete_Rides_Reason'].isin(['None', '']) | 
         data_validated['Incomplete_Rides_Reason'].str.strip().eq(''))
    ]
    print(f"Rows with Incomplete_Rides = 'Yes' and invalid Incomplete_Rides_Reason:")
    display(problematic_rows[['Incomplete_Rides', 'Incomplete_Rides_Reason']])
    
    # Impute 'Unknown' for invalid reasons
    data_validated.loc[
        (data_validated['Incomplete_Rides'] == 'Yes') & 
        (data_validated['Incomplete_Rides_Reason'].isna() | 
         data_validated['Incomplete_Rides_Reason'].isin(['None', '']) | 
         data_validated['Incomplete_Rides_Reason'].str.strip().eq('')),
        'Incomplete_Rides_Reason'
    ] = 'Unknown'
    print("Imputed 'Unknown' for invalid Incomplete_Rides_Reason where Incomplete_Rides = 'Yes'.")



Inspecting Incomplete_Rides and Incomplete_Rides_Reason:
Unique values in Incomplete_Rides_Reason:


Incomplete_Rides_Reason
NaN                  99098
Customer Demand       1601
Vehicle Breakdown     1591
Other Issue            734
Name: count, dtype: int64

Rows with Incomplete_Rides = 'Yes' and invalid Incomplete_Rides_Reason:


Unnamed: 0,Incomplete_Rides,Incomplete_Rides_Reason


Imputed 'Unknown' for invalid Incomplete_Rides_Reason where Incomplete_Rides = 'Yes'.


In [6]:
# Step 4: Inspect and clean Canceled_Rides_by_Customer and Canceled_Rides_by_Driver
print("\nInspecting Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:")
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col in data_validated.columns:
        print(f"\nUnique values in {col}:")
        display(data_validated[col].value_counts(dropna=False).head())
        # Check if column can be converted to numeric
        try:
            data_validated[col] = pd.to_numeric(data_validated[col], errors='coerce')
            if data_validated[col].isna().any():
                print(f"\nWarning: {col} contains non-numeric values. Moving text to Incomplete_Rides_Reason and imputing 1 for non-numeric, 0 for NaN.")
                # Move text to Incomplete_Rides_Reason if not already set
                text_mask = data_validated[col].isna() & ~data_validated[col].isna()
                if 'Incomplete_Rides_Reason' in data_validated.columns:
                    data_validated.loc[text_mask & data_validated['Incomplete_Rides_Reason'].isin(['None', None, '']), 'Incomplete_Rides_Reason'] = data_validated[col]
                # Impute 1 for non-numeric, 0 for NaN
                data_validated[col] = data_validated[col].isna().astype(float)
            else:
                print(f"\n{col} is numeric.")
        except:
            print(f"\n{col} contains non-numeric data. Skipping numeric conversion.")



Inspecting Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:

Unique values in Canceled_Rides_by_Customer:


Canceled_Rides_by_Customer
NaN                                             92525
Driver is not moving towards pickup location     3175
Driver asked to cancel                           2670
Change of plans                                  2081
AC is Not working                                1568
Name: count, dtype: int64



Unique values in Canceled_Rides_by_Driver:


Canceled_Rides_by_Driver
NaN                                    84590
Personal & Car related issue            6542
Customer related issue                  5413
Customer was coughing/sick              3654
More than permitted people in there     2825
Name: count, dtype: int64




In [7]:
# Step 5: Check for missing values
print("\nMissing Values Check:")
missing_values = data_validated.isnull().sum()
display(missing_values[missing_values > 0])
if missing_values.sum() == 0:
    print("No missing values found.")


Missing Values Check:


Incomplete_Rides           39057
Incomplete_Rides_Reason    99098
Payment_Method             39057
dtype: int64

In [8]:
# Step 6: Check for duplicates
print("\nDuplicate Rows Check:")
print(f"Number of duplicate rows: {data_validated.duplicated().sum()}")
print(f"Number of duplicate Booking_IDs: {data_validated.duplicated(subset=['Booking_ID']).sum()}")


Duplicate Rows Check:
Number of duplicate rows: 0
Number of duplicate Booking_IDs: 0


In [9]:
# Step 7:Validate data types
expected_dtypes = {
    'Datetime': 'datetime64[ns]',
    'Booking_ID': 'string',
    'Booking_Status': 'category',
    'Customer_ID': 'string',
    'Vehicle_Type': 'category',
    'Pickup_Location': 'string',
    'Drop_Location': 'string',
    'V_TAT': 'float64',
    'C_TAT': 'float64',
    'Booking_Value': 'float64',
    'Payment_Method': 'category',
    'Ride_Distance': 'float64',
    'Driver_Ratings': 'float64',
    'Customer_Rating': 'float64',
    'Incomplete_Rides': 'category',
    'Incomplete_Rides_Reason': 'string',
    'Vehicle Images': 'string',
    'Canceled_Rides_by_Customer': 'float64',
    'Canceled_Rides_by_Driver': 'float64',
    'V_TAT_std': 'float64', 'V_TAT_norm': 'float64',
    'C_TAT_std': 'float64', 'C_TAT_norm': 'float64',
    'Booking_Value_std': 'float64', 'Booking_Value_norm': 'float64',
    'Ride_Distance_std': 'float64', 'Ride_Distance_norm': 'float64',
    'Driver_Ratings_std': 'float64', 'Driver_Ratings_norm': 'float64',
    'Customer_Rating_std': 'float64', 'Customer_Rating_norm': 'float64',
    'Hour': 'int32',
    'Day_of_Week': 'category',
    'Is_Weekend': 'bool',
    'Time_of_Day': 'category',
    'Route': 'string',
    'Is_Same_Location': 'bool',
    'Value_per_Distance': 'float64',
    'High_Value_Ride': 'bool',
    'Incomplete_Rides_Reason_Encoded': 'int32'
}
print("\nData Types Validation:")
for col, dtype in expected_dtypes.items():
    if col in data_validated.columns:
        actual_dtype = str(data_validated[col].dtype)
        if actual_dtype != dtype:
            print(f"Warning: {col} has type {actual_dtype}, expected {dtype}")
        else:
            print(f"{col}: Type {actual_dtype} matches expected {dtype}")
    else:
        print(f"Warning: {col} not found in dataset.")



Data Types Validation:
Datetime: Type datetime64[ns] matches expected datetime64[ns]
V_TAT: Type float64 matches expected float64
C_TAT: Type float64 matches expected float64
Booking_Value: Type float64 matches expected float64
Ride_Distance: Type float64 matches expected float64
Driver_Ratings: Type float64 matches expected float64
Customer_Rating: Type float64 matches expected float64
Canceled_Rides_by_Customer: Type float64 matches expected float64
Canceled_Rides_by_Driver: Type float64 matches expected float64
V_TAT_std: Type float64 matches expected float64
V_TAT_norm: Type float64 matches expected float64
C_TAT_std: Type float64 matches expected float64
C_TAT_norm: Type float64 matches expected float64
Booking_Value_std: Type float64 matches expected float64
Booking_Value_norm: Type float64 matches expected float64
Ride_Distance_std: Type float64 matches expected float64
Ride_Distance_norm: Type float64 matches expected float64
Driver_Ratings_std: Type float64 matches expected f

In [10]:
# Step 8: Validate numeric ranges
numeric_columns = ['V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating']
print("\nNumeric Ranges Check:")
for col in numeric_columns:
    if col in data_validated.columns:
        min_val = data_validated[col].min()
        max_val = data_validated[col].max()
        print(f"{col}: Min = {min_val:.2f}, Max = {max_val:.2f}")
        if col in ['Driver_Ratings', 'Customer_Rating']:
            assert 0 <= min_val <= max_val <= 5, f"{col} out of range (0–5)"
        else:
            assert min_val >= 0, f"{col} has negative values"

# Validate standardized and normalized columns
std_columns = [f'{col}_std' for col in numeric_columns if f'{col}_std' in data_validated.columns]
norm_columns = [f'{col}_norm' for col in numeric_columns if f'{col}_norm' in data_validated.columns]
print("\nStandardized Columns Check (Mean ≈ 0, Std ≈ 1):")
display(data_validated[std_columns].describe())
print("\nNormalized Columns Check (Range [0, 1]):")
display(data_validated[norm_columns].describe())


Numeric Ranges Check:
V_TAT: Min = 0.00, Max = 287.00
C_TAT: Min = 0.00, Max = 135.00
Booking_Value: Min = 128.00, Max = 1899.00
Ride_Distance: Min = 0.00, Max = 45.00
Driver_Ratings: Min = 0.00, Max = 5.00
Customer_Rating: Min = 0.00, Max = 5.00

Standardized Columns Check (Mean ≈ 0, Std ≈ 1):


Unnamed: 0,V_TAT_std,C_TAT_std,Booking_Value_std,Ride_Distance_std,Driver_Ratings_std,Customer_Rating_std
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,-6.400292000000001e-17,-1.5173110000000003e-17,-5.1312680000000006e-17,-3.351877e-17,3.400155e-17,6.455467e-17
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005
min,-1.020353,-1.06019,-0.8993799,-0.9053051,-1.245999,-1.245774
25%,-1.020353,-1.06019,-0.6390812,-0.9053051,-1.245999,-1.245774
50%,-0.2074923,-0.1483969,-0.310283,-0.3910651,0.4608509,0.460402
75%,0.8763223,0.8647067,0.2262975,0.7659748,0.8624627,0.8618552
max,1.756922,1.67519,3.144382,1.987295,1.264075,1.263308



Normalized Columns Check (Range [0, 1]):


Unnamed: 0,V_TAT_norm,C_TAT_norm,Booking_Value_norm,Ride_Distance_norm,Driver_Ratings_norm,Customer_Rating_norm
count,103024.0,103024.0,103024.0,103024.0,103024.0,103024.0
mean,0.367394,0.387584,0.222412,0.312973,0.496399,0.496506
std,0.360067,0.365582,0.247296,0.345711,0.398397,0.398554
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.06437,0.0,0.0,0.0
50%,0.292683,0.333333,0.14568,0.177778,0.68,0.68
75%,0.682927,0.703704,0.278374,0.577778,0.84,0.84
max,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Step 9: Validate engineered features
print("\nEngineered Features Validation:")
# Check Hour (0–23)
if 'Hour' in data_validated.columns:
    assert data_validated['Hour'].between(0, 23).all(), "Hour values out of range (0–23)"

# Check Is_Weekend aligns with Day_of_Week
if 'Is_Weekend' in data_validated.columns and 'Day_of_Week' in data_validated.columns:
    weekend_days = ['Saturday', 'Sunday']
    assert (data_validated[data_validated['Is_Weekend']]['Day_of_Week'].isin(weekend_days)).all(), "Is_Weekend mismatch with Day_of_Week"

# Check Time_of_Day aligns with Hour
if 'Time_of_Day' in data_validated.columns and 'Hour' in data_validated.columns:
    time_check = (
        ((data_validated['Hour'] < 6) & (data_validated['Time_of_Day'] == 'Night')) |
        ((data_validated['Hour'].between(6, 11)) & (data_validated['Time_of_Day'] == 'Morning')) |
        ((data_validated['Hour'].between(12, 17)) & (data_validated['Time_of_Day'] == 'Afternoon')) |
        ((data_validated['Hour'].between(18, 23)) & (data_validated['Time_of_Day'] == 'Evening'))
    )
    assert time_check.all(), "Time_of_Day mismatch with Hour"

# Check Route matches Pickup_Location and Drop_Location
if 'Route' in data_validated.columns and 'Pickup_Location' in data_validated.columns and 'Drop_Location' in data_validated.columns:
    route_check = (data_validated['Route'] == data_validated['Pickup_Location'] + ' to ' + data_validated['Drop_Location'])
    assert route_check.all(), "Route does not match Pickup_Location and Drop_Location"

# Check Is_Same_Location
if 'Is_Same_Location' in data_validated.columns:
    same_location_check = (data_validated['Is_Same_Location'] == (data_validated['Pickup_Location'] == data_validated['Drop_Location']))
    assert same_location_check.all(), "Is_Same_Location mismatch"

# Check Value_per_Distance
if 'Value_per_Distance' in data_validated.columns:
    assert data_validated['Value_per_Distance'].min() >= 0, "Value_per_Distance has negative values"

# Check High_Value_Ride
if 'High_Value_Ride' in data_validated.columns and 'Booking_Value' in data_validated.columns:
    threshold = data_validated['Booking_Value'].quantile(0.75)
    assert (data_validated['High_Value_Ride'] == (data_validated['Booking_Value'] > threshold)).all(), "High_Value_Ride mismatch"


Engineered Features Validation:


In [12]:
# Step 10: Logical consistency checks
print("\nLogical Consistency Checks:")
# Modified check for Incomplete_Rides and Incomplete_Rides_Reason
if 'Incomplete_Rides' in data_validated.columns and 'Incomplete_Rides_Reason' in data_validated.columns:
    problematic_rows_after = data_validated[
        (data_validated['Incomplete_Rides'] == 'Yes') & 
        (data_validated['Incomplete_Rides_Reason'].isna() | 
         data_validated['Incomplete_Rides_Reason'].isin(['None', '']) | 
         data_validated['Incomplete_Rides_Reason'].str.strip().eq(''))
    ]
    if not problematic_rows_after.empty:
        print("Warning: Some rows still have Incomplete_Rides = 'Yes' with invalid Incomplete_Rides_Reason:")
        display(problematic_rows_after[['Incomplete_Rides', 'Incomplete_Rides_Reason']])
    else:
        print("Incomplete_Rides and Incomplete_Rides_Reason are consistent.")

# Success rides should generally have non-zero Booking_Value and Ride_Distance
if 'Booking_Status' in data_validated.columns:
    success_rides = data_validated[data_validated['Booking_Status'] == 'Success']
    value_check = (success_rides['Booking_Value'] > 0).all()
    distance_check = (success_rides['Ride_Distance'] > 0).all()
    print(f"Success rides with non-zero Booking_Value: {value_check}")
    print(f"Success rides with non-zero Ride_Distance: {distance_check}")




Logical Consistency Checks:
Incomplete_Rides and Incomplete_Rides_Reason are consistent.
Success rides with non-zero Booking_Value: True
Success rides with non-zero Ride_Distance: True


In [13]:
# Step 11: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col in data_validated.columns:
        if data_validated[col].dtype in ['string', 'object'] or data_validated[col].isna().any():
            print(f"\n{col}: Contains text or mixed data. Sample values:")
            display(data_validated[col].value_counts(dropna=False).head())
        else:
            print(f"\n{col}: Numeric. Range:")
            print(f"Min = {data_validated[col].min():.2f}, Max = {data_validated[col].max():.2f}")


Canceled_Rides_by_Customer: Numeric. Range:
Min = 1.00, Max = 1.00

Canceled_Rides_by_Driver: Numeric. Range:
Min = 1.00, Max = 1.00


In [14]:
# Step 12: Save the validated dataset
data_validated.to_csv('bookings_validated.csv', index=False)
print("\nDataset validated and saved as 'bookings_validated.csv'")


Dataset validated and saved as 'bookings_validated.csv'
