In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the feature-engineered bookings produced by the previous step.
# Adjust `file_path` if the CSV does not sit in the working directory.
file_path = 'bookings_feature_engineered.csv'
data_fe = pd.read_csv(filepath_or_buffer=file_path)

# Work on an independent copy so the frame as loaded (`data_fe`) stays untouched.
data_validated = data_fe.copy(deep=True)

In [None]:
# Step 1: parse the Datetime column into pandas timestamps.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
if 'Datetime' not in data_validated.columns:
    print("\nWarning: Datetime column not found.")
else:
    parsed_datetime = pd.to_datetime(data_validated['Datetime'], errors='coerce')
    data_validated['Datetime'] = parsed_datetime
    print("\nConverted Datetime to datetime64[ns].")

In [None]:
# Step 2: show the column names, the dtypes, and the first rows of the working frame.
for heading, payload in (
    ("\nCurrent Columns:", data_validated.columns.tolist()),
    ("\nCurrent Data Types:", data_validated.dtypes),
    ("\nSample Data:", data_validated.head()),
):
    print(heading)
    display(payload)

In [None]:
# Step 3: Inspect and fix Incomplete_Rides and Incomplete_Rides_Reason
# A ride marked Incomplete_Rides == 'Yes' must carry a usable reason. Reasons that are
# missing, the literal string 'None', or empty/whitespace-only are replaced by 'Unknown'.
print("\nInspecting Incomplete_Rides and Incomplete_Rides_Reason:")
if 'Incomplete_Rides' in data_validated.columns and 'Incomplete_Rides_Reason' in data_validated.columns:
    reason = data_validated['Incomplete_Rides_Reason']

    # Display unique values in Incomplete_Rides_Reason to identify invalid entries
    print("Unique values in Incomplete_Rides_Reason:")
    display(reason.value_counts(dropna=False))

    # Build the "invalid reason" mask once and reuse it for both the report and the fix.
    # astype(str) keeps the .str accessor usable even when the column is not string-typed
    # (e.g. an all-NaN column is read as float64); real NaN values are caught by isna().
    reason_text = reason.astype(str)
    reason_is_blank = reason.isna() | reason_text.eq('None') | reason_text.str.strip().eq('')
    invalid_reason_mask = (data_validated['Incomplete_Rides'] == 'Yes') & reason_is_blank

    # Identify problematic rows (snapshot taken before the imputation below)
    problematic_rows = data_validated[invalid_reason_mask]
    print("Rows with Incomplete_Rides = 'Yes' and invalid Incomplete_Rides_Reason:")
    display(problematic_rows[['Incomplete_Rides', 'Incomplete_Rides_Reason']])

    # Impute 'Unknown' for invalid reasons
    data_validated.loc[invalid_reason_mask, 'Incomplete_Rides_Reason'] = 'Unknown'
    print("Imputed 'Unknown' for invalid Incomplete_Rides_Reason where Incomplete_Rides = 'Yes'.")
else:
    # Report the skipped step instead of silently doing nothing.
    print("\nWarning: Incomplete_Rides and/or Incomplete_Rides_Reason column not found.")


In [None]:
# Step 4: Inspect and clean Canceled_Rides_by_Customer and Canceled_Rides_by_Driver
# Each column is turned into a float column:
#   - values that parse as numbers are kept as they are,
#   - free-text entries become 1.0 (the text is first copied into
#     Incomplete_Rides_Reason when that column has no usable value for the row),
#   - missing entries become 0.0.
print("\nInspecting Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:")
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col not in data_validated.columns:
        continue

    print(f"\nUnique values in {col}:")
    display(data_validated[col].value_counts(dropna=False).head())

    # Keep the untouched values: after coercion the text is gone, so both the
    # text mask and the text that gets moved must come from the original column.
    original_values = data_validated[col]
    try:
        numeric_values = pd.to_numeric(original_values, errors='coerce')
    except (TypeError, ValueError):
        # to_numeric can still raise for values it cannot inspect at all;
        # leave the column as it is in that case.
        print(f"\n{col} contains non-numeric data. Skipping numeric conversion.")
        continue

    if not numeric_values.isna().any():
        data_validated[col] = numeric_values
        print(f"\n{col} is numeric.")
        continue

    print(f"\nWarning: {col} contains non-numeric values. Moving text to Incomplete_Rides_Reason and imputing 1 for non-numeric, 0 for NaN.")

    # Text entries are the ones that were present originally but failed numeric parsing.
    text_mask = original_values.notna() & numeric_values.isna()

    # Move text to Incomplete_Rides_Reason if not already set
    if 'Incomplete_Rides_Reason' in data_validated.columns:
        reason = data_validated['Incomplete_Rides_Reason']
        reason_unset = reason.isna() | reason.isin(['None', ''])
        move_mask = text_mask & reason_unset
        if move_mask.any():
            data_validated.loc[move_mask, 'Incomplete_Rides_Reason'] = original_values[move_mask]

    # Impute 1 for non-numeric, 0 for NaN; parsed numbers pass through unchanged.
    data_validated[col] = numeric_values.mask(text_mask, 1.0).fillna(0.0).astype(float)


In [None]:
# Step 5: count nulls per column and list only the columns that have any.
print("\nMissing Values Check:")
missing_values = data_validated.isna().sum()
display(missing_values.loc[missing_values.gt(0)])
# Counts are non-negative, so "no non-zero count" is the same as "total is zero".
if not missing_values.any():
    print("No missing values found.")

In [None]:
# Step 6: Check for duplicates
# Reports fully duplicated rows and, when the key column exists, repeated Booking_IDs.
print("\nDuplicate Rows Check:")
print(f"Number of duplicate rows: {data_validated.duplicated().sum()}")
if 'Booking_ID' in data_validated.columns:
    print(f"Number of duplicate Booking_IDs: {data_validated.duplicated(subset=['Booking_ID']).sum()}")
else:
    # duplicated(subset=...) raises KeyError for a missing column; warn instead,
    # matching how the other steps treat absent columns.
    print("Warning: Booking_ID column not found; skipping duplicate Booking_ID check.")

In [None]:
# Step 7: Validate data types
# Maps each column to the dtype name it is expected to have. The comparison below is
# against str(dtype), so the values must be spelled exactly as pandas prints them.
expected_dtypes = {
    # Raw booking columns
    'Datetime': 'datetime64[ns]',
    'Booking_ID': 'string',
    'Booking_Status': 'category',
    'Customer_ID': 'string',
    'Vehicle_Type': 'category',
    'Pickup_Location': 'string',
    'Drop_Location': 'string',
    'V_TAT': 'float64',
    'C_TAT': 'float64',
    'Booking_Value': 'float64',
    'Payment_Method': 'category',
    'Ride_Distance': 'float64',
    'Driver_Ratings': 'float64',
    'Customer_Rating': 'float64',
    'Incomplete_Rides': 'category',
    'Incomplete_Rides_Reason': 'string',
    'Vehicle Images': 'string',
    'Canceled_Rides_by_Customer': 'float64',
    'Canceled_Rides_by_Driver': 'float64',
}
# Scaled variants: <base>_std then <base>_norm for every base metric, all float64.
expected_dtypes.update(
    (f'{base}{suffix}', 'float64')
    for base in ('V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating')
    for suffix in ('_std', '_norm')
)
# Engineered features
expected_dtypes.update({
    'Hour': 'int32',
    'Day_of_Week': 'category',
    'Is_Weekend': 'bool',
    'Time_of_Day': 'category',
    'Route': 'string',
    'Is_Same_Location': 'bool',
    'Value_per_Distance': 'float64',
    'High_Value_Ride': 'bool',
    'Incomplete_Rides_Reason_Encoded': 'int32',
})

print("\nData Types Validation:")
for col, dtype in expected_dtypes.items():
    if col not in data_validated.columns:
        print(f"Warning: {col} not found in dataset.")
        continue
    actual_dtype = str(data_validated[col].dtype)
    if actual_dtype == dtype:
        print(f"{col}: Type {actual_dtype} matches expected {dtype}")
    else:
        print(f"Warning: {col} has type {actual_dtype}, expected {dtype}")


In [None]:
# Step 8: Validate numeric ranges
# Ratings must lie in [0, 5]; every other metric must be non-negative.
# A failing assertion stops the notebook at the first offending column.
numeric_columns = ['V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating']
print("\nNumeric Ranges Check:")
for col in numeric_columns:
    if col in data_validated.columns:
        min_val = data_validated[col].min()
        max_val = data_validated[col].max()
        print(f"{col}: Min = {min_val:.2f}, Max = {max_val:.2f}")
        if col in ['Driver_Ratings', 'Customer_Rating']:
            assert 0 <= min_val <= max_val <= 5, f"{col} out of range (0–5)"
        else:
            assert min_val >= 0, f"{col} has negative values"

# Validate standardized and normalized columns
# Only the scaled columns that actually exist are summarised. The summaries are for
# visual inspection; nothing is asserted about them here.
std_columns = [f'{col}_std' for col in numeric_columns if f'{col}_std' in data_validated.columns]
norm_columns = [f'{col}_norm' for col in numeric_columns if f'{col}_norm' in data_validated.columns]

print("\nStandardized Columns Check (Mean ≈ 0, Std ≈ 1):")
if std_columns:
    display(data_validated[std_columns].describe())
else:
    # describe() raises ValueError on a frame with no columns, so warn instead.
    print("Warning: no standardized (_std) columns found.")

print("\nNormalized Columns Check (Range [0, 1]):")
if norm_columns:
    display(data_validated[norm_columns].describe())
else:
    print("Warning: no normalized (_norm) columns found.")

In [None]:
# Step 9: Validate engineered features
# Each check runs only when the columns it needs are present; a failing assertion
# stops the notebook with a message naming the inconsistent feature.
print("\nEngineered Features Validation:")
# Check Hour (0–23)
if 'Hour' in data_validated.columns:
    assert data_validated['Hour'].between(0, 23).all(), "Hour values out of range (0–23)"

# Check Is_Weekend aligns with Day_of_Week
# Compared in both directions: flagged rows must be weekend days AND weekend days
# must be flagged. Checking only the flagged rows would miss weekend rows marked False.
if 'Is_Weekend' in data_validated.columns and 'Day_of_Week' in data_validated.columns:
    weekend_days = ['Saturday', 'Sunday']
    expected_weekend = data_validated['Day_of_Week'].isin(weekend_days)
    assert (data_validated['Is_Weekend'] == expected_weekend).all(), "Is_Weekend mismatch with Day_of_Week"

# Check Time_of_Day aligns with Hour
# Buckets: 0-5 Night, 6-11 Morning, 12-17 Afternoon, 18-23 Evening.
if 'Time_of_Day' in data_validated.columns and 'Hour' in data_validated.columns:
    time_check = (
        ((data_validated['Hour'] < 6) & (data_validated['Time_of_Day'] == 'Night')) |
        ((data_validated['Hour'].between(6, 11)) & (data_validated['Time_of_Day'] == 'Morning')) |
        ((data_validated['Hour'].between(12, 17)) & (data_validated['Time_of_Day'] == 'Afternoon')) |
        ((data_validated['Hour'].between(18, 23)) & (data_validated['Time_of_Day'] == 'Evening'))
    )
    assert time_check.all(), "Time_of_Day mismatch with Hour"

# Check Route matches Pickup_Location and Drop_Location
if 'Route' in data_validated.columns and 'Pickup_Location' in data_validated.columns and 'Drop_Location' in data_validated.columns:
    route_check = (data_validated['Route'] == data_validated['Pickup_Location'] + ' to ' + data_validated['Drop_Location'])
    assert route_check.all(), "Route does not match Pickup_Location and Drop_Location"

# Check Is_Same_Location
# Needs both location columns as well; guard them so a missing column skips the
# check instead of raising KeyError.
if 'Is_Same_Location' in data_validated.columns and 'Pickup_Location' in data_validated.columns and 'Drop_Location' in data_validated.columns:
    same_location_check = (data_validated['Is_Same_Location'] == (data_validated['Pickup_Location'] == data_validated['Drop_Location']))
    assert same_location_check.all(), "Is_Same_Location mismatch"

# Check Value_per_Distance
if 'Value_per_Distance' in data_validated.columns:
    assert data_validated['Value_per_Distance'].min() >= 0, "Value_per_Distance has negative values"

# Check High_Value_Ride
# NOTE(review): assumes the flag was engineered as "Booking_Value strictly above its
# 75th percentile" on this same set of rows — confirm against the feature-engineering step.
if 'High_Value_Ride' in data_validated.columns and 'Booking_Value' in data_validated.columns:
    threshold = data_validated['Booking_Value'].quantile(0.75)
    assert (data_validated['High_Value_Ride'] == (data_validated['Booking_Value'] > threshold)).all(), "High_Value_Ride mismatch"

In [None]:
# Step 10: Logical consistency checks
print("\nLogical Consistency Checks:")
# Incomplete rides must carry a usable reason: not missing, not the literal 'None',
# not empty or whitespace-only.
if 'Incomplete_Rides' in data_validated.columns and 'Incomplete_Rides_Reason' in data_validated.columns:
    reason = data_validated['Incomplete_Rides_Reason']
    # astype(str) keeps the .str accessor usable for non-string dtypes;
    # genuine NaN values are caught by isna().
    reason_text = reason.astype(str)
    reason_is_blank = reason.isna() | reason_text.eq('None') | reason_text.str.strip().eq('')
    problematic_rows_after = data_validated[
        (data_validated['Incomplete_Rides'] == 'Yes') & reason_is_blank
    ]
    if not problematic_rows_after.empty:
        print("Warning: Some rows still have Incomplete_Rides = 'Yes' with invalid Incomplete_Rides_Reason:")
        display(problematic_rows_after[['Incomplete_Rides', 'Incomplete_Rides_Reason']])
    else:
        print("Incomplete_Rides and Incomplete_Rides_Reason are consistent.")

# Success rides should generally have non-zero Booking_Value and Ride_Distance
# (checked as strictly positive). Each metric is checked only if its column exists,
# so a missing column produces a warning rather than a KeyError.
if 'Booking_Status' in data_validated.columns:
    success_rides = data_validated[data_validated['Booking_Status'] == 'Success']
    if 'Booking_Value' in success_rides.columns:
        value_check = (success_rides['Booking_Value'] > 0).all()
        print(f"Success rides with non-zero Booking_Value: {value_check}")
    else:
        print("Warning: Booking_Value column not found; skipping Success-ride value check.")
    if 'Ride_Distance' in success_rides.columns:
        distance_check = (success_rides['Ride_Distance'] > 0).all()
        print(f"Success rides with non-zero Ride_Distance: {distance_check}")
    else:
        print("Warning: Ride_Distance column not found; skipping Success-ride distance check.")



In [None]:
# Step 11: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver
# Summarises each cancellation column: a value-count sample when it holds text or
# missing values, otherwise its numeric range.
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col not in data_validated.columns:
        continue
    column = data_validated[col]
    # is_numeric_dtype decides which branch is safe: any non-numeric dtype (object,
    # string, category, datetime, ...) takes the text branch, so the :.2f formatting
    # below only ever sees numbers.
    if not pd.api.types.is_numeric_dtype(column) or column.isna().any():
        print(f"\n{col}: Contains text or mixed data. Sample values:")
        display(column.value_counts(dropna=False).head())
    else:
        print(f"\n{col}: Numeric. Range:")
        print(f"Min = {column.min():.2f}, Max = {column.max():.2f}")

In [None]:
# Step 12: write the validated frame to CSV (row index omitted) and confirm the file name.
output_path = 'bookings_validated.csv'
data_validated.to_csv(output_path, index=False)
print(f"\nDataset validated and saved as '{output_path}'")