In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the de-duplicated bookings CSV written by the previous step.
# Point `file_path` somewhere else if the file does not sit next to this notebook.
file_path = 'bookings_no_duplicates.csv'
data_no_duplicates = pd.read_csv(filepath_or_buffer=file_path)

# Work on an independent deep copy so the frame exactly as loaded stays available.
data_corrected = data_no_duplicates.copy(deep=True)

In [None]:
# Step 1: Inspect current data types
print("Current Data Types:")
display(data_corrected.dtypes)
print("\nSample Values for All Columns:")
display(data_corrected.head())

# The cancellation columns are treated as optional by the conversion step, so
# only select the ones that are actually present; indexing a missing column
# would raise KeyError and stop the notebook here.
cancellation_columns = [
    name
    for name in ('Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver')
    if name in data_corrected.columns
]
print("\nSample Values for Canceled_Rides_by_Customer and Canceled_Rides_by_Driver:")
if cancellation_columns:
    display(data_corrected[cancellation_columns].head(10))
else:
    print("Neither cancellation column is present in the dataset.")

In [None]:
# Step 2: Handle Date and Time columns
def _time_of_day(time_col):
    """Return the time of day held in `time_col` as a timedelta Series.

    Numeric values are read as fractions of a day (the Excel convention);
    anything else is parsed as clock text such as 'HH:MM:SS'. Text that cannot
    be parsed becomes NaT.
    """
    if pd.api.types.is_numeric_dtype(time_col):
        return pd.to_timedelta(time_col, unit='d')
    # astype(str) turns missing values into 'nan', which is coerced to NaT.
    return pd.to_timedelta(time_col.astype(str).str.strip(), errors='coerce')


def _combine_date_time(date_col, time_col):
    """Build one datetime Series from separate date and time columns.

    Numeric dates are read as Excel serial day numbers (origin 1899-12-30);
    other dates are parsed with `pd.to_datetime`, unparseable ones becoming NaT.
    The time of day is added only to rows whose parsed date sits exactly at
    midnight, so a Date value that already carries a time is not shifted twice.
    """
    offsets = _time_of_day(time_col)
    if pd.api.types.is_numeric_dtype(date_col):
        dates = pd.to_datetime(date_col, unit='d', origin='1899-12-30')
    else:
        dates = pd.to_datetime(date_col, errors='coerce')
    midnight = dates.dt.normalize()
    # NaT never equals NaT, so rows with an unparseable date stay NaT.
    needs_time = dates.eq(midnight) & offsets.notna()
    return dates.where(~needs_time, midnight + offsets)


if 'Datetime' in data_corrected.columns:
    # A combined column already exists: just make sure it is datetime64.
    data_corrected['Datetime'] = pd.to_datetime(data_corrected['Datetime'], errors='coerce')
    print("\nDatetime column already exists and is converted to datetime64.")
elif 'Date' in data_corrected.columns and 'Time' in data_corrected.columns:
    data_corrected['Datetime'] = _combine_date_time(data_corrected['Date'], data_corrected['Time'])
    # Rebind instead of dropping in place; the source columns are now redundant.
    data_corrected = data_corrected.drop(columns=['Date', 'Time'])
    print("\nDate and Time columns combined into Datetime.")
    unparsed_rows = int(data_corrected['Datetime'].isna().sum())
    if unparsed_rows:
        # errors='coerce' hides parse failures, so surface how many rows ended up NaT.
        print(f"Warning: {unparsed_rows} row(s) have no valid Datetime after conversion.")
else:
    print("\nWarning: Neither 'Datetime' nor 'Date'/'Time' columns found. Please check the dataset.")

In [None]:
# Step 3: Convert columns to appropriate data types
def _to_float(values, name):
    """Coerce `values` to float64 and report entries that could not be parsed.

    Unparseable entries become NaN (errors='coerce'); the number of non-missing
    values lost that way is printed so the coercion is not silent.
    """
    converted = pd.to_numeric(values, errors='coerce').astype('float64')
    lost = int((converted.isna() & values.notna()).sum())
    if lost:
        print(f"\nWarning: {lost} non-numeric value(s) in {name} were set to NaN.")
    return converted


def _contains_text(values):
    """Return True when a non-numeric column has a value containing an ASCII letter.

    The check is based on "not numeric" rather than on dtype == 'object', so
    text stored in a dedicated string dtype is recognised as well.
    """
    if pd.api.types.is_numeric_dtype(values):
        return False
    # Drop missing values first: their string form ('nan') would match the pattern.
    present = values.dropna()
    return bool(present.astype(str).str.contains(r'[a-zA-Z]').any())


# String columns
string_columns = ['Booking_ID', 'Customer_ID', 'Pickup_Location', 'Drop_Location', 'Incomplete_Rides_Reason', 'Vehicle Images']
# Categorical columns
categorical_columns = ['Booking_Status', 'Vehicle_Type', 'Payment_Method', 'Incomplete_Rides']
# Numeric columns
numeric_columns = ['V_TAT', 'C_TAT', 'Booking_Value', 'Ride_Distance', 'Driver_Ratings', 'Customer_Rating']

# Cast the string and categorical columns that exist in one astype call.
dtype_plan = {col: 'string' for col in string_columns if col in data_corrected.columns}
dtype_plan.update({col: 'category' for col in categorical_columns if col in data_corrected.columns})
data_corrected = data_corrected.astype(dtype_plan)

for col in numeric_columns:
    if col in data_corrected.columns:
        data_corrected[col] = _to_float(data_corrected[col], col)

# Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver if they exist
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col not in data_corrected.columns:
        continue
    if _contains_text(data_corrected[col]):
        # Free text here would be wiped to NaN by numeric coercion; keep it as strings.
        print(f"\nWarning: {col} contains text data. Converting to string instead of float.")
        data_corrected[col] = data_corrected[col].astype('string')
    else:
        data_corrected[col] = _to_float(data_corrected[col], col)

In [None]:
# Step 4: Validate data types
# Show the per-column dtypes again so the conversions can be checked by eye.
corrected_dtypes = data_corrected.dtypes
print("\nData Types After Correction:")
display(corrected_dtypes)

In [None]:
# Step 5: Save the dataset with corrected data types
# NOTE: CSV is plain text and carries no dtype metadata, so whatever reads this
# file back with pd.read_csv will infer column types afresh.
output_path = 'bookings_corrected_datatypes.csv'
data_corrected.to_csv(output_path, index=False)
print("Dataset with corrected data types saved as 'bookings_corrected_datatypes.csv'")