In [None]:
!pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Read the CSV written by the previous step.
# Point file_path at the file's real location if it does not sit next to this notebook.
file_path = 'bookings_outliers_handled.csv'
data_cleaned = pd.read_csv(file_path)

# Work on an independent deep copy so the frame as loaded stays untouched
data_processed = data_cleaned.copy(deep=True)

In [None]:
# Step 1: Inspect numeric columns
# These are the columns that the later steps scale.
numeric_columns = [
    'V_TAT',
    'C_TAT',
    'Booking_Value',
    'Ride_Distance',
    'Driver_Ratings',
    'Customer_Rating',
]

# Baseline descriptive statistics, shown before any scaling is applied
print("Summary Statistics Before Standardization/Normalization:")
summary_before_scaling = data_processed[numeric_columns].describe()
display(summary_before_scaling)

In [None]:
# Step 2: Initialize scalers
# One scaler of each kind, both with default settings.
standard_scaler, minmax_scaler = StandardScaler(), MinMaxScaler()

In [None]:
# Step 3: Standardize and normalize numeric columns
# StandardScaler and MinMaxScaler both scale every feature independently, so a
# single fit over all numeric columns yields the same values as fitting column
# by column. One fit also leaves each scaler fitted on the whole column set
# instead of only on whichever column was processed last.
numeric_frame = data_processed[numeric_columns]
standardized = np.asarray(standard_scaler.fit_transform(numeric_frame))
normalized = np.asarray(minmax_scaler.fit_transform(numeric_frame))

for position, col in enumerate(numeric_columns):
    # Assign 1-D slices; the _std / _norm columns stay interleaved per source
    # column, so the resulting column order matches a per-column loop.
    data_processed[f'{col}_std'] = standardized[:, position]    # standardized copy
    data_processed[f'{col}_norm'] = normalized[:, position]     # min-max normalized copy

In [None]:
# Step 4: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver if they exist and are numeric
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col not in data_processed.columns:
        # Column absent: nothing to scale and nothing to report
        continue

    # Accept any numeric dtype (int64, float32, ...) rather than only float64,
    # so integer-coded columns are not skipped and misreported as strings.
    # Boolean columns are still skipped, as they were before.
    column_dtype = data_processed[col].dtype
    is_scalable = (
        pd.api.types.is_numeric_dtype(column_dtype)
        and not pd.api.types.is_bool_dtype(column_dtype)
    )

    if is_scalable:
        # Each call refits the shared scaler on this single column
        data_processed[f'{col}_std'] = standard_scaler.fit_transform(data_processed[[col]])
        data_processed[f'{col}_norm'] = minmax_scaler.fit_transform(data_processed[[col]])
        print(f"\nStandardized and normalized {col}")
    else:
        print(f"\nSkipping {col}: Contains non-numeric data (string).")

In [None]:
# Step 5: Validate transformations
# Names of the derived columns, in the same order as numeric_columns
std_columns = [f'{col}_std' for col in numeric_columns]
norm_columns = [f'{col}_norm' for col in numeric_columns]

# Show describe() for the standardized columns first, then the normalized ones
for heading, scaled_columns in (
    ("\nSummary Statistics for Standardized Columns:", std_columns),
    ("\nSummary Statistics for Normalized Columns:", norm_columns),
):
    print(heading)
    display(data_processed[scaled_columns].describe())

In [None]:
# Step 6: Save the dataset with standardized and normalized columns
# The path is named once so the file written and the message printed cannot drift apart.
output_path = 'bookings_standardized_normalized.csv'
data_processed.to_csv(output_path, index=False)  # index=False: do not write the row index
print(f"Dataset with standardized and normalized columns saved as '{output_path}'")