In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Read the standardized/normalized bookings CSV written by the previous step.
# Point file_path at the CSV's real location if it is not in the working directory.
file_path = 'bookings_standardized_normalized.csv'
data_processed = pd.read_csv(filepath_or_buffer=file_path)

# Feature engineering works on an independent deep copy so that
# data_processed stays exactly as it was loaded.
data_fe = data_processed.copy(deep=True)

In [None]:
# Step 1: Temporal Features from Datetime
# Parse Datetime into datetime64; unparseable entries become NaT (errors='coerce')
data_fe['Datetime'] = pd.to_datetime(data_fe['Datetime'], errors='coerce')
datetime_parts = data_fe['Datetime'].dt

# Hour of day, weekday name (stored as category), and weekend flag
# dayofweek numbering: Monday = 0 ... Saturday = 5, Sunday = 6
data_fe['Hour'] = datetime_parts.hour
data_fe['Day_of_Week'] = datetime_parts.day_name().astype('category')
data_fe['Is_Weekend'] = datetime_parts.dayofweek.ge(5)

# Bucket Hour into left-closed intervals (right=False):
# [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, [18, 24) Evening
bins = [0, 6, 12, 18, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening']
data_fe['Time_of_Day'] = pd.cut(
    data_fe['Hour'],
    bins=bins,
    labels=labels,
    right=False,
    include_lowest=True,
).astype('category')

In [None]:
# Step 2: Location-Based Features
pickup = data_fe['Pickup_Location']
dropoff = data_fe['Drop_Location']

# Route is the text "<pickup> to <drop>", stored with the pandas string dtype
data_fe['Route'] = (pickup + ' to ' + dropoff).astype('string')

# Is_Same_Location flags rows whose pickup and drop locations are equal
data_fe['Is_Same_Location'] = pickup.eq(dropoff).astype('bool')

In [None]:
# Step 3: Ride Metrics
# Value_per_Distance = Booking_Value / Ride_Distance.
# Zero distances are swapped for NaN before dividing so no division by zero occurs;
# every resulting NaN ratio is then stored as 0.
nonzero_distance = data_fe['Ride_Distance'].replace(0, np.nan)
data_fe['Value_per_Distance'] = (
    data_fe['Booking_Value']
    .div(nonzero_distance)
    .fillna(0)
    .astype('float64')
)

# High_Value_Ride is True when Booking_Value is strictly above its 75th percentile
booking_value_threshold = data_fe['Booking_Value'].quantile(0.75)
data_fe['High_Value_Ride'] = (
    data_fe['Booking_Value'].gt(booking_value_threshold).astype('bool')
)

In [None]:
# Step 4: Categorical Encoding
# One-hot encode Booking_Status, Vehicle_Type, and Payment_Method.
# The original columns are kept; the indicator columns are appended after them,
# with the first level of each column dropped (drop_first=True).
categorical_columns = ['Booking_Status', 'Vehicle_Type', 'Payment_Method']
for col in categorical_columns:
    if col not in data_fe.columns:
        continue
    dummies = pd.get_dummies(data_fe[col], prefix=col, drop_first=True)
    # Remove indicator columns left over from a previous run of this cell so that
    # re-running it does not append duplicate column labels. On a first run nothing
    # matches and the frame is unchanged (errors='ignore').
    data_fe = pd.concat(
        [data_fe.drop(columns=dummies.columns, errors='ignore'), dummies],
        axis=1,
    )

# Label encode Incomplete_Rides_Reason (optional, as it's text-heavy).
# astype(str) stringifies every value first, so missing entries are encoded
# as their own label rather than being left missing.
if 'Incomplete_Rides_Reason' in data_fe.columns:
    le = LabelEncoder()
    data_fe['Incomplete_Rides_Reason_Encoded'] = le.fit_transform(
        data_fe['Incomplete_Rides_Reason'].astype(str)
    ).astype('int32')

In [None]:
# Step 5: Handle Canceled_Rides_by_Customer and Canceled_Rides_by_Driver
# For each cancellation column that exists, add a "<col>_Binary" flag that is True
# where the value is greater than 0.
# The column type is tested with pandas' numeric-dtype check rather than exact dtype
# names: text columns read from CSV are typically 'object' (not 'string'), and count
# columns may be integer rather than 'float64'. Exact-name comparisons would skip
# both cases without creating a flag or printing anything.
for col in ['Canceled_Rides_by_Customer', 'Canceled_Rides_by_Driver']:
    if col not in data_fe.columns:
        continue
    if pd.api.types.is_numeric_dtype(data_fe[col]):
        # Missing values compare as "not > 0"; fillna(False) also covers nullable
        # dtypes, where the comparison itself yields a missing value.
        data_fe[f'{col}_Binary'] = (data_fe[col] > 0).fillna(False).astype('bool')
        print(f"\nCreated binary feature for {col}")
    else:
        print(f"\nSkipping {col}: Contains non-numeric data (string).")

In [None]:
# Step 6: Validate new features
# List only the columns added by feature engineering, i.e. those present in
# data_fe but absent from the frame as it was loaded (data_processed).
new_columns = [col for col in data_fe.columns if col not in data_processed.columns]
print("\nNew Columns Added:")
display(new_columns)

# Core engineered features inspected below; shared by the sample and dtype views
# so the two listings cannot drift apart.
engineered_features = ['Hour', 'Day_of_Week', 'Is_Weekend', 'Time_of_Day',
                       'Route', 'Is_Same_Location', 'Value_per_Distance', 'High_Value_Ride']

print("\nSample Data with New Features:")
display(data_fe[['Datetime'] + engineered_features].head())

print("\nData Types of New Features:")
display(data_fe[engineered_features].dtypes)

In [None]:
# Step 7: Save the dataset with new features
# Written without the index column (index=False).
output_path = 'bookings_feature_engineered.csv'
data_fe.to_csv(output_path, index=False)
print(f"Dataset with feature engineering saved as '{output_path}'")