In [1]:
import pandas as pd
import numpy as np
import os



In [2]:
# Load the cleaned dispatch/ETA dataset into the working frame `df`.
# NOTE(review): the absolute path looks like a Colab Google Drive mount —
# confirm the drive is mounted before this cell runs on a fresh kernel.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/MLdev/dispatchbrain/dispatch_eta_cleaned.csv"
)





In [3]:
# Impute the weather columns, then discard rows lacking any timestamp.
# Fill rules: missing rain -> 0, missing wind -> column median,
# missing temperature -> column mean. Each statistic is taken from its own
# column, so computing them up front does not change the result.
weather_fill_values = {
    'rain_mm': 0,
    'wind_kmph': df['wind_kmph'].median(),
    'temp_c': df['temp_c'].mean(),
}
for weather_col, fill_value in weather_fill_values.items():
    df[weather_col] = df[weather_col].fillna(fill_value)

# Rows without all three timestamps cannot feed the time-based features
# or the delivery-duration computation further down.
required_timestamps = ['created_at', 'picked', 'delivered_at']
df = df.dropna(subset=required_timestamps)



In [4]:
#  Remove Outliers (IQR Method)
def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``column``.
    Rows whose value is NaN fail both comparisons and are therefore dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    column : label
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows. Returning a copy (rather
        than the raw boolean-indexed slice) lets callers add columns to the
        result without triggering ``SettingWithCopyWarning``.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    # Series.between is inclusive on both ends, matching `>= lower` & `<= upper`.
    within_fences = values.between(lower, upper)
    return data.loc[within_fences].copy()

# Keep only rows whose target (eta_minutes) falls inside the IQR fences.
df = remove_outliers_iqr(data=df, column='eta_minutes')




In [5]:
# Calendar features derived from the order-creation timestamp.
# `assign` evaluates the keyword arguments in order, so each lambda can read
# the column produced by the one before it.
df = df.assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at']),
    hour=lambda frame: frame['created_at'].dt.hour,
    day_of_week=lambda frame: frame['created_at'].dt.dayofweek,
    # dayofweek is 0 (Monday) .. 6 (Sunday); 5 and 6 are Saturday/Sunday.
    is_weekend=lambda frame: frame['day_of_week'].isin([5, 6]).astype(int),
)




In [6]:
#  Distance Buckets
# Bucket edges: [0, 5] -> Very Short, (5, 10] -> Short, (10, 20] -> Medium,
# (20, inf) -> Long.
# `include_lowest=True` keeps zero-distance rows in the first bucket and the
# open upper edge keeps trips longer than 50 km in 'Long'. With the default
# pd.cut arguments and a hard 50 km cap, both cases become NaN, which the
# later drop_first one-hot encoding turns into all-zero dummies that are
# indistinguishable from the 'Very Short' baseline.
distance_bins = [0, 5, 10, 20, np.inf]
distance_labels = ['Very Short', 'Short', 'Medium', 'Long']
df['distance_bucket'] = pd.cut(
    df['distance_km'],
    bins=distance_bins,
    labels=distance_labels,
    include_lowest=True,
)





In [7]:
# Weather severity: weighted sum of rainfall, wind speed and the temperature
# shortfall below 30 (weights 0.6 / 0.3 / 0.1).
df['weather_severity'] = (
    0.6 * df['rain_mm']
    + 0.3 * df['wind_kmph']
    + 0.1 * (30 - df['temp_c'])
)





In [8]:
# log(1 + distance): compresses the distance scale and is defined at 0 km.
df['log_distance'] = df['distance_km'].pipe(np.log1p)



In [9]:
# Delivery duration in minutes: time elapsed between pickup and delivery.
for timestamp_col in ('delivered_at', 'picked'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

pickup_to_dropoff = df['delivered_at'] - df['picked']
# total_seconds() yields seconds; divide by 60 to express it in minutes.
df['delivery_duration'] = pickup_to_dropoff.dt.total_seconds() / 60



In [10]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each column, so that level is represented by all-zero dummies.
categorical_columns = ['service_type', 'city', 'distance_bucket']
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)



In [11]:
# Base (non-encoded) model inputs, grouped by origin; order is preserved.
# NOTE(review): 'delivery_duration' is computed from delivered_at, which is
# presumably unknown at prediction time — confirm it does not leak the
# eta_minutes target.
features = (
    ['hour', 'day_of_week', 'is_weekend']                       # calendar
    + ['distance_km', 'log_distance']                           # distance
    + ['weather_severity', 'delivery_duration']                 # derived scores
    + ['pickup_lat', 'pickup_lon', 'drop_lat', 'drop_lon']      # coordinates
)


In [12]:
#  Add Encoded Columns
# Collect the one-hot columns by their prefixes and append them to the
# feature list. Only names not already present are appended, so re-running
# this cell does not duplicate columns in `features` (and hence in X).
encoded_prefixes = ('service_type_', 'city_', 'distance_bucket_')
encoded_cols = [
    col for col in df.columns
    # Non-string column labels have no startswith(); skip them.
    if isinstance(col, str) and col.startswith(encoded_prefixes)
]
features.extend(col for col in encoded_cols if col not in features)



In [13]:
# Split the frame into the model inputs (X) and the target (y).
X = df.loc[:, features]
y = df.loc[:, 'eta_minutes']




In [14]:
# Persist the feature matrix and target as CSV files (row index omitted).
# NOTE(review): the output directory is relative to the kernel's working
# directory, unlike the absolute input path — confirm this is the intended
# location.
os.makedirs("data/features", exist_ok=True)
for frame_to_save, csv_path in (
    (X, "data/features/X_features_dispatchbrain.csv"),
    (y, "data/features/y_target_dispatchbrain.csv"),
):
    frame_to_save.to_csv(csv_path, index=False)

print(" Features and target saved successfully.")



 Features and target saved successfully.
