<a href="https://colab.research.google.com/github/Sai1116/MathMinds_E101/blob/main/backend_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [158]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

import joblib

In [159]:
import numpy as np
import pandas as pd
from datetime import timedelta
import random

np.random.seed(42)
random.seed(42)

# ============================================================
# LOAD REAL DATA
# ============================================================

real = raw.copy()

real["Date"] = pd.to_datetime(real["Date"])
real["hour"] = real["Date"].dt.hour

real["pickup_zone"] = (
    real["Pickup Location"]
    .str.lower()
    .str.replace(" ", "_")
)

TOP_ZONES = real["pickup_zone"].value_counts().nlargest(8).index
real["pickup_zone"] = real["pickup_zone"].where(
    real["pickup_zone"].isin(TOP_ZONES),
    "other_zone"
)

# ============================================================
# LEARN DISTRIBUTIONS (FIXED)
# ============================================================

zone_hour_stats = (
    real.groupby(["pickup_zone", "hour"])
    .agg(
        dist_mean=("Ride Distance", "mean"),
        dist_std=("Ride Distance", "std"),
        vtat_mean=("Avg VTAT", "mean"),
        vtat_std=("Avg VTAT", "std"),
        ctat_mean=("Avg CTAT", "mean"),
        fare_mean=("Booking Value", "mean"),
        rating_mean=("Driver Ratings", "mean"),
        count=("Booking ID", "count")
    )
    .ffill()   # âœ… FIXED
)

# Precompute quantiles ONCE
q33 = zone_hour_stats["count"].quantile(0.33)
q66 = zone_hour_stats["count"].quantile(0.66)

# ============================================================
# SYNTHETIC GENERATION
# ============================================================

ROWS = []
START_DATE = real["Date"].min()
N_SYNTH = 5000

for _ in range(N_SYNTH):

    zone, hour = random.choice(zone_hour_stats.index.tolist())
    base = zone_hour_stats.loc[(zone, hour)]

    date = START_DATE + timedelta(days=random.randint(0, 20))

    trip_distance = max(
        0.5,
        np.random.normal(base["dist_mean"], base["dist_std"] * 0.6)
    )

    traffic_index = np.clip(
        np.random.normal(1.0 + base["vtat_mean"] / 10, 0.15),
        0.8, 1.7
    )

    pickup_delay = max(
        2,
        np.random.normal(base["vtat_mean"], base["vtat_std"] * 0.6)
    )

    trip_duration = trip_distance * traffic_index * np.random.uniform(3.5, 5)

    worker_rating = np.clip(
        np.random.normal(base["rating_mean"], 0.25),
        3.8, 5.0
    )

    acceptance_rate = np.clip(
        np.random.beta(8, 2),
        0.6, 0.98
    )

    # âœ… FIXED demand logic
    if base["count"] > q66:
        demand = "High"
    elif base["count"] > q33:
        demand = "Medium"
    else:
        demand = "Low"

    worker_density = random.choice(["Low", "Medium", "High"])

    base_fare = 40 + trip_distance * 4

    surge = 1.0
    incentive_bonus = 0

    if demand == "High" and hour in [8, 9, 18, 19, 20]:
        surge = np.random.uniform(1.2, 1.7)
        incentive_bonus = random.choice([10, 20, 30])

    if date >= START_DATE + timedelta(days=12) and zone == "other_zone":
        surge *= 0.85

    total_fare = base_fare * surge + incentive_bonus

    ROWS.append([
        date.strftime("%Y-%m-%d"),
        hour,
        zone,
        random.choice(TOP_ZONES.tolist()),
        round(trip_distance, 2),
        round(trip_duration, 1),
        round(pickup_delay, 1),
        round(traffic_index, 2),
        round(worker_rating, 2),
        round(acceptance_rate, 2),
        worker_density,
        demand,
        round(base_fare, 2),
        round(surge, 2),
        incentive_bonus,
        round(total_fare, 2)
    ])

# ============================================================
# FINAL DATAFRAME
# ============================================================

columns = [
    "date", "hour", "pickup_zone", "dropoff_zone",
    "trip_distance_km", "trip_duration_min", "pickup_delay_min",
    "traffic_index", "worker_rating", "acceptance_rate",
    "zone_worker_density", "zone_demand_level",
    "base_fare", "surge_multiplier", "incentive_bonus", "total_fare"
]

synthetic_df = pd.DataFrame(ROWS, columns=columns)
synthetic_df.to_csv("synthetic_gig_dataset.csv", index=False)

print("âœ… Synthetic dataset generated:", synthetic_df.shape)
print(synthetic_df.head())


âœ… Synthetic dataset generated: (5000, 16)
         date  hour pickup_zone     dropoff_zone  trip_distance_km  \
0  2024-01-01     0    badarpur   pragati_maidan             28.62   
1  2024-01-08     0     khandsa  barakhamba_road             19.48   
2  2024-01-03     0       saket            aiims             16.88   
3  2024-01-01     0       aiims         badarpur             19.57   
4  2024-01-17     0     khandsa          khandsa             31.81   

   trip_duration_min  pickup_delay_min  traffic_index  worker_rating  \
0              181.7              10.2           1.70           4.58   
1              142.0               7.1           1.70           4.00   
2              105.6              11.9           1.65           4.20   
3              138.4               8.9           1.70           4.05   
4              227.5               8.8           1.66           3.80   

   acceptance_rate zone_worker_density zone_demand_level  base_fare  \
0             0.72             

In [160]:
df = pd.read_csv("gig_dataset.csv")

encoders = {}

for col in ["pickup_zone", "zone_worker_density", "zone_demand_level"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [162]:
# =========================
# MODEL 1 (IMPROVED): ASSIGNMENT AVAILABILITY
# =========================

# Target (unchanged, still behavior-based)
df["assigned"] = (
    (df["pickup_delay_min"] < 6) &
    (df["acceptance_rate"] > 0.75)
).astype(int)

# ðŸ”¥ Improved feature set (NO LEAKAGE)
X_assign = df[
    [
        "hour",
        "pickup_zone",
        "zone_demand_level",
        "zone_worker_density",
        "traffic_index",
        "pickup_delay_min",
        "acceptance_rate",      # NEW
        "worker_rating",        # NEW
        "trip_distance_km"      # NEW
    ]
]

y_assign = df["assigned"]

X_train, X_test, y_train, y_test = train_test_split(
    X_assign,
    y_assign,
    test_size=0.2,
    stratify=y_assign,
    random_state=42
)

assignment_model = RandomForestClassifier(
    n_estimators=300,          # more stable
    max_depth=9,               # captures interactions
    min_samples_leaf=5,        # prevents noise fitting
    class_weight="balanced",   # handles imbalance
    random_state=42,
    n_jobs=-1                  # ðŸ”¥ efficiency boost
)

assignment_model.fit(X_train, y_train)

print("IMPROVED MODEL 1: ASSIGNMENT AVAILABILITY\n")
print(classification_report(y_test, assignment_model.predict(X_test)))

IMPROVED MODEL 1: ASSIGNMENT AVAILABILITY

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       158
           1       1.00      1.00      1.00        42

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [167]:
# Target definition (FIX)
df["incentive_active"] = (df["incentive_bonus"] > 0).astype(int)

X_inc = X_assign.copy()
y_inc = df["incentive_active"]

X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X_inc, y_inc, test_size=0.2, stratify=y_inc, random_state=42
)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=6,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42
)

incentive_model = CalibratedClassifierCV(
    rf,
    method="isotonic",
    cv=3
)

incentive_model.fit(X_train_i, y_train_i)

probs = incentive_model.predict_proba(X_test_i)[:, 1]
THRESHOLD = 0.45
y_pred_i = (probs >= THRESHOLD).astype(int)

print("MODEL 2: INCENTIVE PREDICTION\n")
print(classification_report(y_test_i, y_pred_i))

MODEL 2: INCENTIVE PREDICTION

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       179
           1       0.95      0.90      0.93        21

    accuracy                           0.98       200
   macro avg       0.97      0.95      0.96       200
weighted avg       0.98      0.98      0.98       200



In [163]:
#====================
     # model 3
#====================
df["date"] = pd.to_datetime(df["date"])
df["pay_per_km"] = df["total_fare"] / df["trip_distance_km"]

zone_day = (
    df.groupby(["pickup_zone", "date"])
    .agg(
        avg_fare=("total_fare", "mean"),
        avg_paykm=("pay_per_km", "mean"),
        avg_surge=("surge_multiplier", "mean")
    )
    .reset_index()
    .sort_values(["pickup_zone", "date"])
)

WINDOW = 5

zone_day["baseline_fare"] = (
    zone_day.groupby("pickup_zone")["avg_fare"]
    .transform(lambda x: x.rolling(WINDOW, min_periods=3).mean())
)

zone_day["baseline_surge"] = (
    zone_day.groupby("pickup_zone")["avg_surge"]
    .transform(lambda x: x.rolling(WINDOW, min_periods=3).mean())
)

zone_day["fare_drop"] = (
    (zone_day["baseline_fare"] - zone_day["avg_fare"])
    / zone_day["baseline_fare"]
) > 0.15

zone_day["surge_drop"] = (
    (zone_day["baseline_surge"] - zone_day["avg_surge"])
    / zone_day["baseline_surge"]
) > 0.20

zone_day["platform_adaptation"] = (
    zone_day["fare_drop"].astype(int)
    + zone_day["surge_drop"].astype(int)
) >= 1

print("MODEL 3 â€” PLATFORM ADAPTATION EVENTS\n")
print(zone_day[zone_day["platform_adaptation"]].head())

MODEL 3 â€” PLATFORM ADAPTATION EVENTS

    pickup_zone       date   avg_fare  avg_paykm  avg_surge  baseline_fare  \
26            1 2024-03-13  56.720000  18.369327        1.0      66.929251   
41            2 2024-03-14  52.975385  27.210103        1.0      62.853166   
65            4 2024-03-10  44.380000  21.227806        0.8      59.084777   
79            5 2024-03-10  47.822500  10.057788        0.8      61.314861   
83            5 2024-03-14  43.120714  18.218697        0.8      51.533782   

    baseline_surge  fare_drop  surge_drop  platform_adaptation  
26        1.051303       True       False                 True  
41        1.045144       True       False                 True  
65        0.986984       True       False                 True  
79        0.977143       True       False                 True  
83        0.829656       True       False                 True  


In [164]:
# ----- ensure incentive_active exists -----
df["incentive_active"] = (df["incentive_bonus"] > 0).astype(int)

# ----- fairness computation -----
fairness = (
    df.groupby("pickup_zone")
    .agg(
        avg_pay=("total_fare", "mean"),
        assignment_rate=("assigned", "mean"),
        incentive_rate=("incentive_active", "mean")
    )
)

fairness["pay_gap_vs_best"] = (
    fairness["avg_pay"].max() - fairness["avg_pay"]
)

print("FAIRNESS SUMMARY\n")
print(fairness)


FAIRNESS SUMMARY

               avg_pay  assignment_rate  incentive_rate  pay_gap_vs_best
pickup_zone                                                             
0            66.563988         0.154762        0.119048         0.000000
1            66.085605         0.222930        0.095541         0.478383
2            65.008962         0.240437        0.098361         1.555026
3            63.384510         0.209150        0.111111         3.179478
4            61.285350         0.248408        0.133758         5.278638
5            61.136429         0.192308        0.076923         5.427560


In [165]:
joblib.dump(assignment_model, "assignment_model.pkl")
joblib.dump(incentive_model, "incentive_model.pkl")
joblib.dump(encoders, "encoders.pkl")

print("âœ… Models and encoders saved successfully")

âœ… Models and encoders saved successfully


In [166]:
!ls

'archive (1).zip'       gig_dataset.csv		 sample_data
 assignment_model.pkl   incentive_model.pkl	 synthetic_gig_dataset.csv
 data		        model_assignment.pkl	 uber_data
 drive		        model_incentive.pkl	 uber_zip
 encoders.pkl	        platform_insights.json
 extracted_data         platform_output.json
