<a href="https://colab.research.google.com/github/Sai1116/MathMinds_E101/blob/main/hack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report

import joblib


In [5]:
np.random.seed(42)

zones = [
    "Gandhipuram",
    "Peelamedu",
    "RS_Puram",
    "Saibaba_Colony",
    "Ukkadam",
    "Singanallur"
]

rows = []
start_date = datetime(2024, 3, 1)

for _ in range(1000):

    date = start_date + timedelta(days=np.random.randint(0, 14))
    hour = np.random.randint(6, 23)

    pickup_zone = random.choice(zones)
    dropoff_zone = random.choice(zones)

    zone_demand_level = random.choice(["Low", "Medium", "High"])
    zone_worker_density = random.choice(["Low", "Medium", "High"])

    traffic_index = round(np.random.uniform(0.9, 1.6), 2)

    trip_distance_km = round(np.random.gamma(2.0, 2.5), 2)
    trip_duration_min = round(trip_distance_km * traffic_index * np.random.uniform(3, 5), 1)
    pickup_delay_min = round(np.random.uniform(2, 10) * traffic_index, 1)

    worker_rating = round(np.random.normal(4.6, 0.2), 2)
    worker_rating = min(max(worker_rating, 4.0), 5.0)
    acceptance_rate = round(np.random.uniform(0.6, 0.95), 2)

    base_fare = round(40 + trip_distance_km * 4, 2)
    surge_multiplier = 1.0
    incentive_bonus = 0

    if zone_demand_level == "High" and hour in [8, 9, 18, 19, 20]:
        surge_multiplier = round(np.random.uniform(1.2, 1.7), 2)
        incentive_bonus = random.choice([10, 20, 30])

    # ðŸ”´ Undocumented platform change
    if date >= datetime(2024, 3, 10) and pickup_zone in ["Ukkadam", "Singanallur"]:
        surge_multiplier *= 0.8

    total_fare = round(base_fare * surge_multiplier + incentive_bonus, 2)

    rows.append([
        date.strftime("%Y-%m-%d"),
        hour,
        pickup_zone,
        dropoff_zone,
        trip_distance_km,
        trip_duration_min,
        pickup_delay_min,
        traffic_index,
        worker_rating,
        acceptance_rate,
        zone_worker_density,
        zone_demand_level,
        base_fare,
        surge_multiplier,
        incentive_bonus,
        total_fare
    ])

columns = [
    "date", "hour", "pickup_zone", "dropoff_zone",
    "trip_distance_km", "trip_duration_min", "pickup_delay_min",
    "traffic_index", "worker_rating", "acceptance_rate",
    "zone_worker_density", "zone_demand_level",
    "base_fare", "surge_multiplier", "incentive_bonus", "total_fare"
]

df = pd.DataFrame(rows, columns=columns)
df.to_csv("gig_dataset.csv", index=False)


In [6]:
df = pd.read_csv("gig_dataset.csv")

encoders = {}

for col in ["pickup_zone", "zone_worker_density", "zone_demand_level"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


In [7]:
# Behavior-based target
df["assigned"] = (
    (df["pickup_delay_min"] < 6) &
    (df["acceptance_rate"] > 0.75)
).astype(int)

X_assign = df[
    [
        "hour",
        "pickup_zone",
        "zone_demand_level",
        "zone_worker_density",
        "traffic_index",
        "pickup_delay_min",
    ]
]

y_assign = df["assigned"]

X_train, X_test, y_train, y_test = train_test_split(
    X_assign, y_assign, test_size=0.2, stratify=y_assign, random_state=42
)

assignment_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    random_state=42
)

assignment_model.fit(X_train, y_train)

print("MODEL 1: ASSIGNMENT AVAILABILITY\n")
print(classification_report(y_test, assignment_model.predict(X_test)))


MODEL 1: ASSIGNMENT AVAILABILITY

              precision    recall  f1-score   support

           0       0.86      0.87      0.86       159
           1       0.46      0.44      0.45        41

    accuracy                           0.78       200
   macro avg       0.66      0.65      0.66       200
weighted avg       0.78      0.78      0.78       200



In [8]:
# Target definition (FIX)
df["incentive_active"] = (df["incentive_bonus"] > 0).astype(int)

X_inc = X_assign.copy()
y_inc = df["incentive_active"]

X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X_inc, y_inc, test_size=0.2, stratify=y_inc, random_state=42
)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=6,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42
)

incentive_model = CalibratedClassifierCV(
    rf,
    method="isotonic",
    cv=3
)

incentive_model.fit(X_train_i, y_train_i)

probs = incentive_model.predict_proba(X_test_i)[:, 1]
THRESHOLD = 0.45
y_pred_i = (probs >= THRESHOLD).astype(int)

print("MODEL 2: INCENTIVE PREDICTION\n")
print(classification_report(y_test_i, y_pred_i))


MODEL 2: INCENTIVE PREDICTION

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       181
           1       1.00      0.95      0.97        19

    accuracy                           0.99       200
   macro avg       1.00      0.97      0.99       200
weighted avg       1.00      0.99      0.99       200



In [9]:
fairness = (
    df.groupby("pickup_zone")
    .agg({
        "total_fare": "mean",
        "assigned": "mean",
        "incentive_bonus": "mean"
    })
    .rename(columns={"assigned": "assignment_rate"})
)

fairness["pay_gap_vs_best"] = (
    fairness["total_fare"].max() - fairness["total_fare"]
)

print("FAIRNESS SUMMARY\n")
print(fairness)


FAIRNESS SUMMARY

             total_fare  assignment_rate  incentive_bonus  pay_gap_vs_best
pickup_zone                                                               
0             63.277128         0.159574         1.276596         1.816412
1             64.012062         0.193750         1.812500         1.081477
2             65.093539         0.230337         2.471910         0.000000
3             63.797324         0.190141         1.549296         1.296215
4             60.923176         0.252941         2.352941         4.170363
5             62.733333         0.197531         2.716049         2.360206


In [10]:
joblib.dump(assignment_model, "assignment_model.pkl")
joblib.dump(incentive_model, "incentive_model.pkl")
joblib.dump(encoders, "encoders.pkl")

print("âœ… Models and encoders saved successfully")


âœ… Models and encoders saved successfully


In [11]:
!ls


assignment_model.pkl  gig_dataset.csv	   sample_data
encoders.pkl	      incentive_model.pkl
