<a href="https://colab.research.google.com/github/Sai1116/MathMinds_E101/blob/main/hack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random

# Seed BOTH random sources used by the generation loop below. The loop
# draws from numpy's global RNG (np.random.*) and from the stdlib `random`
# module (random.choice); the two keep independent state, so seeding only
# numpy would leave zones, demand/density levels and bonus amounts
# different on every run.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# -------------------------
# City Zones (Coimbatore-style)
# -------------------------
zones = [
    "Gandhipuram",
    "Peelamedu",
    "RS_Puram",
    "Saibaba_Colony",
    "Ukkadam",
    "Singanallur"
]

# Accumulates one list of column values per synthetic trip.
rows = []
# First calendar day of the simulated period.
start_date = datetime(2024, 3, 1)

# Generate 1000 synthetic trips, appending one list of values per trip to
# `rows`. The value order inside each appended list must match the
# `columns` list used when the DataFrame is built.
for _ in range(1000):

    # Date & time
    # randint's upper bound is exclusive: day offsets 0..13, hours 6..22.
    date = start_date + timedelta(days=np.random.randint(0, 14))
    hour = np.random.randint(6, 23)

    # Zones
    # random.choice uses the stdlib `random` module, whose state is
    # independent of numpy's RNG. Pickup and dropoff are drawn
    # independently, so they can be the same zone.
    pickup_zone = random.choice(zones)
    dropoff_zone = random.choice(zones)

    # Context
    # Demand and worker density are drawn independently and uniformly.
    zone_demand_level = random.choice(["Low", "Medium", "High"])
    zone_worker_density = random.choice(["Low", "Medium", "High"])

    # Congestion multiplier in [0.9, 1.6); scales duration and pickup delay.
    traffic_index = round(np.random.uniform(0.9, 1.6), 2)

    # Trip
    # Gamma(shape=2.0, scale=2.5) distance: right-skewed, mean 5 km.
    trip_distance_km = round(np.random.gamma(2.0, 2.5), 2)
    # 3-5 minutes per km, stretched by the traffic index.
    trip_duration_min = round(trip_distance_km * traffic_index * np.random.uniform(3, 5), 1)
    # 2-10 minute base wait, stretched by the traffic index.
    pickup_delay_min = round(np.random.uniform(2, 10) * traffic_index, 1)

    # Worker
    # Normal(4.6, 0.2) rating, clamped to the range [4.0, 5.0].
    worker_rating = round(np.random.normal(4.6, 0.2), 2)
    worker_rating = min(max(worker_rating, 4.0), 5.0)
    acceptance_rate = round(np.random.uniform(0.6, 0.95), 2)

    # Pay
    # Flat 40 plus 4 per km; no surge and no bonus unless the rule below fires.
    base_fare = round(40 + trip_distance_km * 4, 2)
    surge_multiplier = 1.0
    incentive_bonus = 0

    # Surge and bonus apply only to high-demand trips in the listed peak
    # hours (8, 9, 18, 19, 20). These two extra draws happen only on that
    # branch, so the RNG streams advance differently per row.
    if zone_demand_level == "High" and hour in [8, 9, 18, 19, 20]:
        surge_multiplier = round(np.random.uniform(1.2, 1.7), 2)
        incentive_bonus = random.choice([10, 20, 30])

    total_fare = round(base_fare * surge_multiplier + incentive_bonus, 2)

    # Note the order: zone_worker_density precedes zone_demand_level here.
    rows.append([
        date.strftime("%Y-%m-%d"),
        hour,
        pickup_zone,
        dropoff_zone,
        trip_distance_km,
        trip_duration_min,
        pickup_delay_min,
        traffic_index,
        worker_rating,
        acceptance_rate,
        zone_worker_density,
        zone_demand_level,
        base_fare,
        surge_multiplier,
        incentive_bonus,
        total_fare
    ])

# -------------------------
# Assemble the DataFrame
# -------------------------
# Column names, listed in the same order as the values appended to each
# entry of `rows`.
columns = [
    # when / where
    "date", "hour", "pickup_zone", "dropoff_zone",
    # trip
    "trip_distance_km", "trip_duration_min", "pickup_delay_min", "traffic_index",
    # worker
    "worker_rating", "acceptance_rate",
    # zone context
    "zone_worker_density", "zone_demand_level",
    # pay
    "base_fare", "surge_multiplier", "incentive_bonus", "total_fare",
]

df = pd.DataFrame(data=rows, columns=columns)

# -------------------------
# Persist to CSV (row index omitted) and preview
# -------------------------
df.to_csv("gig_dataset.csv", index=False)

print("Dataset generated: gig_dataset.csv")
print(df.head())


Dataset generated: gig_dataset.csv
         date  hour  pickup_zone    dropoff_zone  trip_distance_km  \
0  2024-03-07    20      Ukkadam        RS_Puram              1.51   
1  2024-03-08     8     RS_Puram        RS_Puram              2.56   
2  2024-03-01    15  Gandhipuram       Peelamedu              4.93   
3  2024-03-07    14    Peelamedu  Saibaba_Colony              2.51   
4  2024-03-13    13      Ukkadam     Singanallur              3.47   

   trip_duration_min  pickup_delay_min  traffic_index  worker_rating  \
0                6.6              12.6           1.41           4.66   
1                8.4               5.6           0.91           4.49   
2               22.8               5.3           0.93           4.45   
3               10.4               5.2           0.95           4.79   
4               11.6               5.7           0.92           4.68   

   acceptance_rate zone_worker_density zone_demand_level  base_fare  \
0             0.81                 Low  

In [None]:
# =========================
# MODEL 1: ASSIGNMENT AVAILABILITY
# =========================

import pandas as pd
import numpy as np

# -------------------------
# 1. Read the generated dataset
# -------------------------
df = pd.read_csv("gig_dataset.csv")

# -------------------------
# 2. Integer-encode the categorical columns
# -------------------------
from sklearn.preprocessing import LabelEncoder

cat_cols = ["pickup_zone", "zone_worker_density", "zone_demand_level"]

# Keep one fitted encoder per column so raw labels can be mapped to the
# same integer codes later on.
# LabelEncoder numbers classes in sorted order, so the level columns come
# out as "High" -> 0, "Low" -> 1, "Medium" -> 2 (not an ordinal scale).
encoders = {}
for col in cat_cols:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    encoders[col] = le

# -------------------------
# 3. Create target variable
# Assignment Availability Label
# 0 = Low, 1 = Medium, 2 = High
# -------------------------
# LabelEncoder assigns codes in sorted (alphabetical) class order, so the
# level columns are encoded High=0, Low=1, Medium=2 -- NOT Low=0 .. High=2.
# Hard-coding 2 for "High" and 0 for "Low" therefore tested the wrong
# categories. Resolve the codes through the fitted encoders instead.
DEMAND_HIGH = encoders["zone_demand_level"].transform(["High"])[0]
DEMAND_LOW = encoders["zone_demand_level"].transform(["Low"])[0]
DENSITY_HIGH = encoders["zone_worker_density"].transform(["High"])[0]
DENSITY_LOW = encoders["zone_worker_density"].transform(["Low"])[0]


def assignment_label(row):
    """Return the assignment-availability class for one encoded trip row.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide the label-encoded ``zone_demand_level`` and
        ``zone_worker_density`` values.

    Returns
    -------
    int
        2 (High) when demand is High and worker density is Low,
        0 (Low) when demand is Low and worker density is High,
        1 (Medium) for every other combination.
    """
    demand = row["zone_demand_level"]
    density = row["zone_worker_density"]
    if demand == DEMAND_HIGH and density == DENSITY_LOW:
        return 2  # High availability: many requests, few competing workers
    if demand == DEMAND_LOW and density == DENSITY_HIGH:
        return 0  # Low availability: few requests, many competing workers
    return 1  # Medium availability

df["assignment_level"] = df.apply(assignment_label, axis=1)

# -------------------------
# 4. Feature matrix (X) and target vector (y)
# -------------------------
# NOTE(review): assignment_level is computed purely from zone_demand_level
# and zone_worker_density, and both are included as features here, so the
# classifier can reproduce the labelling rule exactly.
feature_cols = [
    "hour",
    "pickup_zone",
    "zone_demand_level",
    "zone_worker_density",
    "traffic_index",
    "pickup_delay_min",
]
X = df[feature_cols]
y = df["assignment_level"]

# -------------------------
# 5. Stratified 80/20 hold-out split
# -------------------------
from sklearn.model_selection import train_test_split

# Stratifying on y keeps the class proportions equal in both partitions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -------------------------
# 6. Fit the Random Forest classifier
# -------------------------
from sklearn.ensemble import RandomForestClassifier

rf_params = {"n_estimators": 150, "max_depth": 8, "random_state": 42}
model_assignment = RandomForestClassifier(**rf_params)
model_assignment.fit(X_train, y_train)

# -------------------------
# 7. Score on the held-out partition
# -------------------------
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model_assignment.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

# -------------------------
# 8. Feature importances, largest first (for transparency)
# -------------------------
importances = model_assignment.feature_importances_
feature_importance = (
    pd.DataFrame({"feature": X.columns, "importance": importances})
    .sort_values(by="importance", ascending=False)
)

print("\nFeature Importance:\n")
print(feature_importance)

# -------------------------
# 9. Single-context example prediction
# -------------------------
# Raw category labels for the example; each is translated with the encoder
# fitted on the matching training column.
example_labels = {
    "pickup_zone": "Gandhipuram",
    "zone_demand_level": "High",
    "zone_worker_density": "Medium",
}
encoded_labels = {
    name: encoders[name].transform([label])[0]
    for name, label in example_labels.items()
}

# Key order reproduces the column order of the training matrix X.
example_input = pd.DataFrame([{
    "hour": 19,
    **encoded_labels,
    "traffic_index": 1.4,
    "pickup_delay_min": 6.5,
}])

label_map = {0: "Low", 1: "Medium", 2: "High"}
prediction = model_assignment.predict(example_input)[0]

print("\nPredicted Assignment Availability:", label_map[prediction])


Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00       157
           2       1.00      1.00      1.00        19

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Confusion Matrix:

[[ 24   0   0]
 [  0 157   0]
 [  0   0  19]]

Feature Importance:

               feature  importance
2    zone_demand_level    0.439950
3  zone_worker_density    0.426909
5     pickup_delay_min    0.045249
4        traffic_index    0.043174
0                 hour    0.027269
1          pickup_zone    0.017449

Predicted Assignment Availability: Low


In [None]:
# =========================
# FINAL INCENTIVE MODEL 2 (CALIBRATED)
# =========================

from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np

# Target: 1 when an incentive bonus was paid on the trip, 0 otherwise.
# The dataset generated in this notebook has no "incentive_active" column,
# so indexing it directly raises KeyError on a fresh kernel. Use the column
# when it is present; otherwise derive the flag from "incentive_bonus".
if "incentive_active" in df.columns:
    y_incentive = df["incentive_active"]
else:
    y_incentive = (df["incentive_bonus"] > 0).astype(int).rename("incentive_active")

# Train-test split
# X is the feature matrix built in the Model 1 cell, which must run first.
# Stratify so the rare positive class appears in both partitions.
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X,
    y_incentive,
    test_size=0.2,
    random_state=42,
    stratify=y_incentive
)

# Base estimator: a shallow forest with balanced class weights.
# min_samples_leaf=10 forces larger leaves, which tempers overconfident
# probability estimates.
rf_settings = dict(
    n_estimators=300,
    max_depth=6,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42,
)
rf = RandomForestClassifier(**rf_settings)

# Wrap the forest in 3-fold isotonic probability calibration and fit.
incentive_model = CalibratedClassifierCV(rf, method="isotonic", cv=3)
incentive_model.fit(X_train_i, y_train_i)

# -------------------------
# Decision threshold
# -------------------------
# Column 1 of predict_proba holds the positive-class probability.
probs = incentive_model.predict_proba(X_test_i)[:, 1]

THRESHOLD = 0.45   # cut-off chosen by the author in place of the default 0.5
is_positive = probs >= THRESHOLD
y_pred_i = is_positive.astype(int)

print("\nFINAL CALIBRATED INCENTIVE MODEL RESULTS")
print(classification_report(y_test_i, y_pred_i))



FINAL CALIBRATED INCENTIVE MODEL RESULTS
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       179
           1       1.00      0.86      0.92        21

    accuracy                           0.98       200
   macro avg       0.99      0.93      0.96       200
weighted avg       0.99      0.98      0.98       200



In [None]:
# =========================
# MODEL 3: PLATFORM ADAPTATION (CHANGE DETECTION)
# =========================

import pandas as pd
import numpy as np

# -------------------------
# 1. Load dataset
# -------------------------
# Reload from disk so this cell works on the raw (un-encoded) columns.
df = pd.read_csv("gig_dataset.csv")
df["date"] = pd.to_datetime(df["date"])

# -------------------------
# 2. Derived metrics
# -------------------------
# A trip whose distance is 0 would give an infinite pay_per_km, and a
# single inf poisons the zone/day mean and every rolling baseline built
# from it. Treat zero distance as missing so those trips are skipped by
# the mean instead.
df["pay_per_km"] = df["total_fare"] / df["trip_distance_km"].replace(0, np.nan)

# -------------------------
# 3. Aggregate by zone + day
# -------------------------
# One row per (pickup_zone, date) holding the mean of each pay metric,
# ordered by zone and then date so later rolling windows run in
# chronological order within each zone.
metric_cols = ["total_fare", "pay_per_km", "surge_multiplier"]
zone_day = (
    df.groupby(["pickup_zone", "date"])[metric_cols]
    .mean()
    .reset_index()
    .sort_values(["pickup_zone", "date"])
)

# -------------------------
# 4. Rolling baselines (normal behaviour)
# -------------------------
WINDOW = 5  # days


def _rolling_baseline(series, window=WINDOW, min_periods=3):
    """Return the mean of the previous `window` observations of `series`.

    The series is shifted by one step before the rolling mean so the
    current day is NOT part of its own baseline. Without the shift, every
    day would be compared against an average that already contains it,
    which shrinks any deviation. At least `min_periods` earlier
    observations are required, so the first three rows of each group have
    a NaN baseline.
    """
    return series.shift(1).rolling(window, min_periods=min_periods).mean()


# Relies on zone_day being ordered by date within each pickup_zone.
for metric_col, baseline_col in (
    ("total_fare", "baseline_fare"),
    ("pay_per_km", "baseline_paykm"),
    ("surge_multiplier", "baseline_surge"),
):
    zone_day[baseline_col] = (
        zone_day.groupby("pickup_zone")[metric_col].transform(_rolling_baseline)
    )

# -------------------------
# 5. Detect deviations
# -------------------------
PAY_DROP = 0.15      # flag when pay falls more than 15% below baseline
SURGE_DROP = 0.20    # flag when surge falls more than 20% below baseline


def _relative_drop(baseline_col, actual_col):
    """Fractional shortfall of a day's value against its baseline column.

    Positive values mean the day came in below baseline. Rows with a NaN
    baseline give NaN, which compares as False against any threshold.
    """
    baseline = zone_day[baseline_col]
    return (baseline - zone_day[actual_col]) / baseline


fare_shortfall = _relative_drop("baseline_fare", "total_fare")
paykm_shortfall = _relative_drop("baseline_paykm", "pay_per_km")
surge_shortfall = _relative_drop("baseline_surge", "surge_multiplier")

zone_day["fare_drop"] = fare_shortfall > PAY_DROP
zone_day["paykm_drop"] = paykm_shortfall > PAY_DROP
zone_day["surge_drop"] = surge_shortfall > SURGE_DROP

# -------------------------
# 6. Platform adaptation flag
# -------------------------
# An event needs at least two of the three drop signals on the same day.
signal_count = zone_day[["fare_drop", "paykm_drop", "surge_drop"]].sum(axis=1)
zone_day["platform_adaptation"] = signal_count >= 2

# -------------------------
# 7. Adaptation severity (for UI)
# -------------------------
# Sum of the fare and surge shortfalls; negative shortfalls (values above
# baseline) are floored at zero so they do not offset a real drop.
zone_day["adaptation_severity"] = (
    fare_shortfall.clip(lower=0) + surge_shortfall.clip(lower=0)
)

# -------------------------
# 8. Report detected events
# -------------------------
# Keep only the flagged zone/day rows and the columns worth displaying.
event_columns = [
    "pickup_zone",
    "date",
    "total_fare",
    "pay_per_km",
    "surge_multiplier",
    "adaptation_severity",
]
adaptation_events = zone_day.loc[zone_day["platform_adaptation"], event_columns]

print("\nPLATFORM ADAPTATION EVENTS:\n")
print(adaptation_events.head(10))



PLATFORM ADAPTATION EVENTS:

Empty DataFrame
Columns: [pickup_zone, date, total_fare, pay_per_km, surge_multiplier, adaptation_severity]
Index: []
