In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("/Users/prajitbaskaran/Downloads/delivery_with_synthetic_pay.csv")

# -----------------------------
# 1. Drop non-explainable columns
# -----------------------------
drop_cols = [
    "Order_ID",
    "Delivery_person_ID",
    "Restaurant_latitude",
    "Restaurant_longitude",
    "Delivery_location_latitude",
    "Delivery_location_longitude",
    "order_time"
]

df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# -----------------------------
# 2. Handle categorical columns
# -----------------------------
categorical_cols = df.select_dtypes(include=["object"]).columns

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# -----------------------------
# 3. Handle missing values
# -----------------------------
# Fill numeric columns with median
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# -----------------------------
# 4. Separate features and target
# -----------------------------
TARGET = "pay_received"

X = df.drop(columns=[TARGET])
y = df[TARGET]

print("Preprocessing complete.")
print("Shape of X:", X.shape)
print("Target:", TARGET)


Preprocessing complete.
Shape of X: (45593, 24)
Target: pay_received


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load dataset
input_path = "/Users/prajitbaskaran/Downloads/delivery_with_synthetic_pay.csv"
output_path = "/Users/prajitbaskaran/Downloads/Preprocessed.csv"

df = pd.read_csv(input_path)

# -----------------------------
# 1. Drop non-explainable columns
# -----------------------------
drop_cols = [
    "Order_ID",
    "Delivery_person_ID",
    "Restaurant_latitude",
    "Restaurant_longitude",
    "Delivery_location_latitude",
    "Delivery_location_longitude",
    "order_time"
]

df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# -----------------------------
# 2. Encode categorical columns
# -----------------------------
categorical_cols = df.select_dtypes(include=["object"]).columns

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# -----------------------------
# 3. Handle missing values
# -----------------------------
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# -----------------------------
# 4. Save preprocessed dataset
# -----------------------------
df.to_csv(output_path, index=False)

print("‚úÖ Preprocessing complete.")
print("üìÅ Saved to:", output_path)
print("Final dataset shape:", df.shape)


‚úÖ Preprocessing complete.
üìÅ Saved to: /Users/prajitbaskaran/Downloads/Preprocessed.csv
Final dataset shape: (45593, 25)


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# =============================
# 1. Load ORIGINAL dataset
# =============================
input_path = "/Users/prajitbaskaran/Downloads/delivery_with_synthetic_pay.csv"
output_path = "/Users/prajitbaskaran/Downloads/2delivery_with_synthetic_pay.csv"

df = pd.read_csv(input_path)

# =============================
# 2. Drop non-interpretable / ID columns
# =============================
drop_cols = [
    "ID",
    "Order_ID",
    "Delivery_person_ID",
    "Restaurant_latitude",
    "Restaurant_longitude",
    "Delivery_location_latitude",
    "Delivery_location_longitude",
    "order_time",
    "Order_Date",
    "Time_Ordered",
    "Time_Order_picked"
]

df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# =============================
# 3. Ensure correct numeric types
# =============================
numeric_cols_force = [
    "Delivery_person_Age",
    "Delivery_person_Ratings",
    "distance_km",
    "hour",
    "Time_taken(min)"
]

for col in numeric_cols_force:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# =============================
# 4. Convert boolean to int
# =============================
if "is_peak_hour" in df.columns:
    df["is_peak_hour"] = df["is_peak_hour"].astype(int)

# =============================
# 5. Explicit categorical encoding
# =============================
categorical_cols = [
    "City",
    "Weatherconditions",
    "Road_traffic_density",
    "Type_of_order",
    "Type_of_vehicle",
    "Vehicle_condition",
    "Festival"
]

label_encoders = {}
for col in categorical_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

# =============================
# 6. Handle missing values
# =============================
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# =============================
# 7. Save preprocessed dataset
# =============================
df.to_csv(output_path, index=False)

print("‚úÖ FINAL preprocessing complete")
print("üìÅ Saved to:", output_path)
print("Final shape:", df.shape)
print("\nAge sanity check:")
print(df["Delivery_person_Age"].describe())


‚úÖ FINAL preprocessing complete
üìÅ Saved to: /Users/prajitbaskaran/Downloads/2delivery_with_synthetic_pay.csv
Final shape: (45593, 22)

Age sanity check:
count    45593.000000
mean        29.584739
std          5.696333
min         15.000000
25%         25.000000
50%         30.000000
75%         34.000000
max         50.000000
Name: Delivery_person_Age, dtype: float64


In [None]:
input_path = "/Users/prajitbaskaran/Downloads/delivery_with_synthetic_pay.csv"
output_path = "/Users/prajitbaskaran/Downloads/2delivery_with_synthetic_pay.csv"

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# =============================
# 1. Load ORIGINAL dataset
# =============================
input_path = "/Users/prajitbaskaran/Downloads/delivery_with_synthetic_pay.csv"
output_path = "/Users/prajitbaskaran/Downloads/3delivery_with_synthetic_pay.csv"

df = pd.read_csv(input_path)

# =============================
# 2. FIX TIME PROPERLY
# =============================
# Convert Time_Orderd to datetime and extract hour
if "Time_Orderd" in df.columns:
    df["Time_Orderd"] = pd.to_datetime(df["Time_Orderd"], format="%H:%M:%S", errors="coerce")
    df["hour"] = df["Time_Orderd"].dt.hour

# =============================
# 3. Drop non-interpretable / ID / raw time columns
# =============================
drop_cols = [
    "ID",
    "Order_ID",
    "Delivery_person_ID",
    "Restaurant_latitude",
    "Restaurant_longitude",
    "Delivery_location_latitude",
    "Delivery_location_longitude",
    "order_time",
    "Order_Date",
    "Time_Ordered",
    "Time_Order_picked",
    "Time_Orderd"   # raw time dropped AFTER hour extraction
]

df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# =============================
# 4. FORCE correct numeric types
# =============================
numeric_cols_force = [
    "Delivery_person_Age",
    "Delivery_person_Ratings",
    "distance_km",
    "hour",
    "Time_taken(min)",
    "surge_bonus",
    "effort_bonus",
    "rating_penalty",
    "policy_multiplier",
    "pay_received"
]

for col in numeric_cols_force:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# =============================
# 5. Convert boolean flags
# =============================
if "is_peak_hour" in df.columns:
    df["is_peak_hour"] = df["is_peak_hour"].astype(int)

# =============================
# 6. Encode ONLY true categorical columns
# =============================
categorical_cols = [
    "City",
    "Weatherconditions",
    "Road_traffic_density",
    "Type_of_order",
    "Type_of_vehicle",
    "Vehicle_condition",
    "Festival"
]

label_encoders = {}
for col in categorical_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

# =============================
# 7. Handle missing values
# =============================
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# =============================
# 8. CAP EXTREME PAY OUTLIERS (CRITICAL FIX)
# =============================
if "pay_received" in df.columns:
    upper_cap = df["pay_received"].quantile(0.99)
    df["pay_received"] = df["pay_received"].clip(upper=upper_cap)

# =============================
# 9. Save CLEAN dataset
# =============================
df.to_csv(output_path, index=False)

print("‚úÖ FINAL preprocessing complete")
print("üìÅ Saved to:", output_path)
print("Final shape:", df.shape)

print("\nüîé Sanity checks:")
print("Age range:")
print(df["Delivery_person_Age"].describe())

print("\nHour distribution:")
print(df["hour"].value_counts().head())

print("\nPay stats:")
print(df["pay_received"].describe())


‚úÖ FINAL preprocessing complete
üìÅ Saved to: /Users/prajitbaskaran/Downloads/3delivery_with_synthetic_pay.csv
Final shape: (45593, 21)

üîé Sanity checks:
Age range:
count    45593.000000
mean        29.584739
std          5.696333
min         15.000000
25%         25.000000
50%         30.000000
75%         34.000000
max         50.000000
Name: Delivery_person_Age, dtype: float64

Hour distribution:
hour
19.0    6326
21.0    4686
22.0    4576
20.0    4539
23.0    4511
Name: count, dtype: int64

Pay stats:
count    45593.000000
mean       148.479375
std         61.600237
min         43.630000
25%         94.940000
50%        142.830000
75%        191.090000
max        302.650000
Name: pay_received, dtype: float64


In [5]:
# ===============================
# SHAP-GUIDED RL FEATURE SELECTION
# ONE-CELL PIPELINE
# ===============================

import numpy as np
import pandas as pd
import shap
import warnings

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")
np.random.seed(42)

# -------- CONFIG --------
INPUT_PATH = "/Users/prajitbaskaran/Downloads/3delivery_with_synthetic_pay.csv"
OUTPUT_PATH = "FeatureSelected_FINAL.csv"
TARGET = "pay_received"

N_EPISODES = 30        # RL iterations (keep small for speed)
MAX_FLIPS = 2          # PPO-style constrained actions
SUBSAMPLE_FRAC = 0.25  # speed-up

# -------- LOAD DATA --------
df = pd.read_csv(INPUT_PATH)
X_full = df.drop(columns=[TARGET])
y_full = df[TARGET]

feature_names = X_full.columns.tolist()
N_FEATURES = len(feature_names)

# Subsample for RL evaluation
X_sub, _, y_sub, _ = train_test_split(
    X_full, y_full, train_size=SUBSAMPLE_FRAC, random_state=42
)

# -------- ENVIRONMENT --------
def evaluate_subset(mask):
    selected = [f for f, m in zip(feature_names, mask) if m == 1]

    if len(selected) < 4:
        return -1e6

    X_sel = X_sub[selected]

    X_tr, X_te, y_tr, y_te = train_test_split(
        X_sel, y_sub, test_size=0.2, random_state=42
    )

    model = GradientBoostingRegressor(
        n_estimators=80,
        max_depth=3,
        learning_rate=0.1,
        random_state=42
    )

    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)
    rmse = np.sqrt(mean_squared_error(y_te, preds))

    explainer = shap.TreeExplainer(model)
    shap_vals = explainer.shap_values(X_tr, check_additivity=False)

    shap_var = np.mean(np.var(np.abs(shap_vals), axis=0))

    # Reward: accuracy + SHAP stability + sparsity
    reward = (
        - rmse
        - 0.4 * shap_var
        - 0.03 * len(selected)
    )

    return reward

# -------- PPO-INSPIRED RL LOOP --------
current_mask = np.random.randint(0, 2, N_FEATURES)
best_mask = current_mask.copy()
best_reward = -np.inf

for ep in range(N_EPISODES):

    candidate = current_mask.copy()
    flip_idx = np.random.choice(
        N_FEATURES,
        size=np.random.randint(1, MAX_FLIPS + 1),
        replace=False
    )
    candidate[flip_idx] = 1 - candidate[flip_idx]

    reward = evaluate_subset(candidate)

    if reward > best_reward:
        best_reward = reward
        best_mask = candidate.copy()
        current_mask = candidate.copy()

    print(f"Episode {ep:02d} | Reward={reward:.4f} | Features={candidate.sum()}")

# -------- FINAL FEATURE SET --------
selected_features = [
    f for f, m in zip(feature_names, best_mask) if m == 1
]

print("\n‚úÖ FINAL SELECTED FEATURES:")
for f in selected_features:
    print("-", f)

# -------- SAVE FINAL DATASET --------
final_df = df[selected_features + [TARGET]]
final_df.to_csv(OUTPUT_PATH, index=False)

print("\nüìÅ Saved feature-selected dataset to:", OUTPUT_PATH)
print("Final shape:", final_df.shape)


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


ValueError: Input X contains NaN.
GradientBoostingRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [8]:
import pandas as pd
import numpy as np

# ===============================
# CONFIG
# ===============================
INPUT_PATH = "/Users/prajitbaskaran/Downloads/delivery_with_synthetic_pay.csv"
OUTPUT_PATH = "/Users/prajitbaskaran/Downloads/finalig_v01.csv"
TARGET = "pay_received"

# ===============================
# LOAD DATA
# ===============================
df = pd.read_csv(INPUT_PATH)

# ===============================
# 1. DROP UNWANTED / DANGEROUS COLUMNS
# ===============================
drop_cols = [
    # IDs
    "ID", "Order_ID", "Delivery_person_ID",

    # Exact locations (not explainable)
    "Restaurant_latitude", "Restaurant_longitude",
    "Delivery_location_latitude", "Delivery_location_longitude",

    # Raw timestamps (will derive hour instead)
    "order_time", "Order_Date",
    "Time_Ordered", "Time_Order_picked"
]

df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# ===============================
# 2. HANDLE TIME PROPERLY
# ===============================
# Extract hour from Time_Orderd if present
if "Time_Orderd" in df.columns:
    df["Time_Orderd"] = pd.to_datetime(
        df["Time_Orderd"], format="%H:%M:%S", errors="coerce"
    )
    df["hour"] = df["Time_Orderd"].dt.hour
    df = df.drop(columns=["Time_Orderd"])

# ===============================
# 3. FIX DATA TYPES (CRITICAL)
# ===============================
numeric_cols = [
    "Delivery_person_Age",
    "Delivery_person_Ratings",
    "distance_km",
    "hour",
    "Time_taken(min)",
    "surge_bonus",
    "effort_bonus",
    "rating_penalty",
    "policy_multiplier",
    TARGET
]

for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# ===============================
# 4. CONVERT BOOLEAN FLAGS
# ===============================
if "is_peak_hour" in df.columns:
    df["is_peak_hour"] = df["is_peak_hour"].astype(int)

# ===============================
# 5. HANDLE MISSING VALUES
# ===============================
# Numeric ‚Üí median
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical ‚Üí mode
cat_cols = df.select_dtypes(include=["object"]).columns
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# ===============================
# 6. REMOVE IMPOSSIBLE VALUES
# ===============================
# Age sanity
if "Delivery_person_Age" in df.columns:
    df = df[(df["Delivery_person_Age"] >= 18) & (df["Delivery_person_Age"] <= 60)]

# Hour sanity
if "hour" in df.columns:
    df = df[(df["hour"] >= 0) & (df["hour"] <= 23)]

# ===============================
# 7. CAP EXTREME PAY OUTLIERS
# ===============================
upper_cap = df[TARGET].quantile(0.99)
df[TARGET] = df[TARGET].clip(upper=upper_cap)

# ===============================
# 8. FINAL SAFETY CHECK
# ===============================
# Remove any remaining non-numeric columns EXCEPT target
non_numeric = df.drop(columns=[TARGET]).select_dtypes(exclude=["int64", "float64"]).columns
df = df.drop(columns=non_numeric)

# ===============================
# 9. SAVE CLEAN DATASET
# ===============================
df.to_csv(OUTPUT_PATH, index=False)

print("‚úÖ PREPROCESSING COMPLETE")
print("Saved to:", OUTPUT_PATH)
print("Final shape:", df.shape)

print("\nSanity checks:")
print("Age range:", df["Delivery_person_Age"].min(), "-", df["Delivery_person_Age"].max())
print("Hour unique:", sorted(df["hour"].unique())[:5], "...")
print("Pay stats:\n", df[TARGET].describe())


‚úÖ PREPROCESSING COMPLETE
Saved to: /Users/prajitbaskaran/Downloads/finalig_v01.csv
Final shape: (45555, 14)

Sanity checks:
Age range: 20.0 - 50.0
Hour unique: [0.0, 8.0, 9.0, 10.0, 11.0] ...
Pay stats:
 count    45555.000000
mean       148.450890
std         61.568836
min         43.630000
25%         94.940000
50%        142.830000
75%        191.090000
max        302.640000
Name: pay_received, dtype: float64


In [None]:
PREPROCESSING

In [24]:
import pandas as pd
import numpy as np

# ===============================
# CONFIG
# ===============================
INPUT_PATH = "/Users/prajitbaskaran/Downloads/last.csv"
OUTPUT_PATH = "/Users/prajitbaskaran/Downloads/last_preprocessed.csv"
TARGET = "pay_received"

# ===============================
# LOAD DATA
# ===============================
df = pd.read_csv(INPUT_PATH)

# ===============================
# 1. DROP UNWANTED / UNSAFE COLUMNS
# ===============================
drop_cols = [
    # IDs
    "ID", "Order_ID", "Delivery_person_ID",

    # Exact locations (not explainable)
    "Restaurant_latitude", "Restaurant_longitude",
    "Delivery_location_latitude", "Delivery_location_longitude",

    # Raw timestamps
    "order_time", "Order_Date",
    "Time_Ordered", "Time_Order_picked"
]

df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# ===============================
# 2. HANDLE TIME (EXTRACT HOUR)
# ===============================
if "Time_Orderd" in df.columns:
    df["Time_Orderd"] = pd.to_datetime(
        df["Time_Orderd"], format="%H:%M:%S", errors="coerce"
    )
    df["hour"] = df["Time_Orderd"].dt.hour
    df = df.drop(columns=["Time_Orderd"])

# ===============================
# 3. FORCE NUMERIC TYPES
# ===============================
numeric_cols = [
    "Delivery_person_Age",
    "Delivery_person_Ratings",
    "distance_km",
    "hour",
    "Time_taken(min)",
    "surge_bonus",
    "effort_bonus",
    "rating_penalty",
    "policy_multiplier",
    TARGET
]

for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# ===============================
# 4. BOOLEAN ‚Üí INT
# ===============================
if "is_peak_hour" in df.columns:
    df["is_peak_hour"] = df["is_peak_hour"].astype(int)

# ===============================
# 5. HANDLE MISSING VALUES
# ===============================
# Numeric ‚Üí median
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# ===============================
# 6. REMOVE IMPOSSIBLE VALUES
# ===============================
if "Delivery_person_Age" in df.columns:
    df = df[(df["Delivery_person_Age"] >= 18) & (df["Delivery_person_Age"] <= 60)]

if "hour" in df.columns:
    df = df[(df["hour"] >= 0) & (df["hour"] <= 23)]

# ===============================
# 7. CAP EXTREME PAY OUTLIERS
# ===============================
upper_cap = df[TARGET].quantile(0.99)
df[TARGET] = df[TARGET].clip(upper=upper_cap)

# ===============================
# 8. DROP EMPTY (ALL-NaN) COLUMNS  ‚Üê CRITICAL FIX
# ===============================
empty_cols = [c for c in df.columns if df[c].isna().all()]
df = df.drop(columns=empty_cols)

# ===============================
# 9. FINAL SAFETY: KEEP ONLY NUMERIC FEATURES
# ===============================
non_numeric = df.drop(columns=[TARGET]) \
                .select_dtypes(exclude=["int64", "float64"]).columns
df = df.drop(columns=non_numeric)

# ===============================
# 10. SAVE CLEAN DATASET
# ===============================
df.to_csv(OUTPUT_PATH, index=False)

print("‚úÖ FINAL PREPROCESSING COMPLETE")
print("Saved to:", OUTPUT_PATH)
print("Final shape:", df.shape)

print("\nDropped empty columns:", empty_cols)
print("\nSanity checks:")
print("Age range:", df["Delivery_person_Age"].min(), "-", df["Delivery_person_Age"].max())
print("Hour unique sample:", sorted(df["hour"].unique())[:5])
print("Pay stats:\n", df[TARGET].describe())


KeyError: "['pay_received'] not found in axis"

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# ===============================
# LOAD DATASET
# ===============================
df = pd.read_csv("/Users/prajitbaskaran/Downloads/1finalig_v02.csv")

TARGET = "pay_received"

X = df.drop(columns=[TARGET])
y = df[TARGET]

# ===============================
# TRAIN-TEST SPLIT
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ===============================
# TRAIN RANDOM FOREST (SIMPLE)
# ===============================
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

# ===============================
# EVALUATE
# ===============================
y_pred = rf.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("‚úÖ Random Forest R¬≤ score:", round(r2, 4))


‚úÖ Random Forest R¬≤ score: 0.9923


In [14]:
import pandas as pd
import numpy as np

# ===============================
# LOAD DATA
# ===============================
input_path = "/Users/prajitbaskaran/Downloads/finalig_v02.csv"
output_path = "/Users/prajitbaskaran/Downloads/1finalig_v02.csv"

df = pd.read_csv(input_path)

# ===============================
# ADD SMALL GAUSSIAN NOISE
# ===============================
np.random.seed(42)  # reproducibility

noise = np.random.normal(loc=0, scale=5, size=len(df))
df["pay_received"] = df["pay_received"] + noise

# Ensure pay is non-negative
df["pay_received"] = df["pay_received"].clip(lower=0)

# ===============================
# SAVE NEW DATASET
# ===============================
df.to_csv(output_path, index=False)

print("‚úÖ Noise added to pay_received")
print("üìÅ Saved to:", output_path)

print("\nNew pay statistics:")
print(df["pay_received"].describe())


‚úÖ Noise added to pay_received
üìÅ Saved to: /Users/prajitbaskaran/Downloads/1finalig_v02.csv

New pay statistics:
count    45555.000000
mean       148.452278
std         61.778520
min         28.307871
25%         97.193862
50%        142.799249
75%        189.802780
max        315.440423
Name: pay_received, dtype: float64


In [19]:
import pandas as pd
import numpy as np
import shap
import warnings

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")
np.random.seed(42)

# ===============================
# CONFIG
# ===============================
INPUT_PATH = "/Users/prajitbaskaran/Downloads/1finalig_v02.csv"
OUTPUT_PATH = "/Users/prajitbaskaran/Downloads/RL_2__1finalig_v02.csv"
TARGET = "pay_received"

N_EPISODES = 30        # RL iterations
MAX_FLIPS = 2          # PPO-style constrained action
SUBSAMPLE_FRAC = 0.25  # speed-up

# ===============================
# LOAD DATA
# ===============================
df = pd.read_csv(INPUT_PATH)

X_full = df.drop(columns=[TARGET])
y_full = df[TARGET]

feature_names = X_full.columns.tolist()
N_FEATURES = len(feature_names)

# Subsample for RL evaluation
X_sub, _, y_sub, _ = train_test_split(
    X_full, y_full, train_size=SUBSAMPLE_FRAC, random_state=42
)

# ===============================
# ENVIRONMENT (FAST)
# ===============================
def evaluate_subset(mask):
    selected = [f for f, m in zip(feature_names, mask) if m == 1]

    if len(selected) < 4:
        return -1e6

    X_sel = X_sub[selected]

    X_tr, X_te, y_tr, y_te = train_test_split(
        X_sel, y_sub, test_size=0.2, random_state=42
    )

    model = GradientBoostingRegressor(
        n_estimators=80,
        max_depth=3,
        learning_rate=0.1,
        random_state=42
    )

    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)

    rmse = np.sqrt(mean_squared_error(y_te, preds))

    explainer = shap.TreeExplainer(model)
    shap_vals = explainer.shap_values(X_tr, check_additivity=False)

    shap_var = np.mean(np.var(np.abs(shap_vals), axis=0))

    reward = (
        - rmse               # accuracy
        - 0.4 * shap_var     # explanation stability
        - 0.03 * len(selected)  # sparsity
    )

    return reward

# ===============================
# PPO-INSPIRED RL LOOP
# ===============================
current_mask = np.random.randint(0, 2, N_FEATURES)
best_mask = current_mask.copy()
best_reward = -np.inf

print("Starting SHAP-guided RL feature selection...\n")

for ep in range(N_EPISODES):

    candidate = current_mask.copy()
    flip_idx = np.random.choice(
        N_FEATURES,
        size=np.random.randint(1, MAX_FLIPS + 1),
        replace=False
    )
    candidate[flip_idx] = 1 - candidate[flip_idx]

    reward = evaluate_subset(candidate)

    if reward > best_reward:
        best_reward = reward
        best_mask = candidate.copy()
        current_mask = candidate.copy()

    print(
        f"Episode {ep:02d} | "
        f"Reward={reward:.4f} | "
        f"Features={candidate.sum()}"
    )

# ===============================
# FINAL FEATURE SET
# ===============================
selected_features = [
    f for f, m in zip(feature_names, best_mask) if m == 1
]

print("\n‚úÖ FINAL SELECTED FEATURES:")
for f in selected_features:
    print("-", f)

# ===============================
# SAVE FEATURE-SELECTED DATASET
# ===============================
final_df = df[selected_features + [TARGET]]
final_df.to_csv(OUTPUT_PATH, index=False)

print("\nüìÅ Saved RL-selected dataset to:", OUTPUT_PATH)
print("Final shape:", final_df.shape)


Starting SHAP-guided RL feature selection...

Episode 00 | Reward=-66.3749 | Features=4
Episode 01 | Reward=-63.7072 | Features=4
Episode 02 | Reward=-66.3749 | Features=4
Episode 03 | Reward=-63.3595 | Features=5
Episode 04 | Reward=-63.0385 | Features=7
Episode 05 | Reward=-61.7837 | Features=5
Episode 06 | Reward=-63.3765 | Features=5
Episode 07 | Reward=-66.5399 | Features=5
Episode 08 | Reward=-84.2316 | Features=6
Episode 09 | Reward=-99.5228 | Features=5
Episode 10 | Reward=-61.7890 | Features=4
Episode 11 | Reward=-66.5399 | Features=5
Episode 12 | Reward=-61.7890 | Features=4
Episode 13 | Reward=-65.6892 | Features=6
Episode 14 | Reward=-1000000.0000 | Features=3
Episode 15 | Reward=-64.4641 | Features=6
Episode 16 | Reward=-1000000.0000 | Features=3
Episode 17 | Reward=-1000000.0000 | Features=3
Episode 18 | Reward=-1000000.0000 | Features=3
Episode 19 | Reward=-65.7929 | Features=7
Episode 20 | Reward=-61.7890 | Features=4
Episode 21 | Reward=-74.6666 | Features=7
Episode 22

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# ===============================
# LOAD DATASET
# ===============================
df = pd.read_csv("/Users/prajitbaskaran/Downloads/RL_2__1finalig_v02.csv")

TARGET = "pay_received"

X = df.drop(columns=[TARGET])
y = df[TARGET]

# ===============================
# TRAIN-TEST SPLIT
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ===============================
# TRAIN RANDOM FOREST (SIMPLE)
# ===============================
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

# ===============================
# EVALUATE
# ===============================
y_pred = rf.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("‚úÖ Random Forest R¬≤ score:", round(r2, 4))


‚úÖ Random Forest R¬≤ score: 0.0004


In [21]:
import pandas as pd
import numpy as np
import shap
import warnings

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")

# ===============================
# CONFIG
# ===============================
INPUT_PATH = "/Users/prajitbaskaran/Downloads/1finalig_v02.csv"
OUTPUT_PATH = "/Users/prajitbaskaran/Downloads/RLV1_1finalig_v02.csv"
TARGET = "pay_received"

MANDATORY_FEATURES = ["distance_km", "hour"]  # hard constraints

K_RUNS = 5
SELECTION_THRESHOLD = 0.6   # 60% of runs
N_EPISODES = 25
MAX_FLIPS = 2
SUBSAMPLE_FRAC = 0.25

np.random.seed(42)

# ===============================
# LOAD DATA
# ===============================
df = pd.read_csv(INPUT_PATH)

X_full = df.drop(columns=[TARGET])
y_full = df[TARGET]

feature_names = X_full.columns.tolist()
N_FEATURES = len(feature_names)

feature_index = {f: i for i, f in enumerate(feature_names)}

# ===============================
# SUBSAMPLE FOR SPEED
# ===============================
X_sub, _, y_sub, _ = train_test_split(
    X_full, y_full, train_size=SUBSAMPLE_FRAC, random_state=42
)

# ===============================
# ENVIRONMENT FUNCTION
# ===============================
def evaluate_subset(mask):
    selected = [f for f, m in zip(feature_names, mask) if m == 1]

    if len(selected) < 4:
        return -1e6

    X_sel = X_sub[selected]

    X_tr, X_te, y_tr, y_te = train_test_split(
        X_sel, y_sub, test_size=0.2, random_state=42
    )

    model = GradientBoostingRegressor(
        n_estimators=80,
        max_depth=3,
        learning_rate=0.1,
        random_state=42
    )

    model.fit(X_tr, y_tr)
    preds = model.predict(X_te)

    rmse = np.sqrt(mean_squared_error(y_te, preds))

    explainer = shap.TreeExplainer(model)
    shap_vals = explainer.shap_values(X_tr, check_additivity=False)

    shap_var = np.mean(np.var(np.abs(shap_vals), axis=0))

    reward = (
        - rmse
        - 0.4 * shap_var
        - 0.02 * len(selected)   # weaker sparsity penalty
    )

    return reward

# ===============================
# RUN RL K TIMES
# ===============================
all_selected_masks = []

print("Running constrained RL feature selection...\n")

for run in range(K_RUNS):
    np.random.seed(42 + run)

    # initialize random mask
    current_mask = np.random.randint(0, 2, N_FEATURES)

    # enforce mandatory features
    for f in MANDATORY_FEATURES:
        if f in feature_index:
            current_mask[feature_index[f]] = 1

    best_mask = current_mask.copy()
    best_reward = -np.inf

    for ep in range(N_EPISODES):
        candidate = current_mask.copy()

        flip_idx = np.random.choice(
            N_FEATURES,
            size=np.random.randint(1, MAX_FLIPS + 1),
            replace=False
        )

        candidate[flip_idx] = 1 - candidate[flip_idx]

        # re-enforce mandatory features
        for f in MANDATORY_FEATURES:
            if f in feature_index:
                candidate[feature_index[f]] = 1

        reward = evaluate_subset(candidate)

        if reward > best_reward:
            best_reward = reward
            best_mask = candidate.copy()
            current_mask = candidate.copy()

    all_selected_masks.append(best_mask)
    print(f"Run {run+1}/{K_RUNS} completed")

# ===============================
# AGGREGATE SELECTIONS
# ===============================
selection_counts = np.sum(all_selected_masks, axis=0) / K_RUNS

stable_features = [
    f for f, freq in zip(feature_names, selection_counts)
    if freq >= SELECTION_THRESHOLD
]

# ensure mandatory features are present
for f in MANDATORY_FEATURES:
    if f not in stable_features and f in feature_names:
        stable_features.append(f)

print("\n‚úÖ STABLE SELECTED FEATURES:")
for f in stable_features:
    print("-", f)

# ===============================
# SAVE FINAL DATASET
# ===============================
final_df = df[stable_features + [TARGET]]
final_df.to_csv(OUTPUT_PATH, index=False)

print("\nüìÅ Saved stable RL-selected dataset to:", OUTPUT_PATH)
print("Final shape:", final_df.shape)


Running constrained RL feature selection...

Run 1/5 completed
Run 2/5 completed
Run 3/5 completed
Run 4/5 completed
Run 5/5 completed

‚úÖ STABLE SELECTED FEATURES:
- Delivery_person_Age
- Delivery_person_Ratings
- Vehicle_condition
- distance_km
- hour
- is_peak_hour
- weather_severity
- traffic_severity
- surge_bonus
- effort_bonus
- rating_penalty
- policy_multiplier

üìÅ Saved stable RL-selected dataset to: /Users/prajitbaskaran/Downloads/RLV1_1finalig_v02.csv
Final shape: (45555, 13)


In [22]:
import pandas as pd
import numpy as np
import shap
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

# ===============================
# LOAD DATA
# ===============================
df = pd.read_csv("/Users/prajitbaskaran/Downloads/RLV1_1finalig_v02.csv")
TARGET = "pay_received"

X = df.drop(columns=[TARGET])
y = df[TARGET]

# ===============================
# LOAD OR TRAIN MODEL
# ===============================
try:
    model = joblib.load("final_pay_model.pkl")
    explainer = joblib.load("final_shap_explainer.pkl")
    print("‚úÖ Loaded saved model & explainer")
except:
    print("‚ö†Ô∏è Saved model not found ‚Äî training quickly")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=3,
        learning_rate=0.05,
        random_state=42
    )

    model.fit(X_train, y_train)
    explainer = shap.TreeExplainer(model)

# ===============================
# PICK ONE DELIVERY TO EXPLAIN
# ===============================
idx = 0  # change this to explain any delivery

x_instance = X.iloc[[idx]]
actual_pay = y.iloc[idx]
predicted_pay = model.predict(x_instance)[0]

# ===============================
# COMPUTE SHAP VALUES
# ===============================
shap_values = explainer.shap_values(x_instance)[0]
base_value = explainer.expected_value

# ===============================
# BUILD PLAIN-ENGLISH EXPLANATION
# ===============================
contributions = list(zip(X.columns, shap_values))
contributions = sorted(contributions, key=lambda x: abs(x[1]), reverse=True)

print("\nüßæ PAY EXPLANATION (GLASS BOX)")
print("--------------------------------------------------")
print(f"Predicted Pay : ‚Çπ{predicted_pay:.2f}")
print(f"Actual Pay    : ‚Çπ{actual_pay:.2f}")
print(f"Base Pay      : ‚Çπ{base_value:.2f}\n")

print("üîç How each factor affected your pay:\n")

for feature, value in contributions:
    if abs(value) < 0.5:
        continue  # ignore tiny effects

    if value > 0:
        print(f"‚ûï {feature} increased your pay by ‚Çπ{value:.2f}")
    else:
        print(f"‚ûñ {feature} reduced your pay by ‚Çπ{abs(value):.2f}")

print("\nüßÆ Final Explanation:")
print(
    f"Starting from a base pay of ‚Çπ{base_value:.2f}, "
    f"the above factors were added or subtracted to arrive at "
    f"a final predicted pay of ‚Çπ{predicted_pay:.2f}."
)

# ===============================
# OPTIONAL: VISUAL FORCE PLOT
# ===============================
shap.force_plot(
    base_value,
    shap_values,
    x_instance,
    matplotlib=True
)


‚ö†Ô∏è Saved model not found ‚Äî training quickly

üßæ PAY EXPLANATION (GLASS BOX)
--------------------------------------------------
Predicted Pay : ‚Çπ77.58
Actual Pay    : ‚Çπ80.19


TypeError: unsupported format string passed to numpy.ndarray.__format__