In [None]:
# =====================
# Imports
# =====================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import joblib
import os
from datetime import datetime

# Apply seaborn's "whitegrid" style globally so the plot drawn later in this
# file picks it up.
sns.set(style="whitegrid")

# =====================
# 1. Train & Save Model
# =====================
def train_model():
    """Train the sprint-time Random Forest and persist its artifacts.

    Reads ``mobileapp.xlsx`` from the working directory, imputes missing
    values (median for numeric columns, mode for ``Weather_Type`` and
    ``Track_Type``), one-hot encodes those two categorical columns,
    standard-scales every column whose name starts with ``SP_``, and fits a
    ``RandomForestRegressor`` on an 80 % training split with
    ``NewSprinterTime`` as the target.

    Side effects:
        Writes ``sp_scaler.pkl``, ``random_forest_model.pkl`` and
        ``feature_columns.pkl`` to the working directory and prints progress
        messages, including the R^2 score on the 20 % hold-out split.

    Returns:
        None.
    """
    print("üì• Loading dataset...")
    df = pd.read_excel("mobileapp.xlsx")

    # Handle missing data. Select every numeric dtype, not only
    # float64/int64, so narrower numeric columns are imputed too.
    numeric_cols = df.select_dtypes(include="number").columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
    categorical_cols = ['Weather_Type', 'Track_Type']
    for col in categorical_cols:
        modes = df[col].mode()
        # mode() is empty when the whole column is missing; in that case
        # there is nothing to impute with, so leave the column alone.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # Prepare features
    target = "NewSprinterTime"
    y = df[target]
    X = df.drop(columns=[target])
    X = pd.get_dummies(X, columns=categorical_cols, drop_first=False)

    # Convert all to numeric. Values that cannot be parsed become NaN and are
    # then zero-filled, so a stray text column does not abort training.
    X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

    # Scale SP_ columns; the fitted scaler is saved so predict() can apply
    # the same transformation to new input.
    sp_cols = [col for col in X.columns if col.startswith("SP_")]
    scaler = StandardScaler()
    X[sp_cols] = scaler.fit_transform(X[sp_cols])
    joblib.dump(scaler, "sp_scaler.pkl")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train Random Forest
    print("üèãÔ∏è Training Random Forest model...")
    model = RandomForestRegressor(
        n_estimators=800,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Report fit quality on the hold-out rows so the split is actually used.
    print(f"Hold-out R^2: {model.score(X_test, y_test):.3f}")

    # Save model + columns (column order is needed to rebuild input rows).
    joblib.dump(model, "random_forest_model.pkl")
    joblib.dump(X.columns.tolist(), "feature_columns.pkl")
    print("üíæ Model, scaler, and feature columns saved successfully!")


# =====================
# 2. Prediction + Comparison Plot
# =====================
def _find_dummy_column(prefix, value, feature_cols):
    """Return the one-hot column in *feature_cols* for ``prefix + value``.

    An exact match wins; otherwise surrounding whitespace and letter case are
    ignored. Returns ``None`` when no column matches.
    """
    exact = f"{prefix}{value}"
    if exact in feature_cols:
        return exact
    wanted = f"{prefix}{str(value).strip()}".casefold()
    for col in feature_cols:
        if str(col).casefold() == wanted:
            return col
    return None


def predict(sprinter_name, today_time, weather_type, track_type, target_date_str):
    """Predict a sprinter's race time, print a summary and show a bar chart.

    The saved Random Forest predicts a time from a single input row in which
    every ``SP_`` feature is set to ``today_time`` and the matching
    ``Weather_Type_*`` / ``Track_Type_*`` indicator columns are set to 1 (all
    other features are 0). The prediction is blended with ``today_time``
    using a weight that grows linearly with the number of days until the
    target date, reaching full model weight at 365 days. The blended time is
    compared with a fixed 13.0 s benchmark to derive a logistic win
    probability and a verdict label.

    Args:
        sprinter_name: Name shown in the summary and the plot title.
        today_time: Today's performance in seconds.
        weather_type: Weather category; matched against the trained
            ``Weather_Type_*`` columns ignoring case and outer whitespace.
        track_type: Track category; matched the same way against
            ``Track_Type_*`` columns.
        target_date_str: Target race date formatted as ``YYYY-MM-DD``.

    Returns:
        None. Results are printed and plotted.

    Raises:
        ValueError: If ``target_date_str`` is not in ``YYYY-MM-DD`` format.
    """
    # Parse the date first so a malformed value fails before any model work.
    tgt = datetime.strptime(target_date_str, "%Y-%m-%d")

    # Ensure trained components exist
    for f in ["random_forest_model.pkl", "feature_columns.pkl", "sp_scaler.pkl"]:
        if not os.path.exists(f):
            print(f"‚ö†Ô∏è {f} not found ‚Äî training model...")
            train_model()
            # train_model() writes all three artifacts, so one run suffices.
            break

    # Load model assets
    model = joblib.load("random_forest_model.pkl")
    feature_cols = joblib.load("feature_columns.pkl")
    scaler = joblib.load("sp_scaler.pkl")

    # Build input row: every SP_ feature carries today's time, the rest are 0.
    new_row = {
        col: (today_time if col.startswith("SP_") else 0)
        for col in feature_cols
    }

    # Set categorical variables; warn instead of silently predicting with no
    # category when the value was not seen during training.
    categorical_inputs = (
        ("Weather", "Weather_Type_", weather_type),
        ("Track", "Track_Type_", track_type),
    )
    for label, prefix, value in categorical_inputs:
        dummy_col = _find_dummy_column(prefix, value, feature_cols)
        if dummy_col is None:
            print(f"Warning: unknown {label} type {value!r}; "
                  f"no {label.lower()} category is set for this prediction.")
        else:
            new_row[dummy_col] = 1

    new_df = pd.DataFrame([new_row])[feature_cols]
    sp_cols = [col for col in new_df.columns if col.startswith("SP_")]
    new_df[sp_cols] = scaler.transform(new_df[sp_cols])

    # Predict
    pred_time = float(model.predict(new_df)[0])

    # Horizon-weighted adjustment. Compare calendar dates: subtracting
    # datetime.now() (which carries a time of day) from a midnight target
    # floors the difference and under-counts the horizon by one day.
    horizon_days = (tgt.date() - datetime.now().date()).days
    w_model = np.clip(horizon_days / 365, 0, 1)
    w_today = 1 - w_model
    adjusted_time = w_model * pred_time + w_today * today_time

    benchmark = 13.0
    gap = adjusted_time - benchmark
    # Logistic curve in the gap: 50 % exactly at the benchmark, higher when
    # the adjusted time is below it.
    prob = 1 / (1 + np.exp(5 * gap))

    # Determine verdict based on probability
    if prob < 0.001:
        verdict = "üîß No Chance"
    elif prob >= 0.75:
        verdict = "üèÖ Likely Winner"
    elif prob >= 0.3:
        verdict = "ü•à Top 3 Potential"
    else:
        verdict = "üîß Needs Improvement"

    # Summary
    print("\n===== üèÉ PREDICTION SUMMARY =====")
    print(f"Sprinter Name       : {sprinter_name}")
    print(f"Today's Performance : {today_time:.2f}s")
    print(f"Adjusted (Weighted) : {adjusted_time:.2f}s")
    print(f"Benchmark           : {benchmark:.2f}s")
    print(f"Gap                 : {gap:.2f}s")
    print(f"Horizon Days        : {horizon_days}")
    print(f"Win Probability     : {prob*100:.1f}%")
    print(f"Verdict             : {verdict}")

    # Comparison Plot
    plt.figure(figsize=(9, 5))
    times = [today_time, adjusted_time]
    labels = ["Today", "Adjusted"]
    colors = ["#1f77b4", "#2ca02c"]

    bars = plt.bar(labels, times, color=colors, edgecolor="black", alpha=0.8)
    plt.axhline(y=benchmark, color="red", linestyle="--", linewidth=2, label=f"Benchmark ({benchmark:.2f}s)")

    # Value label just above each bar.
    for bar, val in zip(bars, times):
        plt.text(bar.get_x() + bar.get_width()/2, val + 0.05,
                 f"{val:.2f}s", ha='center', va='bottom', fontsize=10, fontweight='bold')

    # Gap-to-benchmark label, placed on the same side of the benchmark line
    # as the bar's value.
    for i, val in enumerate(times):
        gap_val = val - benchmark
        direction = "faster" if gap_val < 0 else "slower"
        color = "green" if gap_val < 0 else "red"
        plt.text(i, benchmark + 0.25 * np.sign(gap_val),
                 f"{abs(gap_val):.2f}s {direction}", color=color,
                 ha='center', fontsize=9, fontweight='bold')

    plt.title(f"Performance Comparison: {sprinter_name}", fontsize=14, fontweight='bold')
    plt.ylabel("Time (seconds)")
    plt.legend()
    plt.grid(True, axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()


# =====================
# 3. Run
# =====================
if __name__ == "__main__":
    # Train up front when no saved model is on disk yet.
    model_path = "random_forest_model.pkl"
    if not os.path.exists(model_path):
        train_model()

    # Collect the sprinter's details interactively, in the same order as
    # predict()'s parameters.
    print("üèÉ Enter Sprinter Details Below:")
    sprinter = input("Sprinter Name: ")
    todays_seconds = float(input("Today's Performance (seconds): "))
    weather_choice = input("Weather Type (Sunny/Rainy/Humid/Mixed): ")
    track_choice = input("Track Type (Grass/Synthetic): ")
    race_date = input("Target Race Date (YYYY-MM-DD): ")

    predict(
        sprinter_name=sprinter,
        today_time=todays_seconds,
        weather_type=weather_choice,
        track_type=track_choice,
        target_date_str=race_date,
    )


üèÉ Enter Sprinter Details Below:


Sprinter Name:  a
Today's Performance (seconds):  12


üì• Loading dataset...
üèãÔ∏è Training Random Forest model...
üíæ Model, scaler, and feature columns saved successfully!
‚úÖ All PKL files regenerated and saved in './model_files/' folder.


In [47]:
# NOTE(review): this cell does not define X, y, scaler or shap; they must
# already exist in the session from earlier cells — confirm before running.
import os

# Every artifact below is written into model_files/; create the folder first
# so joblib.dump does not fail when it is missing.
os.makedirs("model_files", exist_ok=True)

# Convert all to numeric and replace NaNs
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
model = RandomForestRegressor(n_estimators=800, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Save model and features
joblib.dump(model, "model_files/random_forest_model.pkl")
joblib.dump(X.columns.tolist(), "model_files/feature_columns.pkl")
joblib.dump(scaler, "model_files/sp_scaler.pkl")

# SHAP Explainer, built on the training rows cast to float and saved
# alongside the model.
X_train_numeric = X_train.astype(float)
explainer = shap.Explainer(model, X_train_numeric)
joblib.dump(explainer, "model_files/shap_explainer.pkl")


['model_files/shap_explainer.pkl']