In [57]:
!pip install pandas numpy scikit-learn xgboost gradio --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gradio as gr
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [58]:
# Colab-only step: prompt for a file upload and keep the result in `uploaded`.
# NOTE(review): the recorded output of this cell shows the archive being saved as
# "train (2).zip" (the name "train.zip" was already taken), while the loading cell
# reads "train.zip" — confirm the file being read is the one intended.
from google.colab import files
uploaded = files.upload()

Saving train.zip to train (2).zip


In [64]:
# Load dataset
# Read the zipped CSV in a single pass (low_memory=False) and discard rows
# that have no target value, since they cannot be used for training.
df = (
    pd.read_csv("train.zip", compression="zip", low_memory=False)
    .dropna(subset=["Yards"])
)

# Convert height to inches
def convert_height(h):
    """Convert a "feet-inches" string such as ``"6-2"`` into total inches.

    Parameters
    ----------
    h : str
        Height written as ``"<feet>-<inches>"``.

    Returns
    -------
    int or float
        ``feet * 12 + inches`` when ``h`` parses, otherwise ``np.nan``
        (non-string input, wrong number of parts, or non-integer parts).
    """
    try:
        feet, inches = map(int, h.split('-'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: h has no .split (None, NaN, numbers).
        # TypeError: .split exists but rejects a str separator (e.g. bytes).
        # ValueError: wrong number of parts or a part that is not an integer.
        # Anything else (e.g. KeyboardInterrupt) is deliberately NOT swallowed.
        return np.nan
    return feet * 12 + inches

# Height: "feet-inches" text -> total inches (NaN when it cannot be parsed).
df["PlayerHeight"] = df["PlayerHeight"].apply(convert_height)
# Age: season year minus birth year; unparseable birth dates become NaT -> NaN age.
df["PlayerBirthDate"] = pd.to_datetime(df["PlayerBirthDate"], errors='coerce')
df["Age"] = df["Season"] - df["PlayerBirthDate"].dt.year

# WindSpeed is fed to the models as a number (the prediction UI collects it with a
# numeric input). If the raw column contains free text it is typed `object`, and the
# later one-hot step — which encodes every object column — would turn it into dummy
# columns, silently ignoring the wind speed the user enters.
# NOTE(review): assumes values look like "12", "12 mph" or "10-15"; the first number
# is kept and anything without a number becomes NaN. Confirm against the raw data.
wind_number = df["WindSpeed"].astype(str).str.extract(r"(\d+(?:\.\d+)?)", expand=False)
df["WindSpeed"] = pd.to_numeric(wind_number, errors="coerce")

# Select relevant columns
features = [
    "PlayerHeight", "PlayerWeight", "Age", "Distance", "DefendersInTheBox",
    "StadiumType", "Turf", "PlayDirection", "OffenseFormation",
    "Temperature", "Humidity", "WindSpeed"
]

# Only drop rows that are missing a value which is never filled in afterwards.
# A blanket .dropna() here would also discard every row with a missing
# DefendersInTheBox / Temperature / Humidity / StadiumType / Turf, which makes the
# median / "Unknown" imputation in the cleaning step a no-op. Missing PlayDirection
# and OffenseFormation are mapped to "Unknown" / "Other" by their cleaning functions.
required = ["PlayerHeight", "PlayerWeight", "Age", "Distance", "WindSpeed", "Yards"]
df = df[features + ["Yards"]].dropna(subset=required)

print("Columns:", df.columns.tolist())

Columns: ['PlayerHeight', 'PlayerWeight', 'Age', 'Distance', 'DefendersInTheBox', 'StadiumType', 'Turf', 'PlayDirection', 'OffenseFormation', 'Temperature', 'Humidity', 'WindSpeed', 'Yards']


In [65]:
# Fill missing numeric values
# Each numeric gap is replaced by that column's own median.
for numeric_col in ("DefendersInTheBox", "Temperature", "Humidity"):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

# Clean categorical features
# Missing venue descriptors get an explicit "Unknown" label before normalisation.
for text_col in ("StadiumType", "Turf"):
    df[text_col] = df[text_col].fillna("Unknown")

def clean_stadium_type(val):
    """Collapse a free-text stadium description into one of five labels.

    The value is stringified, trimmed and lower-cased, then tested against the
    keyword groups below in order; the first group with a substring hit wins.
    Returns "Outdoor", "Indoor", "Retractable", "Bowl" or "Unknown".
    """
    text = str(val).strip().lower()
    # Order matters: an entry mentioning both "outdoor" and "retractable" is "Outdoor".
    keyword_groups = (
        ("Outdoor", ("outdoor",)),
        ("Indoor", ("indoor", "dome")),
        ("Retractable", ("retractable",)),
        ("Bowl", ("bowl",)),
    )
    for label, keywords in keyword_groups:
        if any(keyword in text for keyword in keywords):
            return label
    return "Unknown"

def clean_turf(val):
    """Collapse a free-text playing-surface description into one of five labels.

    The value is stringified, trimmed and lower-cased. Checks run in order and
    the first hit wins:

    1. contains "grass" or "natural"      -> "Natural Grass"
    2. contains "artificial"/"synthetic"  -> "Artificial"
    3. "fieldturf" (spaces/hyphens ignored) -> "Field Turf"
    4. "twentyfour/seven" (spaces/hyphens ignored) -> "Twenty Four/Seven Turf"
    5. anything else                       -> "Unknown"
    """
    text = str(val).strip().lower()
    # Brand names are written with inconsistent separators ("Field Turf" vs
    # "FieldTurf", "Twenty-Four/Seven" vs "Twenty Four/Seven"); compare them with
    # spaces and hyphens removed so every spelling lands in the same bucket.
    compact = text.replace(" ", "").replace("-", "")

    if "grass" in text or "natural" in text:
        return "Natural Grass"
    if "artificial" in text or "synthetic" in text:
        return "Artificial"
    if "fieldturf" in compact:
        return "Field Turf"
    if "twentyfour/seven" in compact:
        return "Twenty Four/Seven Turf"
    return "Unknown"

def clean_play_direction(val):
    """Normalise a play direction to "Left" or "Right"; anything else is "Unknown".

    Matching is exact (not substring) after stringifying, trimming and lower-casing.
    """
    canonical = {"left": "Left", "right": "Right"}
    return canonical.get(str(val).strip().lower(), "Unknown")

def clean_formation(val):
    """Collapse an offensive-formation label into a fixed set of display names.

    The value is stringified, trimmed and lower-cased, and runs of spaces,
    underscores and hyphens are unified to a single hyphen so that "I_FORM",
    "I Form" and "I-Form" are all recognised. Keywords are then tested in order
    (substring match); the first hit wins. Unrecognised values return "Other".
    """
    text = str(val).strip().lower()
    # Unify separators: without this only the hyphenated spelling of "i-form"
    # matches and underscore/space variants fall through to "Other".
    text = "-".join(text.replace("_", " ").replace("-", " ").split())

    keyword_to_label = (
        ("shotgun", "Shotgun"),
        ("i-form", "I-Form"),
        ("singleback", "Singleback"),
        ("ace", "Ace"),
        ("pistol", "Pistol"),
        ("wildcat", "Wildcat"),
        ("jumbo", "Jumbo"),
    )
    for keyword, label in keyword_to_label:
        if keyword in text:
            return label
    return "Other"

# Run each categorical column through its normaliser so the raw free-text
# values collapse into the small label sets defined above.
column_cleaners = {
    "StadiumType": clean_stadium_type,
    "Turf": clean_turf,
    "PlayDirection": clean_play_direction,
    "OffenseFormation": clean_formation,
}
for column_name, cleaner in column_cleaners.items():
    df[column_name] = df[column_name].apply(cleaner)

print("Cleaning done!")

Cleaning done!


In [66]:
# One-hot encode categorical features
# Every object-typed column is treated as categorical and expanded into
# "<column>_<label>" indicator columns.
categorical = list(df.select_dtypes(include="object").columns)
df = pd.get_dummies(df, columns=categorical)

# Split the encoded frame into the target and the predictor matrix.
y = df["Yards"]
X = df.drop(columns=["Yards"])

In [67]:
# Train/test split
# Rebuild predictors/target from the encoded frame and hold out 20% of the rows
# (fixed seed) for evaluation.
y = df["Yards"]
X = df.drop(columns=["Yards"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train models
# Each estimator is constructed first and fitted on the training split afterwards;
# the names lr / rf / xgb are looked up later by the prediction function.
lr = LinearRegression()
lr.fit(X_train, y_train)
print("Linear Regression model trained.")

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
print("Random Forest model trained.")

xgb = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
xgb.fit(X_train, y_train)
print("XGBoost model trained.")

Linear Regression model trained.
Random Forest model trained.
XGBoost model trained.


In [68]:
# Extract dropdown options from one-hot encoded columns
def _encoded_levels(prefix):
    """Return the sorted labels of the indicator columns in X that start with ``prefix``.

    The label is whatever follows the first underscore of the column name.
    """
    return sorted(
        column.split('_', 1)[1]
        for column in X.columns
        if column.startswith(prefix)
    )


stadium_types = _encoded_levels("StadiumType_")
turfs = _encoded_levels("Turf_")
play_directions = _encoded_levels("PlayDirection_")
formations = _encoded_levels("OffenseFormation_")
wind_dirs = _encoded_levels("WindDirSimple_")

# Gradio prediction function
def predict_yards(model_name, PlayerHeight, PlayerWeight, Age, Distance, DefendersInTheBox,
                  StadiumType, Turf, PlayDirection, FormationType,
                  Temperature, Humidity, WindSpeed, WindDirSimple=None):
    """Predict yards for one play and report the chosen model's test-set performance.

    Parameters
    ----------
    model_name : str
        One of "Linear Regression", "Random Forest", "XGBoost".
    PlayerHeight : str
        Height as "feet-inches" (e.g. "6-2"); parsed with ``convert_height``.
    PlayerWeight, Age, Distance, DefendersInTheBox, Temperature, Humidity, WindSpeed
        Numeric feature values. ``None`` (an empty UI field) is rejected.
    StadiumType, Turf, PlayDirection, FormationType : str
        Category labels; each sets the matching "<feature>_<label>" indicator
        column to 1 when such a column exists in ``X``.
    WindDirSimple : optional
        Not used when building the model input. It defaults to ``None`` so the
        function can be called with the 13 values the interface supplies;
        without the default every call fails with a missing-argument TypeError.

    Returns
    -------
    tuple
        ``(prediction, metrics_text, scatter_figure, bar_figure)`` on success.
        On failure ``(None, "Prediction failed: ...", None, None)`` — ``None``
        rather than a string for the first item because it feeds a numeric output.
    """
    # Figures built from matplotlib.figure.Figure are not registered with pyplot's
    # global figure manager, so they are garbage-collected after each request
    # instead of accumulating the way plt.subplots() figures do when never closed.
    from matplotlib.figure import Figure

    def _failure(reason):
        return None, f"Prediction failed: {reason}", None, None

    height_in = convert_height(PlayerHeight)
    if pd.isna(height_in):
        return _failure("enter the player height as feet-inches, e.g. 6-2.")

    numeric_inputs = {
        "PlayerHeight": height_in,
        "PlayerWeight": PlayerWeight,
        "Age": Age,
        "Distance": Distance,
        "DefendersInTheBox": DefendersInTheBox,
        "Temperature": Temperature,
        "Humidity": Humidity,
        "WindSpeed": WindSpeed
    }
    missing = [name for name, value in numeric_inputs.items() if value is None]
    if missing:
        return _failure(f"missing value for {', '.join(missing)}.")

    selected_cats = {
        "StadiumType": StadiumType,
        "Turf": Turf,
        "PlayDirection": PlayDirection,
        "OffenseFormation": FormationType
    }

    try:
        model = {"Linear Regression": lr, "Random Forest": rf, "XGBoost": xgb}.get(model_name)
        if model is None:
            return _failure("choose a model first.")

        # One row: numeric values plus a 1 for each selected category indicator.
        # reindex() puts the columns in training order, fills every indicator that
        # was not selected with 0, and drops keys that are not training columns.
        row = dict(numeric_inputs)
        for cat, val in selected_cats.items():
            row[f"{cat}_{val}"] = 1
        input_df = pd.DataFrame([row]).reindex(columns=X.columns, fill_value=0)

        # Cast to a plain float: the models return numpy scalars.
        pred = float(model.predict(input_df)[0])

        # NOTE: the hold-out evaluation below is recomputed on every call.
        preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)
        abs_err = np.abs(y_test - preds)
        within1 = np.mean(abs_err <= 1)
        within2 = np.mean(abs_err <= 2)
        within4 = np.mean(abs_err <= 4)

        metrics = (
            f"RMSE: {rmse:.2f}\n"
            f"MAE: {mae:.2f}\n"
            f"R²: {r2:.2f}\n\n"
            f"±1 yd: {within1*100:.1f}%\n"
            f"±2 yd: {within2*100:.1f}%\n"
            f"±4 yd: {within4*100:.1f}%"
        )

        fig1 = Figure(figsize=(5, 4))
        ax1 = fig1.subplots()
        ax1.scatter(y_test, preds, alpha=0.3)
        ax1.set_xlabel("Actual Yards")
        ax1.set_ylabel("Predicted Yards")
        ax1.set_title(f"{model_name} — Predicted vs Actual")
        fig1.tight_layout()

        fig2 = Figure(figsize=(5, 4))
        ax2 = fig2.subplots()
        bars = [within1 * 100, within2 * 100, within4 * 100]
        labels = ["±1 yd", "±2 yd", "±4 yd"]
        ax2.bar(labels, bars, color="skyblue")
        ax2.set_ylim(0, 100)
        ax2.set_ylabel("Accuracy (%)")
        ax2.set_title(f"{model_name} — Domain Metrics")
        fig2.tight_layout()

        return round(pred, 2), metrics, fig1, fig2

    except Exception as e:
        # Best-effort UI handler: surface the problem as text instead of crashing the app.
        return _failure(e)

# Launch Gradio interface
# Components are listed in the same order as predict_yards' positional parameters.
input_components = [
    gr.Dropdown(["Linear Regression", "Random Forest", "XGBoost"], label="Choose Model"),
    gr.Textbox(label="Player Height (e.g., 6-2)"),
    gr.Number(label="Player Weight (lbs)"),
    gr.Number(label="Player Age"),
    gr.Number(label="Distance to First Down"),
    gr.Number(label="Defenders in the Box"),
    gr.Dropdown(stadium_types, label="Stadium Type"),
    gr.Dropdown(turfs, label="Turf Type"),
    gr.Dropdown(play_directions, label="Play Direction"),
    gr.Dropdown(formations, label="Offense Formation"),
    gr.Number(label="Temperature (°F)"),
    gr.Number(label="Humidity (%)"),
    gr.Number(label="Wind Speed (mph)"),
]

# One component per element of the tuple returned by predict_yards.
output_components = [
    gr.Number(label="Predicted Yards Gained"),
    gr.Textbox(label="Model Evaluation Metrics", lines=6, max_lines=10),
    gr.Plot(label="Predicted vs Actual Yards"),
    gr.Plot(label="Domain Accuracy Metrics"),
]

demo = gr.Interface(
    fn=predict_yards,
    inputs=input_components,
    outputs=output_components,
    title="🏈 NFL Rushing Yard Predictor",
    description="Predict rushing yards using player, formation, and weather metadata. Visualize model performance with domain-specific metrics.",
)
demo.launch()



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://98213282f082909fcb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


