In [None]:
!pip install pandas numpy scikit-learn matplotlib seaborn lightgbm category_encoders kaggle --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import lightgbm as lgb
import category_encoders as ce

In [None]:
# STEP 1 — Upload dataset

import os

from google.colab import files

# Only open the upload dialog when the dataset is not already in the working directory,
# so "Restart & Run All" does not block on a file picker once train.csv is present.
# The return value is bound to a name: left as the last bare expression of the cell it
# would be echoed into the cell output (presumably the uploaded file contents — see the
# google.colab documentation).
if os.path.exists("train.csv"):
    uploaded = {}
    print("train.csv already present; skipping upload.")
else:
    uploaded = files.upload()

In [None]:
# STEP 2 — Load and inspect data

df = pd.read_csv("train.csv", low_memory=False)
print("Shape:", df.shape)

# Keep relevant metadata columns
cols = [
    "GameId", "PlayId", "Team", "YardLine", "Quarter", "Down", "Distance",
    "DefendersInTheBox", "Temperature", "Humidity", "WindSpeed", "PlayDirection",
    "HomeTeamAbbr", "VisitorTeamAbbr", "StadiumType", "Turf", "Yards"
]

# NOTE(review): the column names look like a per-player tracking export in which every
# play is repeated once per player on the field — TODO confirm against the data dictionary.
# If so, the play-level columns kept here are identical across those rows, and a random
# train/test split later would put copies of the same play (and its target) on both sides.
# When the ball-carrier columns are available, keep only the ball carrier's row per play.
if {"NflId", "NflIdRusher"}.issubset(df.columns):
    df = df[df["NflId"] == df["NflIdRusher"]]

# Rows that are identical across every selected column (ids included) carry no extra
# information, so they are dropped regardless of how the file is laid out.
df = df[cols].drop_duplicates().reset_index(drop=True)

print("Rows after de-duplication:", len(df))
print("Columns:", df.columns.tolist())

# Last expression of the cell, so the preview is actually rendered.
df.head()

In [None]:
# STEP 3 — Data Cleaning

# 1. Clean WindSpeed column
def clean_windspeed(val):
    """Convert one raw wind-speed entry to a float, or NaN when it cannot be parsed.

    Parameters
    ----------
    val : any scalar
        Raw cell value. Missing values (as detected by ``pd.isna``) yield NaN.
        Otherwise the value is stringified, lower-cased and stripped of the
        substring "mph" before parsing.

    Returns
    -------
    float
        * "8 - 10" style ranges: the mean of the endpoints that are plain
          non-negative decimals; endpoints that are not are ignored.
        * A single number ("8", "8.0", "12 MPH"): that number.
        * Anything else, including non-finite values such as "inf": NaN.
    """
    if pd.isna(val):
        return np.nan

    text = str(val).lower().replace("mph", "").strip()

    # Handle ranges like "8 - 10"
    if "-" in text:
        endpoints = []
        for piece in text.split("-"):
            piece = piece.strip()
            # Accept only digits with at most one decimal point.
            if not piece.replace(".", "", 1).isdigit():
                continue
            try:
                endpoints.append(float(piece))
            except ValueError:
                # isdigit() is also true for characters float() rejects (e.g. superscripts).
                return np.nan
        if not endpoints:
            # No usable endpoint: return NaN directly instead of calling np.mean([]),
            # which emits a "Mean of empty slice" RuntimeWarning.
            return np.nan
        return float(np.mean(endpoints))

    # Handle single numeric values
    try:
        speed = float(text)
    except ValueError:
        return np.nan
    # float() also accepts "inf"/"infinity"; those are not usable measurements.
    return speed if np.isfinite(speed) else np.nan

# Parse the free-text wind speed entries into floats (NaN where unparseable).
df["WindSpeed"] = df["WindSpeed"].apply(clean_windspeed)

# 2. Fill missing numeric values with each column's own median
for numeric_col in ("DefendersInTheBox", "Temperature", "Humidity", "WindSpeed"):
    column_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(column_median)

# 3. Fill missing categorical values with an explicit placeholder label
for label_col in ("StadiumType", "Turf"):
    df[label_col] = df[label_col].fillna("Unknown")

# 4. Drop any rows with missing target (Yards)
df = df.dropna(subset=["Yards"])

print("Cleaning done!")
weather_summary = df[["Temperature", "Humidity", "WindSpeed"]].describe()
print(weather_summary)


In [None]:
# STEP 4 — Feature Engineering

# Categorical columns that are ordinal-encoded at the end of this step
cat_cols = ["Team", "PlayDirection", "HomeTeamAbbr", "VisitorTeamAbbr", "StadiumType", "Turf"]

# Derived features
# NOTE(review): this flags plays with YardLine <= 10; it only means "close to the goal line"
# if YardLine is measured from the goal being attacked — TODO confirm.
df["IsGoalToGo"] = (df["YardLine"] <= 10).astype(int)

# A row counts as the home team when Team equals the home abbreviation OR when Team is the
# literal label "home". NOTE(review): if Team is encoded as "home"/"away" (TODO confirm),
# the abbreviation comparison alone is never true and the feature would be constant 0.
is_home_by_abbr = df["Team"] == df["HomeTeamAbbr"]
is_home_by_label = df["Team"].astype(str).str.lower() == "home"
df["IsHomeTeam"] = (is_home_by_abbr | is_home_by_label).astype(int)

# na=False keeps missing labels as 0 instead of propagating NaN into astype(int),
# which would raise if this cell runs on data that still contains missing values.
df["IsOutdoor"] = df["StadiumType"].str.contains("Out", case=False, na=False).astype(int)
df["IsGrass"] = df["Turf"].str.contains("Grass", case=False, na=False).astype(int)

# Simplify weather effects
def temp_bucket(temp):
    """Map a temperature to a coarse label.

    Returns "Cold" when ``temp < 40``, "Mild" when ``40 <= temp < 70`` and
    "Hot" for every other value. A NaN input fails both comparisons and
    therefore also falls through to "Hot".
    """
    if temp < 40:
        return "Cold"
    return "Mild" if temp < 70 else "Hot"
# Label every row's temperature as Cold / Mild / Hot.
df["TempBucket"] = df["Temperature"].map(temp_bucket)

# Drop unneeded columns (identifiers are not used as model inputs)
df = df.drop(columns=["GameId", "PlayId"])

# Encode categorical features as integer codes, including the new bucket column
encoder = ce.OrdinalEncoder(cols=[*cat_cols, "TempBucket"])
df = encoder.fit_transform(df)

In [None]:
# STEP 5 — Exploratory Analysis

# Histogram (with KDE overlay) of the regression target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["Yards"], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Yards Gained")
plt.show()

In [None]:
# STEP 6 — Prepare Data for Modeling

# Separate the target from the feature matrix.
y = df["Yards"]
X = df.drop(columns=["Yards"])

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply them to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# STEP 7 — Train Regression Models

# Linear Regression — fitted on the standardized features
lin_model = LinearRegression()
lin_model.fit(X_train_scaled, y_train)
print("Linear Regression model trained.")

# Random Forest — fitted on the unscaled features.
# n_jobs=-1 builds the trees on all available cores; the fixed random_state is unchanged.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
print("Random Forest model trained.")

# LightGBM — fitted on the unscaled features
lgb_model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    random_state=42,
    verbose=-1  # Suppresses all training logs
)
lgb_model.fit(X_train, y_train)
print("LightGBM model trained.")

In [None]:
# STEP 8 — Model Evaluation

def evaluate(model, X_test, y_test, name):
    """Score a fitted model on held-out data, print a short report and return the predictions.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_test : features passed unchanged to ``model.predict``
    y_test : true target values for the same rows
    name : label printed in the report header

    Returns
    -------
    The array returned by ``model.predict(X_test)``.

    The report contains RMSE, MAE, R² and the percentage of predictions whose
    absolute error is at most 1, 2 and 4 yards.
    """
    preds = model.predict(X_test)

    # Absolute error is computed once and reused for each tolerance band.
    abs_error = np.abs(y_test - preds)
    hit_rate = {tolerance: np.mean(abs_error <= tolerance) for tolerance in (1, 2, 4)}

    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    report_lines = [
        f"\nModel: {name}",
        f"RMSE: {rmse:.3f}",
        f"MAE: {mae:.3f}",
        f"R²: {r2:.3f}",
        f"Within ±1 yd: {hit_rate[1]*100:.1f}% | ±2 yd: {hit_rate[2]*100:.1f}% | ±4 yd: {hit_rate[4]*100:.1f}%",
    ]
    print("\n".join(report_lines))
    return preds

# The linear model was fitted on standardized features, so it is scored on the scaled test set;
# the tree-based models were fitted on the unscaled features and are scored on those.
preds_lin = evaluate(model=lin_model, X_test=X_test_scaled, y_test=y_test, name="Linear Regression")
preds_rf = evaluate(model=rf_model, X_test=X_test, y_test=y_test, name="Random Forest")
preds_lgb = evaluate(model=lgb_model, X_test=X_test, y_test=y_test, name="LightGBM")

In [None]:
# STEP 9 — Visualize Results

# Predicted vs. actual yards for the random forest.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test, preds_rf, alpha=0.3)
ax.set_xlabel("Actual Yards")
ax.set_ylabel("Predicted Yards (RF)")
ax.set_title("Predicted vs Actual Yards — Random Forest")
plt.show()

# Feature Importance for LightGBM
importances = pd.Series(lgb_model.feature_importances_, index=X.columns)
top_features = importances.nlargest(10)
fig, ax = plt.subplots(figsize=(6, 4))
top_features.plot.barh(ax=ax)
ax.set_title("Top 10 Important Features (LightGBM)")
plt.show()