In [6]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib


In [7]:
# Load datasets
train_2007 = pd.read_csv("matchups-2007.csv")
train_2008 = pd.read_csv("matchups-2008.csv")
train_2009 = pd.read_csv("matchups-2009.csv")
train_2010 = pd.read_csv("matchups-2010.csv")
train_2011 = pd.read_csv("matchups-2011.csv")
train_2012 = pd.read_csv("matchups-2012.csv")
train_2013 = pd.read_csv("matchups-2013.csv")
train_2014 = pd.read_csv("matchups-2014.csv")
train_2015 = pd.read_csv("matchups-2015.csv")

all_seasons = {
    2007: train_2007, 2008: train_2008, 2009: train_2009,
    2010: train_2010, 2011: train_2011, 2012: train_2012,
    2013: train_2013, 2014: train_2014, 2015: train_2015
}

In [8]:
# Encode players across all seasons
player_encoder = LabelEncoder()
all_players = set()
for df in all_seasons.values():
    df = df[df["outcome"] == 1]
    for col in [f"home_{i}" for i in range(5)] + [f"away_{i}" for i in range(5)]:
        all_players.update(df[col].dropna().unique())
player_encoder.fit(list(all_players))

In [9]:
# Create model directory
model_dir = "season_models"
os.makedirs(model_dir, exist_ok=True)

In [10]:
# Train model per season
for season, df in all_seasons.items():
    df = df[df["outcome"] == 1].reset_index(drop=True)
    rows = []

    for _, row in df.iterrows():
        masked_idx = random.choice(range(5))
        features = [row[f"home_{i}"] for i in range(5) if i != masked_idx]
        features += [row[f"away_{i}"] for i in range(5)]

        rows.append({
            "features": features,
            "label": row[f"home_{masked_idx}"],
            "home_team": row["home_team"],
            "away_team": row["away_team"],
            "starting_min": row["starting_min"]
        })

    feature_matrix = []
    labels = []
    for row in rows:
        try:
            encoded_feats = player_encoder.transform(row["features"])
        except:
            continue
        encoded_feats = list(encoded_feats)
        encoded_feats.append(row["starting_min"])
        encoded_feats.append(hash(row["home_team"]) % 1000)
        encoded_feats.append(hash(row["away_team"]) % 1000)
        feature_matrix.append(encoded_feats)
        labels.append(row["label"])

    if len(feature_matrix) == 0:
        continue

    X = np.array(feature_matrix)
    y = player_encoder.transform(labels)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    joblib.dump(model, os.path.join(model_dir, f"rf_{season}.pkl"))

print("✅ All season models trained and saved.")


✅ All season models trained and saved.


## part 2

In [11]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from scipy.stats import mode

In [19]:
# Load datasets explicitly
train_2007 = pd.read_csv("matchups-2007.csv")
train_2008 = pd.read_csv("matchups-2008.csv")
train_2009 = pd.read_csv("matchups-2009.csv")
train_2010 = pd.read_csv("matchups-2010.csv")
train_2011 = pd.read_csv("matchups-2011.csv")
train_2012 = pd.read_csv("matchups-2012.csv")
train_2013 = pd.read_csv("matchups-2013.csv")
train_2014 = pd.read_csv("matchups-2014.csv")
train_2015 = pd.read_csv("matchups-2015.csv")
test_df = pd.read_csv("NBA_test.csv")
true_labels = pd.read_csv("NBA_test_labels.csv")

In [20]:
# Combine all train data into one list
all_train_dfs = [
    train_2007, train_2008, train_2009, train_2010, train_2011,
    train_2012, train_2013, train_2014, train_2015
]

In [21]:
# Load trained models
model_dir = "season_models"
if not os.path.exists(model_dir):
    raise FileNotFoundError("Trained model directory 'season_models' not found. Please run the training script first.")
model_files = sorted([f for f in os.listdir(model_dir) if f.endswith(".pkl")])
models = [joblib.load(os.path.join(model_dir, mf)) for mf in model_files]

In [22]:

# Load player encoder from training data
all_players = set()
for df in all_train_dfs:
    for col in [f"home_{i}" for i in range(5)] + [f"away_{i}" for i in range(5)]:
        all_players.update(df[col].dropna().unique())
player_encoder = LabelEncoder()
player_encoder.fit(list(all_players))


In [23]:
# Predict missing player per row
predictions = []
for idx, row in test_df.iterrows():
    # Identify which home player is missing
    home_players = [row[f"home_{i}"] for i in range(5)]
    masked_idx = home_players.index("?")
    features = [p for p in home_players if p != "?"] + [row[f"away_{i}"] for i in range(5)]

    # Handle possible unseen players
    try:
        encoded_feats = player_encoder.transform(features)
    except ValueError as e:
        print(f"Skipping row {idx} due to unseen player in features: {e}")
        continue

    encoded_feats = list(encoded_feats)
    encoded_feats.append(row["starting_min"])
    encoded_feats.append(hash(row["home_team"]) % 1000)
    encoded_feats.append(hash(row["away_team"]) % 1000)

    X = np.array(encoded_feats).reshape(1, -1)

    # Predict using all models
    model_preds = [model.predict(X)[0] for model in models]

    # Majority vote (fix mode structure to avoid IndexError)
    final_pred = mode(model_preds, keepdims=False).mode
    player_name = player_encoder.inverse_transform([final_pred])[0]

    predictions.append({
        "Game_ID": idx,
        "Home_Team": row["home_team"],
        "Fifth_Player": player_name
    })

Skipping row 901 due to unseen player in features: y contains previously unseen labels: np.str_('Christian Wood')
Skipping row 902 due to unseen player in features: y contains previously unseen labels: np.str_('Xavier Munford')
Skipping row 903 due to unseen player in features: y contains previously unseen labels: np.str_('Stanley Johnson')
Skipping row 904 due to unseen player in features: y contains previously unseen labels: np.str_('Karl-Anthony Towns')
Skipping row 905 due to unseen player in features: y contains previously unseen labels: np.str_('Karl-Anthony Towns')
Skipping row 906 due to unseen player in features: y contains previously unseen labels: np.str_('Mario Hezonja')
Skipping row 907 due to unseen player in features: y contains previously unseen labels: np.str_('Chris McCullough')
Skipping row 908 due to unseen player in features: y contains previously unseen labels: np.str_('Nemanja Bjelica')
Skipping row 909 due to unseen player in features: y contains previously unse

In [25]:
!pip install openpyxl


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [26]:
# Convert to DataFrame and save to Excel
pred_df = pd.DataFrame(predictions)
pred_df.to_excel("NBA_predictions.xlsx", index=False)

In [27]:
# Evaluate accuracy
true_players = true_labels["removed_value"].values[:len(pred_df)]
predicted_players = pred_df["Fifth_Player"].values

correct = sum([1 for t, p in zip(true_players, predicted_players) if t == p])
accuracy = correct / len(true_players)

print(f"Prediction Accuracy: {accuracy:.4f}")

Prediction Accuracy: 0.0493
