In [2]:
### ✅ 🏀 Step 1: Import Libraries and Handle Dependencies
import pandas as pd
import numpy as np
import glob
import random
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, top_k_accuracy_score
import lightgbm as lgb
import joblib

In [1]:
# NBA Fifth Player Prediction - End-to-End Pipeline

### Step 1: Import Libraries
import pandas as pd
import numpy as np
import glob
import random
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, top_k_accuracy_score
import lightgbm as lgb
import joblib

In [3]:
### 📁 Step 2: Load Datasets Explicitly
train_2007 = pd.read_csv("matchups-2007.csv")
train_2008 = pd.read_csv("matchups-2008.csv")
train_2009 = pd.read_csv("matchups-2009.csv")
train_2010 = pd.read_csv("matchups-2010.csv")
train_2011 = pd.read_csv("matchups-2011.csv")
train_2012 = pd.read_csv("matchups-2012.csv")
train_2013 = pd.read_csv("matchups-2013.csv")
train_2014 = pd.read_csv("matchups-2014.csv")
train_2015 = pd.read_csv("matchups-2015.csv")
test_data = pd.read_csv("NBA_test.csv")
test_labels = pd.read_csv("NBA_test_labels.csv")

In [4]:
# Combine training data
all_dfs = [
    train_2007, train_2008, train_2009, train_2010, train_2011,
    train_2012, train_2013, train_2014, train_2015
]
full_train_df = pd.concat(all_dfs, ignore_index=True)

In [5]:
### ✅ Step 3: Filter Winning Games Only
winning_df = full_train_df[full_train_df["outcome"] == 1]
home_players = winning_df[["home_0", "home_1", "home_2", "home_3", "home_4"]]
lineups = home_players.dropna().values.tolist()
lineups = [list(map(str, lineup)) for lineup in lineups]

In [6]:

### 🧠🔬 Step 4: Train Word2Vec Player Embeddings (Enhanced)

print("Training player embeddings with improved parameters...")
w2v_model = Word2Vec(sentences=lineups, vector_size=100, window=5, min_count=3, sg=1, workers=4, seed=42)


Training player embeddings with improved parameters...


In [7]:
### 🎯 Step 5: Generate Training Data with Full Context (Home + Away + Metadata)
print("Generating enhanced training dataset...")

X_vectors = []
y_labels = []
for i, lineup in enumerate(lineups):
    if all(player in w2v_model.wv for player in lineup):
        row = winning_df.iloc[i]  # Fetch row early for context
        dropped_idx = random.randint(0, 4)
        target_player = lineup[dropped_idx]
        remaining_players = lineup[:dropped_idx] + lineup[dropped_idx+1:]

        home_vector = np.mean([w2v_model.wv[player] for player in remaining_players], axis=0)
        away_players = [row['away_0'], row['away_1'], row['away_2'], row['away_3'], row['away_4']]
        away_vector = np.mean([w2v_model.wv[p] for p in away_players if p in w2v_model.wv], axis=0) if any(p in w2v_model.wv for p in away_players) else np.zeros(w2v_model.vector_size)

        team_vector = hash(row["home_team"]) % 10_000
        opp_team_vector = hash(row["away_team"]) % 10_000
        season_vector = int(row["season"]) % 2000
        context_features = np.array([team_vector, opp_team_vector, season_vector]) / 100.0

        avg_vector = np.concatenate([home_vector, away_vector, context_features])
        X_vectors.append(avg_vector)
        y_labels.append(target_player)

X_vectors = np.array(X_vectors)
y_labels = np.array(y_labels)


Generating enhanced training dataset...


In [8]:
### 🎓 Step 6: Encode Labels and Split Data (Handling Rare Classes)
label_encoder = LabelEncoder()
# Identify and remove rare classes (players who appear only once)
from collections import Counter

player_counts = Counter(y_labels)
filtered_X = []
filtered_y = []

for i, label in enumerate(y_labels):
    if player_counts[label] > 1:  # Keep only players appearing at least twice
        filtered_X.append(X_vectors[i])
        filtered_y.append(label)

# Adjust filtered_X to include context features
dim = len(filtered_X[0])
filtered_X = np.array([x if len(x) == dim else np.concatenate([x, np.zeros(dim - len(x))]) for x in filtered_X])
filtered_y = np.array(filtered_y)

y_encoded = label_encoder.fit_transform(filtered_y)
X_train, X_val, y_train, y_val = train_test_split(
    filtered_X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [9]:
### 🚀 Step 7: Train LightGBM Classifier
print("Training LightGBM model...")
lgb_model = lgb.LGBMClassifier(objective="multiclass", num_class=len(np.unique(y_train)), random_state=42, class_weight="balanced")
lgb_model.fit(X_train, y_train)

Training LightGBM model...




[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [10]:
### 📊 Step 8: Evaluate Model Performance
y_pred = lgb_model.predict(X_val)
y_pred_proba = lgb_model.predict_proba(X_val)
top1_acc = accuracy_score(y_val, y_pred)
top3_acc = top_k_accuracy_score(y_val, y_pred_proba, k=3, labels=np.arange(len(label_encoder.classes_)))
print("Top-1 Accuracy:", round(top1_acc * 100, 2), "%")
print("Top-3 Accuracy:", round(top3_acc * 100, 2), "%")



Top-1 Accuracy: 1.94 %
Top-3 Accuracy: 3.08 %


In [11]:
### 🔍 Step 9: Evaluate on Test Labels (if available)
print("Evaluating on labeled test data...")
if 'Game_ID' in test_labels.columns and 'Fifth_Player' in test_labels.columns:
    submission = pd.read_excel("submission.xlsx")
    merged = pd.merge(submission, test_labels, on=["Game_ID", "Home_Team"], suffixes=("_pred", "_true"))

    correct_top1 = merged["Fifth_Player_pred"] == merged["Fifth_Player_true"]
    test_top1_accuracy = correct_top1.sum() / len(correct_top1)
    print("🎯 Test Set Top-1 Accuracy (exact match):", round(test_top1_accuracy * 100, 2), "%")

    y_true_test = label_encoder.transform(merged["Fifth_Player_true"])
    test_pred_probs = lgb_model.predict_proba(X_val)[:, :len(label_encoder.classes_)]

    test_top3_accuracy = top_k_accuracy_score(y_true_test, test_pred_probs, k=3, labels=np.arange(len(label_encoder.classes_)))
    print("🥉 Test Set Top-3 Accuracy (true player in top 3 guesses):", round(test_top3_accuracy * 100, 2), "%")


Evaluating on labeled test data...


In [12]:
### 💾 Step 10: Save Models
joblib.dump(w2v_model, "player_embeddings.model")
joblib.dump(lgb_model, "lgb_player_classifier.model")
joblib.dump(label_encoder, "player_label_encoder.pkl")


['player_label_encoder.pkl']