In [1]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# NBA Game Prediction Model

This notebook implements a Random Forest model to predict NBA game outcomes based on team statistics.

In [2]:
# Read the processed train / test / validation splits.
# The directory defaults to the original author's local path but can be
# overridden by setting the NBA_DATA_DIR environment variable, so the notebook
# can run on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    'NBA_DATA_DIR',
    'C:/Users/poke5/Desktop/Projects/NBA_Prediction/data/processed',
))

train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')
validation_data = pd.read_csv(DATA_DIR / 'validation.csv')

In [3]:
def make_matchup_df(df):
    """Build a per-game feature matrix and label vector from team-game rows.

    Rows are grouped by ``GAME_ID``; only groups with exactly two rows are
    used. Within each such group the first row is treated as "team 1" and the
    second as "team 2".

    Each feature vector holds, in order:
      * ``team1 - team2`` for every column whose name starts with ``r10_``
        (in the column order of ``df``), and
      * a final 1/0 flag that is 1 when team 1's ``MATCHUP`` string contains
        ``'vs.'`` (presumably marking a home game -- confirm against the data
        source).

    The label is 1 when team 1's ``WL`` value is ``'W'``, otherwise 0.

    Args:
        df: DataFrame with ``GAME_ID``, ``MATCHUP``, ``WL`` and ``r10_*``
            columns.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays. ``features`` has shape
        ``(n_games, n_r10_columns + 1)`` and ``labels`` has shape
        ``(n_games,)``. When no group has exactly two rows, the arrays are
        empty but keep those shapes so ``features.shape[1]`` is still valid.
    """
    # The rolling-feature columns do not depend on the game, so find them once.
    feature_cols = [col for col in df.columns if col.startswith('r10_')]

    features = []
    labels = []

    for _, game in df.groupby('GAME_ID'):
        # Skip games that do not have exactly one row per side.
        if len(game) != 2:
            continue

        team1 = game.iloc[0]
        team2 = game.iloc[1]

        # Team 1 minus team 2 for every rolling feature.
        feature_vector = [float(team1[col]) - float(team2[col]) for col in feature_cols]

        # Home-advantage flag for team 1.
        feature_vector.append(1 if 'vs.' in team1['MATCHUP'] else 0)

        features.append(feature_vector)
        labels.append(1 if team1['WL'] == 'W' else 0)

    if not features:
        # np.array([]) would be 1-D; return a correctly shaped empty matrix.
        return np.empty((0, len(feature_cols) + 1)), np.empty((0,), dtype=int)

    return np.array(features), np.array(labels)

In [4]:
# Turn the training games into a feature matrix X and a label vector y
X, y = make_matchup_df(train_data)

# Count how many games fall into each label value
class_values, class_counts = np.unique(y, return_counts=True)

# Report the size of the dataset and how the labels are distributed
print(f"Dataset shape: {X.shape}")
print(f"Number of games: {len(X)}")
print(f"Number of features: {X.shape[1]}")
print("Class balance (wins/losses):", dict(zip(class_values, class_counts)))

Dataset shape: (11650, 8)
Number of games: 11650
Number of features: 8
Class balance (wins/losses): {np.int64(0): np.int64(5821), np.int64(1): np.int64(5829)}


In [5]:
# Hold out 20% of the games for validation, stratified on the label
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a 100-tree random forest; the fixed seed keeps results repeatable
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_tr, y_tr)

# Score the model on the held-out split
y_pred = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)

# Report split sizes and validation accuracy
print('\nModel Evaluation:')
print(f'Training set size: {len(X_tr)} samples')
print(f'Validation set size: {len(X_val)} samples')
print(f'Accuracy on validation set: {val_accuracy:.3f}')


Model Evaluation:
Training set size: 9320 samples
Validation set size: 2330 samples
Accuracy on validation set: 0.604


In [6]:
def predict_matchup(team1_abbr, team2_abbr, is_team1_home=True):
    """Predict the winner of a matchup from each team's last row in ``train_data``.

    The feature vector mirrors the layout built by ``make_matchup_df``: the
    team1-minus-team2 difference of every ``r10_`` column, followed by a 1/0
    flag for whether team 1 is at home.

    Relies on the notebook-level ``train_data`` frame and the fitted ``clf``.
    "Most recent" here means the last matching row of ``train_data``; this
    assumes the frame is ordered chronologically -- TODO confirm.

    Args:
        team1_abbr: Value matched against ``TEAM_ABBREVIATION`` for team 1.
        team2_abbr: Value matched against ``TEAM_ABBREVIATION`` for team 2.
        is_team1_home: Whether team 1 is the home side (default ``True``).

    Returns:
        dict with two keys:
          ``'winner'``: ``team1_abbr`` if the classifier predicts 1, otherwise
            ``team2_abbr``.
          ``'probability'``: the second column of ``clf.predict_proba`` for
            this matchup. NOTE(review): with 0/1 training labels this is the
            team-1-wins probability, not the probability of whichever team is
            returned as ``'winner'`` -- confirm callers expect that.

    Raises:
        IndexError: if either abbreviation has no rows in ``train_data``.
    """
    def _latest_row(team_abbr):
        # Last row for the team; fail with a message that names the team
        # instead of pandas' generic out-of-bounds error.
        rows = train_data[train_data['TEAM_ABBREVIATION'] == team_abbr]
        if rows.empty:
            raise IndexError(
                f"No rows found in train_data for team {team_abbr!r}"
            )
        return rows.iloc[-1]

    team1_data = _latest_row(team1_abbr)
    team2_data = _latest_row(team2_abbr)

    # Same rolling-stat columns, in the same order, as used for training.
    features_to_use = [col for col in train_data.columns if col.startswith('r10_')]

    # Differences between the two teams, then the home-court flag.
    feature_vector = [
        float(team1_data[col]) - float(team2_data[col]) for col in features_to_use
    ]
    feature_vector.append(1 if is_team1_home else 0)

    # The classifier expects a 2-D array: one row per matchup.
    X_pred = np.array([feature_vector])
    prob = clf.predict_proba(X_pred)[0][1]
    pred = clf.predict(X_pred)[0]
    winner = team1_abbr if pred == 1 else team2_abbr

    return {
        'winner': winner,
        'probability': prob
    }

In [7]:
# Collect each team's latest rolling stats (the last row for that team in train_data)
features_to_use = [name for name in train_data.columns if name.startswith('r10_')]

# One entry per team, in order of first appearance in train_data
latest_by_team = {
    abbr: train_data[train_data['TEAM_ABBREVIATION'] == abbr].iloc[-1][features_to_use]
    for abbr in train_data['TEAM_ABBREVIATION'].unique()
}

# Transpose so that teams are rows and features are columns
team_avgs = pd.DataFrame(latest_by_team).T
print(team_avgs.head())

    r10_MissedFG r10_MissedFT  r10_TSA r10_TS_Pct r10_FG_Eff r10_RebRatio  \
POR         48.3          4.8  101.672  58.587483   0.472266     0.224538   
HOU         50.1          4.4  101.376  58.583486   0.440646     0.191088   
BOS         46.3          5.2   98.884  58.723955   0.465237     0.210166   
MIA         48.1          4.8   97.572  56.989861   0.444789     0.191539   
UTA         44.8          5.4   94.756  58.827994   0.467843     0.186104   

       r10_TS  
POR  0.585875  
HOU  0.585835  
BOS   0.58724  
MIA  0.569899  
UTA   0.58828  


In [8]:
# Persist the fitted classifier and the per-team feature table so that
# PredictionSimulator can load them
MODEL_PATH = 'rf_model.pkl'
TEAM_AVGS_PATH = 'team_averages.csv'

joblib.dump(clf, MODEL_PATH)
team_avgs.to_csv(TEAM_AVGS_PATH)

print("Model and team averages exported for use in PredictionSimulator")

Model and team averages exported for use in PredictionSimulator
