### Import necessary libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nba_api.stats.endpoints import leaguegamefinder

### Get game logs from NBA teams only using nba_api

In [3]:
# Download the game log from the stats API (this is a network call, so the
# rows returned depend on when the cell is run).
games = leaguegamefinder.LeagueGameFinder().get_data_frames()[0]

# Abbreviations of the 30 franchises whose rows are kept.
nba_teams = ['ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# .copy() makes the filtered frame independent of the raw download, so the
# column assignments performed in later cells operate on a real frame rather
# than on a slice of the original.
games = games[games['TEAM_ABBREVIATION'].isin(nba_teams)].copy()


In [4]:
# Preview the first rows of the filtered game log.
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
8,22024,1610612743,DEN,Denver Nuggets,22400763,2025-02-10,DEN vs. POR,W,239,146,...,0.9,6,32,38,44,12,2,8,23,29.0
9,22024,1610612744,GSW,Golden State Warriors,22400760,2025-02-10,GSW @ MIL,W,238,125,...,0.789,15,33,48,27,16,2,9,23,14.0
10,22024,1610612738,BOS,Boston Celtics,22400759,2025-02-10,BOS @ MIA,W,239,103,...,1.0,3,49,52,24,4,5,6,11,18.0
11,22024,1610612753,ORL,Orlando Magic,22400756,2025-02-10,ORL vs. ATL,L,241,106,...,0.833,7,29,36,22,10,8,17,21,-6.0
13,22024,1610612760,OKC,Oklahoma City Thunder,22400761,2025-02-10,OKC vs. NOP,W,241,137,...,0.75,10,35,45,29,15,8,10,20,36.0


### Convert date to appropriate data type

In [5]:
# Parse GAME_DATE into datetimes so it can be compared and ranked
# chronologically, then print the resulting column dtypes.
games = games.assign(GAME_DATE=pd.to_datetime(games['GAME_DATE']))
games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14567 entries, 8 to 29999
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   SEASON_ID          14567 non-null  object        
 1   TEAM_ID            14567 non-null  int64         
 2   TEAM_ABBREVIATION  14567 non-null  object        
 3   TEAM_NAME          14567 non-null  object        
 4   GAME_ID            14567 non-null  object        
 5   GAME_DATE          14567 non-null  datetime64[ns]
 6   MATCHUP            14567 non-null  object        
 7   WL                 14559 non-null  object        
 8   MIN                14567 non-null  int64         
 9   PTS                14567 non-null  int64         
 10  FGM                14567 non-null  int64         
 11  FGA                14567 non-null  int64         
 12  FG_PCT             14564 non-null  float64       
 13  FG3M               14567 non-null  int64         
 14  FG3A       

In [6]:
# Worked example for one team: keep Atlanta's rows, then take the ten rows
# with the latest GAME_DATE.
team_games = games.loc[games['TEAM_ABBREVIATION'].eq('ATL')]
recent_games = team_games.nlargest(n=10, columns='GAME_DATE')

In [7]:
# Display the ten most recent ATL rows selected in the previous cell.
recent_games

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
33,22024,1610612737,ATL,Atlanta Hawks,22400756,2025-02-10,ATL @ ORL,W,240,112,...,0.857,6,33,39,29,10,3,16,17,6.0
64,22024,1610612737,ATL,Atlanta Hawks,22400744,2025-02-08,ATL @ WAS,W,240,125,...,0.765,8,33,41,36,6,7,11,19,14.0
96,22024,1610612737,ATL,Atlanta Hawks,22400736,2025-02-07,ATL vs. MIL,W,240,115,...,0.667,11,41,52,29,6,6,11,14,5.0
156,22024,1610612737,ATL,Atlanta Hawks,22400719,2025-02-05,ATL vs. SAS,L,240,125,...,0.722,8,37,45,33,5,5,20,14,-1.0
209,22024,1610612737,ATL,Atlanta Hawks,22400701,2025-02-03,ATL @ DET,W,240,132,...,0.821,5,32,37,27,8,7,12,27,2.0
255,22024,1610612737,ATL,Atlanta Hawks,22400686,2025-02-01,ATL @ IND,L,240,127,...,0.719,11,33,44,31,6,4,12,23,-5.0
324,22024,1610612737,ATL,Atlanta Hawks,22400675,2025-01-30,ATL @ CLE,L,240,115,...,0.741,10,26,36,28,10,1,11,18,-22.0
371,22024,1610612737,ATL,Atlanta Hawks,22400532,2025-01-28,ATL vs. HOU,L,240,96,...,0.813,9,31,40,19,11,5,16,18,-4.0
398,22024,1610612737,ATL,Atlanta Hawks,22400656,2025-01-27,ATL @ MIN,L,241,92,...,0.667,14,37,51,24,10,3,21,20,-8.0
461,22024,1610612737,ATL,Atlanta Hawks,22400639,2025-01-25,ATL vs. TOR,L,238,94,...,0.68,8,35,43,28,14,5,22,20,-23.0


### General structure of stats that I want to use for performance analysis

In [8]:
# Tally the W/L column; a missing key means that outcome never occurred.
record = recent_games['WL'].value_counts().to_dict()
wins, losses = record.get('W', 0), record.get('L', 0)

# Summary of the selected games. Points allowed is not a column of its own,
# so it is derived as average points scored minus average plus/minus.
stats = dict(
    avg_plus_minus=recent_games['PLUS_MINUS'].mean(),
    avg_points=recent_games['PTS'].mean(),
    avg_points_allowed=recent_games['PTS'].mean() - recent_games['PLUS_MINUS'].mean(),
    avg_fg_pct=recent_games['FG_PCT'].mean().round(3),
    avg_fg3_pct=recent_games['FG3_PCT'].mean().round(3),
    avg_ast=recent_games['AST'].mean(),
    record=f"{wins}-{losses}",
)

In [9]:
# Display the summary dictionary built in the previous cell.
stats

{'avg_plus_minus': -3.6,
 'avg_points': 113.3,
 'avg_points_allowed': 116.89999999999999,
 'avg_fg_pct': 0.466,
 'avg_fg3_pct': 0.348,
 'avg_ast': 28.4,
 'record': '4-6'}

### Functions that select a team's last n games (10 by default) and summarize them as a dictionary of stats

In [10]:
def last_n_games(games_df, team_abbrev, n=10):
    """Return the ``n`` most recent rows for one team.

    Parameters:
        games_df (DataFrame): Game log with ``TEAM_ABBREVIATION`` and
            ``GAME_DATE`` columns.
        team_abbrev (str): Abbreviation of the team to select.
        n (int): Maximum number of rows to return (default 10).

    Returns:
        DataFrame: The team's rows with the largest ``GAME_DATE`` values,
        latest first; fewer than ``n`` rows if the team has fewer games.
    """
    belongs_to_team = games_df['TEAM_ABBREVIATION'] == team_abbrev
    return games_df[belongs_to_team].nlargest(n, 'GAME_DATE')

def get_team_performance(games_df, team_abbrev, n=10):
    """Summarize a team's form over its ``n`` most recent games.

    Parameters:
        games_df (DataFrame): Game log passed through to ``last_n_games``.
        team_abbrev (str): Abbreviation of the team to summarize.
        n (int): Number of most recent games to include (default 10).

    Returns:
        dict: Averages rounded to two decimals under the keys
        ``avg_plus_minus``, ``avg_points``, ``avg_points_allowed``,
        ``avg_fg_pct``, ``avg_fg3_pct`` and ``avg_ast``, plus ``record`` as a
        ``"wins-losses"`` string. The two shooting percentages are scaled by
        100 before rounding.
    """
    window = last_n_games(games_df, team_abbrev, n)

    # Count outcomes; an outcome that never occurred is simply absent.
    outcome_counts = window['WL'].value_counts().to_dict()
    wins = outcome_counts.get('W', 0)
    losses = outcome_counts.get('L', 0)

    mean_points = window['PTS'].mean()
    mean_margin = window['PLUS_MINUS'].mean()

    summary = {}
    summary['avg_plus_minus'] = round(mean_margin, 2)
    summary['avg_points'] = round(mean_points, 2)
    # Opponent scoring is derived: points scored minus the scoring margin.
    summary['avg_points_allowed'] = round(mean_points - mean_margin, 2)
    summary['avg_fg_pct'] = round(window['FG_PCT'].mean() * 100, 2)
    summary['avg_fg3_pct'] = round(window['FG3_PCT'].mean() * 100, 2)
    summary['avg_ast'] = round(window['AST'].mean(), 2)
    summary['record'] = f"{wins}-{losses}"
    return summary

### Function that compares the performance of two teams side by side

In [11]:
def compare_teams(games_df, team1_abbrev, team2_abbrev, n=10):
    """Build a side-by-side table of two teams' recent performance.

    Parameters:
        games_df (DataFrame): Game log passed to ``get_team_performance``.
        team1_abbrev (str): Abbreviation used for the first column.
        team2_abbrev (str): Abbreviation used for the second column.
        n (int): Number of most recent games summarized per team (default 10).

    Returns:
        DataFrame: One column per team and one row per statistic, indexed by
        a human-readable label.
    """
    # (row label, key in the stats dict), in display order.
    layout = [
        ('Plus/Minus', 'avg_plus_minus'),
        ('Points', 'avg_points'),
        ('Points Allowed', 'avg_points_allowed'),
        ('FG%', 'avg_fg_pct'),
        ('3P%', 'avg_fg3_pct'),
        ('Assists', 'avg_ast'),
        ('Record', 'record'),
    ]

    columns = {}
    for abbrev in (team1_abbrev, team2_abbrev):
        team_stats = get_team_performance(games_df, abbrev, n)
        columns[f'{abbrev}'] = [team_stats[key] for _, key in layout]

    return pd.DataFrame(columns, index=[label for label, _ in layout])

In [12]:
# Compare the last ten games of OKC and BOS side by side.
comparison = compare_teams(games, team1_abbrev='OKC', team2_abbrev='BOS', n=10)
comparison

Unnamed: 0,OKC,BOS
Plus/Minus,16.1,6.9
Points,125.7,115.4
Points Allowed,109.6,108.5
FG%,48.31,46.96
3P%,35.76,39.33
Assists,26.4,27.6
Record,8-2,7-3


In [13]:
# Ten-game summaries for the two teams used in the feature example below.
team1_stats, team2_stats = (
    get_team_performance(games, abbrev, 10) for abbrev in ('OKC', 'BOS')
)

In [14]:
# One-row feature frame: each column is team1's average minus team2's.
# The column names match the ones produced by prepare_ml_features (in
# particular 'fg3_pct_diff'), so this frame has the same schema as the frames
# the scaler is fitted on.
features = pd.DataFrame({
  'pts_diff': [team1_stats['avg_points'] - team2_stats['avg_points']],
  'plus_minus_diff': [team1_stats['avg_plus_minus'] - team2_stats['avg_plus_minus']],
  'fg_pct_diff': [team1_stats['avg_fg_pct'] - team2_stats['avg_fg_pct']],
  'fg3_pct_diff': [team1_stats['avg_fg3_pct'] - team2_stats['avg_fg3_pct']],
  'ast_diff': [team1_stats['avg_ast'] - team2_stats['avg_ast']]
})

In [15]:
# Display the one-row feature frame built in the previous cell.
features

Unnamed: 0,pts_diff,plus_minus_diff,fg_pct_diff,fg_pct3_diff,ast_diff
0,10.3,9.2,1.35,-3.57,-1.2


### Function that prepares the features for two team comparison

In [16]:
def prepare_ml_features(games_df, team1, team2, n_games=10):
    """Build the one-row model input for a ``team1`` vs ``team2`` matchup.

    Each feature is team1's recent average minus team2's, taken from
    ``get_team_performance`` over the last ``n_games`` games of each team.

    Parameters:
        games_df (DataFrame): Game log used to compute both summaries.
        team1 (str): Abbreviation of the team the differences are taken for.
        team2 (str): Abbreviation of the opposing team.
        n_games (int): Number of recent games per team (default 10).

    Returns:
        DataFrame: A single row with columns ``pts_diff``,
        ``plus_minus_diff``, ``fg_pct_diff``, ``fg3_pct_diff`` and
        ``ast_diff``.
    """
    team1_stats = get_team_performance(games_df, team1, n_games)
    team2_stats = get_team_performance(games_df, team2, n_games)

    # (feature column, key in the per-team stats dict), in column order.
    feature_sources = (
        ('pts_diff', 'avg_points'),
        ('plus_minus_diff', 'avg_plus_minus'),
        ('fg_pct_diff', 'avg_fg_pct'),
        ('fg3_pct_diff', 'avg_fg3_pct'),
        ('ast_diff', 'avg_ast'),
    )
    row = {
        column: [team1_stats[key] - team2_stats[key]]
        for column, key in feature_sources
    }
    return pd.DataFrame(row)

### Function to train the model based on historical data (365 days)

In [17]:
def train_model(games_df, train_period_days=365):
    """Train a random-forest win classifier on recent historical matchups.

    One training sample is built per decided game played within
    ``train_period_days`` of the latest ``GAME_DATE`` in ``games_df``. The
    sample's features come from ``prepare_ml_features`` evaluated only on
    games strictly before that game's date, and its label is 1 if the
    reference team won and 0 if it lost.

    The reference team of a game is whichever of its rows appears first in
    ``games_df``; it is not necessarily the home side.

    Parameters:
        games_df (DataFrame): Game log with one row per team per game and a
            datetime ``GAME_DATE`` column.
        train_period_days (int): Length of the training window in days,
            counted back from the latest game date (default 365).

    Returns:
        tuple: ``(model, scaler)`` - the fitted ``RandomForestClassifier`` and
        the ``StandardScaler`` that must be applied to features before
        predicting.

    Raises:
        ValueError: If no usable training sample could be built.
    """
    cutoff_date = games_df['GAME_DATE'].max() - pd.Timedelta(days=train_period_days)
    recent_games = games_df[games_df['GAME_DATE'] >= cutoff_date]

    features_list = []
    results = []

    # Rows whose WL is neither 'W' nor 'L' have no known outcome; keeping them
    # would silently label them as losses, so they are excluded up front.
    decided_games = recent_games[recent_games['WL'].isin(['W', 'L'])]
    unique_games = decided_games.drop_duplicates(subset=['GAME_ID'])

    for _, game in unique_games.iterrows():
        team = game['TEAM_ABBREVIATION']
        # The other row sharing this GAME_ID identifies the opponent.
        opponent_rows = recent_games[
            (recent_games['GAME_ID'] == game['GAME_ID']) &
            (recent_games['TEAM_ABBREVIATION'] != team)
        ]
        if opponent_rows.empty:
            continue
        opponent = opponent_rows['TEAM_ABBREVIATION'].iloc[0]

        # Only games before this one may inform its features (no look-ahead).
        historical_games = games_df[games_df['GAME_DATE'] < game['GAME_DATE']]
        if len(historical_games) < 10:
            continue

        features = prepare_ml_features(historical_games, team, opponent)
        # A team with no earlier games yields NaN averages; such a row carries
        # no information, so it is skipped rather than fed to the model.
        if features.isna().values.any():
            continue

        features_list.append(features)
        results.append(1 if game['WL'] == 'W' else 0)

    if not features_list:
        raise ValueError(
            "No training samples could be built from the selected period"
        )

    # Every per-game frame is indexed 0; renumber so X has a unique index.
    X = pd.concat(features_list, ignore_index=True)
    y = pd.Series(results)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fixed seed so repeated training on the same data gives the same model.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_scaled, y)

    return model, scaler

### Function that predicts probability of a win for a given team against given opponent

In [18]:
def predict_game(games_df, team1, team2, model, scaler):
    """Estimate the probability that ``team1`` beats ``team2``.

    Parameters:
        games_df (DataFrame): Game log used to compute the matchup features.
        team1 (str): Abbreviation of the team the probability refers to.
        team2 (str): Abbreviation of the opposing team.
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted scaler exposing ``transform``.

    Returns:
        The second column of ``predict_proba`` for the single matchup row
        (presumably the probability of class 1, a team1 win - this assumes
        the model saw both classes during training).
    """
    matchup = prepare_ml_features(games_df, team1, team2)
    class_probabilities = model.predict_proba(scaler.transform(matchup))
    return class_probabilities[0][1]

In [19]:
# Fit on the default 365-day window, then score BOS (team1) against OKC (team2).
model, scaler = train_model(games)
win_probability = predict_game(games, team1='BOS', team2='OKC', model=model, scaler=scaler)
print(f"Win probability for BOS: {win_probability:.1%}")

Win probability for BOS: 21.0%


### Function that gives the probabilities for both teams to use for presenting game outcome

In [20]:
def predict_both_teams(games, team1, team2, model, scaler):
    """
    Return the win probability of each side of a matchup, in percent.

    Only team1's probability comes from the model; team2's is its complement,
    so the two values always sum to 100.

    Parameters:
        games (DataFrame): The dataset containing game data.
        team1 (str): The first team's abbreviation.
        team2 (str): The second team's abbreviation.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        tuple: ``(team1_percent, team2_percent)``.
    """
    team1_share = predict_game(games, team1, team2, model, scaler)
    team2_share = 1 - team1_share
    return team1_share * 100, team2_share * 100

### Function to predict winner of game

In [21]:
def determine_winner(games, team1, team2, model, scaler):
    """
    Pick the predicted winner of a matchup from the two win probabilities.

    Parameters:
        games (DataFrame): The dataset containing game data.
        team1 (str): The first team's abbreviation.
        team2 (str): The second team's abbreviation.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        dict: Keys ``team1``, ``team1_prob``, ``team2``, ``team2_prob`` and
        ``winner``. Probabilities are percentages. When neither side is
        strictly ahead the winner is drawn with ``np.random.choice``, which
        is not seeded here, so that case is not reproducible.
    """
    prob_team1, prob_team2 = predict_both_teams(games, team1, team2, model, scaler)

    team1_ahead = prob_team1 > prob_team2
    team2_ahead = prob_team2 > prob_team1

    if not (team1_ahead or team2_ahead):
        # Neither side is strictly ahead: fall back to a coin flip.
        winner = np.random.choice([team1, team2])
    else:
        winner = team1 if team1_ahead else team2

    outcome = {"team1": team1, "team1_prob": prob_team1}
    outcome["team2"] = team2
    outcome["team2_prob"] = prob_team2
    outcome["winner"] = winner
    return outcome

In [22]:
# Reuse the model and scaler fitted earlier in the notebook: `games` has not
# changed since then and the forest is seeded, so retraining here would only
# repeat the same slow fit and produce the same model.
result = determine_winner(games, 'BOS', 'OKC', model, scaler)

print(f"Win probability for {result['team1']}: {result['team1_prob']:.1f}%")
print(f"Win probability for {result['team2']}: {result['team2_prob']:.1f}%")
print(f"Predicted winner: {result['winner']}")

Win probability for BOS: 21.0%
Win probability for OKC: 79.0%
Predicted winner: OKC


In [23]:
# 12/22 games
# The two lists are parallel: position i of each forms one matchup.
# NOTE: both names are reassigned in a later cell before any prediction is run.
home_teams = ['SAC', 'TOR', 'NOP']
away_teams = ['IND', 'HOU', 'DEN']


### Function that goes through game slate for a given day

In [24]:
def predict_games(home_teams, away_teams, games, model, scaler):
    """
    Predict every game on a slate given parallel home/away lists.

    The lists are paired by position with ``zip``, so any surplus entries in
    the longer list are ignored. The home team is passed as team1.

    Parameters:
        home_teams (list): List of home teams.
        away_teams (list): List of away teams.
        games (DataFrame): The dataset containing game data.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        DataFrame: One row per game with columns ``home_team``, ``home_prob``,
        ``away_team``, ``away_prob`` and ``winner``.
    """
    def predict_one(home, away):
        # Reshape the generic team1/team2 result into home/away columns.
        prediction = determine_winner(games, home, away, model, scaler)
        return {
            "home_team": home,
            "home_prob": prediction["team1_prob"],
            "away_team": away,
            "away_prob": prediction["team2_prob"],
            "winner": prediction["winner"],
        }

    rows = [predict_one(home, away) for home, away in zip(home_teams, away_teams)]
    return pd.DataFrame(rows)

# Example
# Parallel lists: position i of each forms one matchup.
home_teams = ['OKC', 'LAL', 'PHI', 'ORL', 'CLE', 'CHA', 'NYK', 'MIA', 'ATL', 'MEM', 'CHI', 'DAL', 'GSW', 'DEN']
away_teams = ['WAS', 'DET', 'SAS', 'BOS', 'UTA', 'HOU', 'TOR', 'BKN', 'MIN', 'LAC', 'MIL', 'POR', 'IND', 'PHX']

# The model and scaler fitted earlier in the notebook are reused: `games` has
# not changed and training is seeded, so another fit would give the same model.
todays_games = predict_games(home_teams, away_teams, games, model, scaler)

In [25]:
# 12/23 games
# Display the slate predictions computed in the previous cell.
todays_games

Unnamed: 0,home_team,home_prob,away_team,away_prob,winner
0,OKC,78.0,WAS,22.0,OKC
1,LAL,63.0,DET,37.0,LAL
2,PHI,59.0,SAS,41.0,PHI
3,ORL,32.0,BOS,68.0,BOS
4,CLE,72.0,UTA,28.0,CLE
5,CHA,64.0,HOU,36.0,CHA
6,NYK,61.0,TOR,39.0,NYK
7,MIA,55.0,BKN,45.0,MIA
8,ATL,14.0,MIN,86.0,MIN
9,MEM,43.0,LAC,57.0,LAC


### Predicting the outcome of the Christmas slate

In [26]:
# # Christmas games
# home_teams_christmas = ['NYK', 'DAL', 'BOS', 'GSW', 'PHX']
# away_teams_christmas = ['SAS', 'MIN', 'PHI', 'LAL', 'DEN']
# christmas = predict_games(home_teams_christmas, away_teams_christmas, games, model, scaler)
# christmas

## Today's Games

In [27]:
# Current slate as parallel lists: position i of each forms one matchup.
home_teams = ['PHI', 'IND', 'CHI', 'PHX']
away_teams = ['TOR', 'NYK', 'DET', 'MEM']
outcomes = predict_games(
    home_teams=home_teams,
    away_teams=away_teams,
    games=games,
    model=model,
    scaler=scaler,
)
outcomes

Unnamed: 0,home_team,home_prob,away_team,away_prob,winner
0,PHI,68.0,TOR,32.0,PHI
1,IND,53.0,NYK,47.0,IND
2,CHI,44.0,DET,56.0,DET
3,PHX,57.0,MEM,43.0,PHX


Overall record in predicting winner starting Jan 27, 2025: 28-18