### Import necessary libaries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nba_api.stats.endpoints import leaguegamefinder

### Get game logs from NBA teams only using nba_api

In [3]:
games = leaguegamefinder.LeagueGameFinder().get_data_frames()[0]
nba_teams = ['ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']
games = games[games['TEAM_ABBREVIATION'].isin(nba_teams)]


In [4]:
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22024,1610612747,LAL,Los Angeles Lakers,22401038,2025-03-24,LAL @ ORL,L,240,106,...,0.786,11,28,39,19,7,1,10,20,-12.0
1,22024,1610612756,PHX,Phoenix Suns,22401043,2025-03-24,PHX vs. MIL,W,241,108,...,0.889,14,38,52,25,2,4,16,21,2.0
3,22024,1610612755,PHI,Philadelphia 76ers,22401041,2025-03-24,PHI @ NOP,L,240,99,...,0.895,12,25,37,19,11,6,11,16,-13.0
5,22024,1610612743,DEN,Denver Nuggets,22401042,2025-03-24,DEN vs. CHI,L,240,119,...,0.667,17,35,52,34,5,9,12,16,-10.0
6,22024,1610612740,NOP,New Orleans Pelicans,22401041,2025-03-24,NOP vs. PHI,W,241,112,...,0.846,13,38,51,36,11,5,16,17,13.0


### Convert date to appropriate data type

In [5]:
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])
games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14243 entries, 0 to 29999
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   SEASON_ID          14243 non-null  object        
 1   TEAM_ID            14243 non-null  int64         
 2   TEAM_ABBREVIATION  14243 non-null  object        
 3   TEAM_NAME          14243 non-null  object        
 4   GAME_ID            14243 non-null  object        
 5   GAME_DATE          14243 non-null  datetime64[ns]
 6   MATCHUP            14243 non-null  object        
 7   WL                 14235 non-null  object        
 8   MIN                14243 non-null  int64         
 9   PTS                14243 non-null  int64         
 10  FGM                14243 non-null  int64         
 11  FGA                14243 non-null  int64         
 12  FG_PCT             14240 non-null  float64       
 13  FG3M               14243 non-null  int64         
 14  FG3A       

In [6]:
team_games = games[games['TEAM_ABBREVIATION'] == 'ATL']
recent_games = team_games.nlargest(10, 'GAME_DATE')

In [7]:
recent_games

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
48,22024,1610612737,ATL,Atlanta Hawks,22401031,2025-03-23,ATL vs. PHI,W,240,132,...,0.864,10,29,39,35,11,4,16,26,13.0
56,22024,1610612737,ATL,Atlanta Hawks,22401025,2025-03-22,ATL vs. GSW,W,240,124,...,0.611,10,36,46,37,5,3,14,23,9.0
189,22024,1610612737,ATL,Atlanta Hawks,22400993,2025-03-18,ATL @ CHA,W,241,134,...,0.8,11,35,46,27,12,8,10,17,32.0
221,22024,1610612737,ATL,Atlanta Hawks,22400978,2025-03-16,ATL @ BKN,L,241,114,...,0.792,10,28,38,27,12,11,14,17,-8.0
296,22024,1610612737,ATL,Atlanta Hawks,22400960,2025-03-14,ATL vs. LAC,L,238,98,...,0.882,13,31,44,25,8,6,18,18,-23.0
342,22024,1610612737,ATL,Atlanta Hawks,22400945,2025-03-12,ATL vs. CHA,W,241,123,...,0.897,13,30,43,26,13,6,13,16,13.0
394,22024,1610612737,ATL,Atlanta Hawks,22400928,2025-03-10,ATL vs. PHI,W,240,132,...,0.786,14,34,48,34,4,6,15,18,9.0
464,22024,1610612737,ATL,Atlanta Hawks,22400914,2025-03-08,ATL vs. IND,W,239,120,...,0.952,16,32,48,23,10,10,16,22,2.0
509,22024,1610612737,ATL,Atlanta Hawks,22400899,2025-03-06,ATL vs. IND,W,241,124,...,0.778,11,32,43,32,5,4,8,16,6.0
560,22024,1610612737,ATL,Atlanta Hawks,22400884,2025-03-04,ATL vs. MIL,L,240,121,...,0.765,11,36,47,33,10,6,11,21,-6.0


### General structure of stats that I want to use for performance analysis

In [8]:
record = recent_games['WL'].value_counts().to_dict()
wins = record.get('W', 0)
losses = record.get('L', 0)

stats = {
        'avg_plus_minus': recent_games['PLUS_MINUS'].mean(),
        'avg_points': recent_games['PTS'].mean(),
        'avg_points_allowed': recent_games['PTS'].mean() - recent_games['PLUS_MINUS'].mean(),
        'avg_fg_pct': recent_games['FG_PCT'].mean().round(3),
        'avg_fg3_pct': recent_games['FG3_PCT'].mean().round(3),
        'avg_ast': recent_games['AST'].mean(),
        'record': f"{wins}-{losses}"
    }

In [9]:
stats

{'avg_plus_minus': 4.7,
 'avg_points': 122.2,
 'avg_points_allowed': 117.5,
 'avg_fg_pct': 0.496,
 'avg_fg3_pct': 0.375,
 'avg_ast': 29.9,
 'record': '7-3'}

### Functions that get all stats from last 10 games and then create an itemized stats dataframe for the respective team

In [10]:
def last_n_games(games_df, team_abbrev, n=10):
  team_games = games_df[games_df['TEAM_ABBREVIATION'] == team_abbrev]
  return team_games.nlargest(n, 'GAME_DATE')

def get_team_performance(games_df, team_abbrev, n=10):
  recent_games = last_n_games(games_df, team_abbrev, n)

  record = recent_games['WL'].value_counts().to_dict()
  wins = record.get('W', 0)
  losses = record.get('L', 0)

  stats = {
        'avg_plus_minus': round(recent_games['PLUS_MINUS'].mean(), 2),
        'avg_points': round(recent_games['PTS'].mean(), 2),
        'avg_points_allowed': round(recent_games['PTS'].mean() - recent_games['PLUS_MINUS'].mean(), 2),
        'avg_fg_pct': round(recent_games['FG_PCT'].mean() * 100, 2),
        'avg_fg3_pct': round(recent_games['FG3_PCT'].mean() * 100, 2),
        'avg_ast': round(recent_games['AST'].mean(), 2),
        'record': f"{wins}-{losses}"
    }

  return stats

### Function that compares the performance of two teams side by side

In [11]:
def compare_teams(games_df, team1_abbrev, team2_abbrev, n=10):
  team1_stats = get_team_performance(games_df, team1_abbrev, n)
  team2_stats = get_team_performance(games_df, team2_abbrev, n)

  comparison = pd.DataFrame({
    f'{team1_abbrev}': [
        team1_stats['avg_plus_minus'],
        team1_stats['avg_points'],
        team1_stats['avg_points_allowed'],
        team1_stats['avg_fg_pct'],
        team1_stats['avg_fg3_pct'],
        team1_stats['avg_ast'],
        team1_stats['record']
    ],
    f'{team2_abbrev}': [
        team2_stats['avg_plus_minus'],
        team2_stats['avg_points'],
        team2_stats['avg_points_allowed'],
        team2_stats['avg_fg_pct'],
        team2_stats['avg_fg3_pct'],
        team2_stats['avg_ast'],
        team2_stats['record']
    ]
    }, index=['Plus/Minus', 'Points', 'Points Allowed', 'FG%', '3P%', 'Assists', 'Record'])
  
  return comparison

In [12]:
comparison = compare_teams(games, 'OKC', 'BOS', 10)
comparison

Unnamed: 0,OKC,BOS
Plus/Minus,14.4,10.3
Points,121.0,114.5
Points Allowed,106.6,104.2
FG%,49.15,47.13
3P%,38.64,36.23
Assists,26.5,26.6
Record,9-1,9-1


In [13]:
team1_stats = get_team_performance(games, 'OKC', 10)
team2_stats = get_team_performance(games, 'BOS', 10)

In [14]:
features = pd.DataFrame({
  'pts_diff': [team1_stats['avg_points'] - team2_stats['avg_points']],
  'plus_minus_diff': [team1_stats['avg_plus_minus'] - team2_stats['avg_plus_minus']],
  'fg_pct_diff': [team1_stats['avg_fg_pct'] - team2_stats['avg_fg_pct']],
  'fg_pct3_diff': [team1_stats['avg_fg3_pct'] - team2_stats['avg_fg3_pct']],
  'ast_diff': [team1_stats['avg_ast'] - team2_stats['avg_ast']]
})

In [15]:
features

Unnamed: 0,pts_diff,plus_minus_diff,fg_pct_diff,fg_pct3_diff,ast_diff
0,6.5,4.1,2.02,2.41,-0.1


### Function that prepares the features for two team comparison

In [16]:
def prepare_ml_features(games_df, team1, team2, n_games=10):
    """Prepare features for ML model from team comparison stats"""
    team1_stats = get_team_performance(games_df, team1, n_games)
    team2_stats = get_team_performance(games_df, team2, n_games)
    
    features = pd.DataFrame({
        'pts_diff': [team1_stats['avg_points'] - team2_stats['avg_points']],
        'plus_minus_diff': [team1_stats['avg_plus_minus'] - team2_stats['avg_plus_minus']],
        'fg_pct_diff': [team1_stats['avg_fg_pct'] - team2_stats['avg_fg_pct']],
        'fg3_pct_diff': [team1_stats['avg_fg3_pct'] - team2_stats['avg_fg3_pct']],
        'ast_diff': [team1_stats['avg_ast'] - team2_stats['avg_ast']]
    })
    
    return features

### Function to train the model based on historical data (365 days)

In [17]:
def train_model(games_df, train_period_days=365):
    """Train ML model on historical matchups"""
    cutoff_date = games_df['GAME_DATE'].max() - pd.Timedelta(days=train_period_days)
    recent_games = games_df[games_df['GAME_DATE'] >= cutoff_date]
    
    features_list = []
    results = []
 
    unique_games = recent_games.drop_duplicates(subset=['GAME_ID'])
    
    for _, game in unique_games.iterrows():
        home_team = game['TEAM_ABBREVIATION']
        away_team_row = recent_games[
            (recent_games['GAME_ID'] == game['GAME_ID']) & 
            (recent_games['TEAM_ABBREVIATION'] != home_team)
        ]
        if away_team_row.empty:
            continue
        away_team = away_team_row['TEAM_ABBREVIATION'].iloc[0]

        historical_games = games_df[games_df['GAME_DATE'] < game['GAME_DATE']]
        if len(historical_games) < 10: 
            continue
            
        features = prepare_ml_features(historical_games, home_team, away_team)
        features_list.append(features)

        result = 1 if game['WL'] == 'W' else 0
        results.append(result)
    
    X = pd.concat(features_list)
    y = pd.Series(results)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_scaled, y)
    
    return model, scaler

### Function that predicts probability of a win for a given team against given opponent

In [18]:
def predict_game(games_df, team1, team2, model, scaler):
    """Predict outcome of game between two teams"""
    features = prepare_ml_features(games_df, team1, team2)
    features_scaled = scaler.transform(features)
    
    win_prob = model.predict_proba(features_scaled)[0][1]
    
    return win_prob

In [19]:
model, scaler = train_model(games)
win_probability = predict_game(games, 'BOS', 'OKC', model, scaler)
print(f"Win probability for BOS: {win_probability:.1%}")

Win probability for BOS: 25.0%


### Function that gives the probabilities for both teams to use for presenting game outcome

In [20]:
def predict_both_teams(games, team1, team2, model, scaler):
    """
    Predicts the win probabilities for both teams in a game.

    Parameters:
        games (DataFrame): The dataset containing game data.
        team1 (str): The first team's abbreviation.
        team2 (str): The second team's abbreviation.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        tuple: Win probabilities for team1 and team2 as percentages.
    """
    prob_team1 = predict_game(games, team1, team2, model, scaler)
    prob_team2 = 1 - prob_team1

    return prob_team1 * 100, prob_team2 * 100

### Function to predict winner of game

In [21]:
def determine_winner(games, team1, team2, model, scaler):
    """
    Determines the winner based on the win probabilities of two teams.

    Parameters:
        games (DataFrame): The dataset containing game data.
        team1 (str): The first team's abbreviation.
        team2 (str): The second team's abbreviation.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        dict: A dictionary with win probabilities for both teams and the predicted winner.
    """
    prob_team1, prob_team2 = predict_both_teams(games, team1, team2, model, scaler)
    
    if prob_team1 > prob_team2:
        winner = team1
    elif prob_team2 > prob_team1:
        winner = team2
    else:
        winner = np.random.choice([team1, team2])  # In case the probabilities are exactly equal

    return {
        "team1": team1,
        "team1_prob": prob_team1,
        "team2": team2,
        "team2_prob": prob_team2,
        "winner": winner
    }

In [22]:
model, scaler = train_model(games)
result = determine_winner(games, 'BOS', 'OKC', model, scaler)

print(f"Win probability for {result['team1']}: {result['team1_prob']:.1f}%")
print(f"Win probability for {result['team2']}: {result['team2_prob']:.1f}%")
print(f"Predicted winner: {result['winner']}")

Win probability for BOS: 25.0%
Win probability for OKC: 75.0%
Predicted winner: OKC


### Function that goes through game slate for a given day

In [23]:
def predict_games(home_teams, away_teams, games, model, scaler):
    """
    Predicts the winners for a list of home and away teams.

    Parameters:
        home_teams (list): List of home teams.
        away_teams (list): List of away teams.
        games (DataFrame): The dataset containing game data.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        DataFrame: A DataFrame with the game predictions and predicted winners.
    """
    results = []

    for home, away in zip(home_teams, away_teams):
        prediction = determine_winner(games, home, away, model, scaler)
        
        results.append({
            "home_team": home,
            "home_prob": prediction["team1_prob"],
            "away_team": away,
            "away_prob": prediction["team2_prob"],
            "winner": prediction["winner"]
        })
    
    results_df = pd.DataFrame(results)
    return results_df

# Example
home_teams = ['OKC', 'LAL', 'PHI', 'ORL', 'CLE', 'CHA', 'NYK', 'MIA', 'ATL', 'MEM', 'CHI', 'DAL', 'GSW', 'DEN']
away_teams = ['WAS', 'DET', 'SAS', 'BOS', 'UTA', 'HOU', 'TOR', 'BKN', 'MIN', 'LAC', 'MIL', 'POR', 'IND', 'PHX']
model, scaler = train_model(games)

todays_games = predict_games(home_teams, away_teams, games, model, scaler)

In [24]:
# 12/23 games
todays_games

Unnamed: 0,home_team,home_prob,away_team,away_prob,winner
0,OKC,54.0,WAS,46.0,OKC
1,LAL,22.333333,DET,77.666667,DET
2,PHI,34.0,SAS,66.0,SAS
3,ORL,44.0,BOS,56.0,BOS
4,CLE,91.0,UTA,9.0,CLE
5,CHA,8.0,HOU,92.0,HOU
6,NYK,56.0,TOR,44.0,NYK
7,MIA,34.0,BKN,66.0,BKN
8,ATL,28.0,MIN,72.0,MIN
9,MEM,29.0,LAC,71.0,LAC


### Predicting the outcome of the Christmas slate

In [25]:
# # Christmas games
# home_teams_christmas = ['NYK', 'DAL', 'BOS', 'GSW', 'PHX']
# away_teams_christmas = ['SAS', 'MIN', 'PHI', 'LAL', 'DEN']
# christmas = predict_games(home_teams_christmas, away_teams_christmas, games, model, scaler)
# christmas

## Today's Games

In [26]:
home_teams = ['DET', 'CHA', 'NYK', 'MIA', 'HOU', 'UTA', 'POR', 'SAC']
away_teams = ['SAS', 'ORL', 'DAL', 'GSW', 'ATL', 'MEM', 'CLE', 'OKC']
outcomes = predict_games(home_teams, away_teams, games, model, scaler)
outcomes

Unnamed: 0,home_team,home_prob,away_team,away_prob,winner
0,DET,62.0,SAS,38.0,DET
1,CHA,23.0,ORL,77.0,ORL
2,NYK,41.9,DAL,58.1,DAL
3,MIA,45.0,GSW,55.0,GSW
4,HOU,58.0,ATL,42.0,HOU
5,UTA,21.0,MEM,79.0,MEM
6,POR,65.0,CLE,35.0,POR
7,SAC,15.0,OKC,85.0,OKC


Overall record in predicting winner starting Jan 27, 2025: 148-95