### Import necessary libaries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nba_api.stats.endpoints import leaguegamefinder

### Get game logs from NBA teams only using nba_api

In [2]:
games = leaguegamefinder.LeagueGameFinder().get_data_frames()[0]
nba_teams = ['ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']
games = games[games['TEAM_ABBREVIATION'].isin(nba_teams)]


In [3]:
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22024,1610612745,HOU,Houston Rockets,22400391,2024-12-23,HOU @ CHA,W,241,114,...,0.789,21,39,60,23,4,8,11,16,13.0
1,22024,1610612743,DEN,Denver Nuggets,22400402,2024-12-23,DEN vs. PHX,W,241,117,...,0.818,11,35,46,35,6,4,12,15,27.0
2,22024,1610612738,BOS,Boston Celtics,22400393,2024-12-23,BOS @ ORL,L,239,104,...,0.815,8,33,41,13,13,5,18,16,-4.0
3,22024,1610612761,TOR,Toronto Raptors,22400397,2024-12-23,TOR @ NYK,L,239,125,...,0.811,4,26,30,28,6,3,15,25,-14.0
4,22024,1610612739,CLE,Cleveland Cavaliers,22400392,2024-12-23,CLE vs. UTA,W,241,124,...,0.783,10,30,40,33,10,3,9,23,11.0


### Convert date to appropriate data type

In [4]:
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])
games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14634 entries, 0 to 29999
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   SEASON_ID          14634 non-null  object        
 1   TEAM_ID            14634 non-null  int64         
 2   TEAM_ABBREVIATION  14634 non-null  object        
 3   TEAM_NAME          14634 non-null  object        
 4   GAME_ID            14634 non-null  object        
 5   GAME_DATE          14634 non-null  datetime64[ns]
 6   MATCHUP            14634 non-null  object        
 7   WL                 14626 non-null  object        
 8   MIN                14634 non-null  int64         
 9   PTS                14634 non-null  int64         
 10  FGM                14634 non-null  int64         
 11  FGA                14634 non-null  int64         
 12  FG_PCT             14631 non-null  float64       
 13  FG3M               14634 non-null  int64         
 14  FG3A       

In [5]:
team_games = games[games['TEAM_ABBREVIATION'] == 'ATL']
recent_games = team_games.nlargest(10, 'GAME_DATE')

In [6]:
recent_games

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
15,22024,1610612737,ATL,Atlanta Hawks,22400395,2024-12-23,ATL vs. MIN,W,239,117,...,0.808,6,32,38,27,15,6,21,14,13.0
66,22024,1610612737,ATL,Atlanta Hawks,22400378,2024-12-21,ATL vs. MEM,L,239,112,...,0.833,6,35,41,29,11,6,25,14,-16.0
112,22024,1610612737,ATL,Atlanta Hawks,22400370,2024-12-19,ATL @ SAS,L,266,126,...,0.857,12,41,53,32,14,6,22,22,-7.0
207,22024,1610612737,ATL,Atlanta Hawks,22401229,2024-12-14,ATL @ MIL,L,241,102,...,0.647,7,37,44,23,3,6,13,20,-8.0
270,22024,1610612737,ATL,Atlanta Hawks,22401202,2024-12-11,ATL @ NYK,W,239,108,...,0.65,22,36,58,27,5,7,10,13,8.0
339,22024,1610612737,ATL,Atlanta Hawks,22400350,2024-12-08,ATL vs. DEN,L,239,111,...,0.741,19,26,45,26,12,2,12,22,-30.0
396,22024,1610612737,ATL,Atlanta Hawks,22400334,2024-12-06,ATL vs. LAL,W,265,134,...,0.773,11,37,48,40,10,5,17,24,2.0
437,22024,1610612737,ATL,Atlanta Hawks,22400323,2024-12-04,ATL @ MIL,W,240,119,...,0.909,15,39,54,29,16,3,11,25,15.0
487,22024,1610612737,ATL,Atlanta Hawks,22400315,2024-12-02,ATL vs. NOP,W,241,124,...,0.8,11,40,51,30,12,4,18,15,12.0
552,22024,1610612737,ATL,Atlanta Hawks,22400300,2024-11-30,ATL @ CHA,W,240,107,...,0.778,14,35,49,27,8,7,20,15,3.0


### General structure of stats that I want to use for performance analysis

In [7]:
record = recent_games['WL'].value_counts().to_dict()
wins = record.get('W', 0)
losses = record.get('L', 0)

stats = {
        'avg_plus_minus': recent_games['PLUS_MINUS'].mean(),
        'avg_points': recent_games['PTS'].mean(),
        'avg_points_allowed': recent_games['PTS'].mean() - recent_games['PLUS_MINUS'].mean(),
        'avg_fg_pct': recent_games['FG_PCT'].mean().round(3),
        'avg_fg3_pct': recent_games['FG3_PCT'].mean().round(3),
        'avg_ast': recent_games['AST'].mean(),
        'record': f"{wins}-{losses}"
    }

In [8]:
stats

{'avg_plus_minus': -0.8,
 'avg_points': 116.0,
 'avg_points_allowed': 116.8,
 'avg_fg_pct': 0.468,
 'avg_fg3_pct': 0.349,
 'avg_ast': 29.0,
 'record': '6-4'}

### Functions that get all stats from last 10 games and then create an itemized stats dataframe for the respective team

In [9]:
def last_n_games(games_df, team_abbrev, n=10):
  team_games = games_df[games_df['TEAM_ABBREVIATION'] == team_abbrev]
  return team_games.nlargest(n, 'GAME_DATE')

def get_team_performance(games_df, team_abbrev, n=10):
  recent_games = last_n_games(games_df, team_abbrev, n)

  record = recent_games['WL'].value_counts().to_dict()
  wins = record.get('W', 0)
  losses = record.get('L', 0)

  stats = {
        'avg_plus_minus': round(recent_games['PLUS_MINUS'].mean(), 2),
        'avg_points': round(recent_games['PTS'].mean(), 2),
        'avg_points_allowed': round(recent_games['PTS'].mean() - recent_games['PLUS_MINUS'].mean(), 2),
        'avg_fg_pct': round(recent_games['FG_PCT'].mean() * 100, 2),
        'avg_fg3_pct': round(recent_games['FG3_PCT'].mean() * 100, 2),
        'avg_ast': round(recent_games['AST'].mean(), 2),
        'record': f"{wins}-{losses}"
    }

  return stats

### Function that compares the performance of two teams side by side

In [10]:
def compare_teams(games_df, team1_abbrev, team2_abbrev, n=10):
  team1_stats = get_team_performance(games_df, team1_abbrev, n)
  team2_stats = get_team_performance(games_df, team2_abbrev, n)

  comparison = pd.DataFrame({
    f'{team1_abbrev}': [
        team1_stats['avg_plus_minus'],
        team1_stats['avg_points'],
        team1_stats['avg_points_allowed'],
        team1_stats['avg_fg_pct'],
        team1_stats['avg_fg3_pct'],
        team1_stats['avg_ast'],
        team1_stats['record']
    ],
    f'{team2_abbrev}': [
        team2_stats['avg_plus_minus'],
        team2_stats['avg_points'],
        team2_stats['avg_points_allowed'],
        team2_stats['avg_fg_pct'],
        team2_stats['avg_fg3_pct'],
        team2_stats['avg_ast'],
        team2_stats['record']
    ]
    }, index=['Plus/Minus', 'Points', 'Points Allowed', 'FG%', '3P%', 'Assists', 'Record'])
  
  return comparison

In [11]:
comparison = compare_teams(games, 'OKC', 'WAS', 10)
comparison

Unnamed: 0,OKC,WAS
Plus/Minus,11.5,-14.0
Points,113.9,106.8
Points Allowed,102.4,120.8
FG%,45.96,43.8
3P%,33.53,32.19
Assists,25.0,24.2
Record,8-2,2-8


In [12]:
team1_stats = get_team_performance(games, 'OKC', 10)
team2_stats = get_team_performance(games, 'WAS', 10)

In [13]:
features = pd.DataFrame({
  'pts_diff': [team1_stats['avg_points'] - team2_stats['avg_points']],
  'plus_minus_diff': [team1_stats['avg_plus_minus'] - team2_stats['avg_plus_minus']],
  'fg_pct_diff': [team1_stats['avg_fg_pct'] - team2_stats['avg_fg_pct']],
  'fg_pct3_diff': [team1_stats['avg_fg3_pct'] - team2_stats['avg_fg3_pct']],
  'ast_diff': [team1_stats['avg_ast'] - team2_stats['avg_ast']]
})

In [14]:
features

Unnamed: 0,pts_diff,plus_minus_diff,fg_pct_diff,fg_pct3_diff,ast_diff
0,7.1,25.5,2.16,1.34,0.8


### Function that prepares the features for two team comparison

In [15]:
def prepare_ml_features(games_df, team1, team2, n_games=10):
    """Prepare features for ML model from team comparison stats"""
    team1_stats = get_team_performance(games_df, team1, n_games)
    team2_stats = get_team_performance(games_df, team2, n_games)
    
    features = pd.DataFrame({
        'pts_diff': [team1_stats['avg_points'] - team2_stats['avg_points']],
        'plus_minus_diff': [team1_stats['avg_plus_minus'] - team2_stats['avg_plus_minus']],
        'fg_pct_diff': [team1_stats['avg_fg_pct'] - team2_stats['avg_fg_pct']],
        'fg3_pct_diff': [team1_stats['avg_fg3_pct'] - team2_stats['avg_fg3_pct']],
        'ast_diff': [team1_stats['avg_ast'] - team2_stats['avg_ast']]
    })
    
    return features

### Function to train the model based on historical data (365 days)

In [16]:
def train_model(games_df, train_period_days=365):
    """Train ML model on historical matchups"""
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import StandardScaler
    
    cutoff_date = games_df['GAME_DATE'].max() - pd.Timedelta(days=train_period_days)
    recent_games = games_df[games_df['GAME_DATE'] >= cutoff_date]
    
    features_list = []
    results = []
 
    unique_games = recent_games.drop_duplicates(subset=['GAME_ID'])
    
    for _, game in unique_games.iterrows():
        home_team = game['TEAM_ABBREVIATION']
        away_team_row = recent_games[
            (recent_games['GAME_ID'] == game['GAME_ID']) & 
            (recent_games['TEAM_ABBREVIATION'] != home_team)
        ]
        if away_team_row.empty:
            continue
        away_team = away_team_row['TEAM_ABBREVIATION'].iloc[0]

        historical_games = games_df[games_df['GAME_DATE'] < game['GAME_DATE']]
        if len(historical_games) < 10: 
            continue
            
        features = prepare_ml_features(historical_games, home_team, away_team)
        features_list.append(features)

        result = 1 if game['WL'] == 'W' else 0
        results.append(result)
    
    X = pd.concat(features_list)
    y = pd.Series(results)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_scaled, y)
    
    return model, scaler

### Function that predicts probability of a win for a given team against given opponent

In [17]:
def predict_game(games_df, team1, team2, model, scaler):
    """Predict outcome of game between two teams"""
    features = prepare_ml_features(games_df, team1, team2)
    features_scaled = scaler.transform(features)
    
    win_prob = model.predict_proba(features_scaled)[0][1]
    
    return win_prob

In [18]:
model, scaler = train_model(games)
win_probability = predict_game(games, 'WAS', 'OKC', model, scaler)
print(f"Win probability for WAS: {win_probability:.1%}")

Win probability for WAS: 21.0%


### Function that gives the probabilities for both teams to use for presenting game outcome

In [19]:
def predict_both_teams(games, team1, team2, model, scaler):
    """
    Predicts the win probabilities for both teams in a game.

    Parameters:
        games (DataFrame): The dataset containing game data.
        team1 (str): The first team's abbreviation.
        team2 (str): The second team's abbreviation.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        tuple: Win probabilities for team1 and team2 as percentages.
    """
    prob_team1 = predict_game(games, team1, team2, model, scaler)
    prob_team2 = 1 - prob_team1

    return prob_team1 * 100, prob_team2 * 100

### Function to predict winner of game

In [20]:
def determine_winner(games, team1, team2, model, scaler):
    """
    Determines the winner based on the win probabilities of two teams.

    Parameters:
        games (DataFrame): The dataset containing game data.
        team1 (str): The first team's abbreviation.
        team2 (str): The second team's abbreviation.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        dict: A dictionary with win probabilities for both teams and the predicted winner.
    """
    prob_team1, prob_team2 = predict_both_teams(games, team1, team2, model, scaler)
    
    if prob_team1 > prob_team2:
        winner = team1
    elif prob_team2 > prob_team1:
        winner = team2
    else:
        winner = np.random.choice([team1, team2])  # In case the probabilities are exactly equal

    return {
        "team1": team1,
        "team1_prob": prob_team1,
        "team2": team2,
        "team2_prob": prob_team2,
        "winner": winner
    }

In [21]:
model, scaler = train_model(games)
result = determine_winner(games, 'WAS', 'OKC', model, scaler)

print(f"Win probability for {result['team1']}: {result['team1_prob']:.1f}%")
print(f"Win probability for {result['team2']}: {result['team2_prob']:.1f}%")
print(f"Predicted winner: {result['winner']}")

Win probability for WAS: 21.0%
Win probability for OKC: 79.0%
Predicted winner: OKC


In [22]:
# 12/22 games
home_teams = ['SAC', 'TOR', 'NOP']
away_teams = ['IND', 'HOU', 'DEN']


### Function that goes through game slate for a given day

In [23]:
def predict_games(home_teams, away_teams, games, model, scaler):
    """
    Predicts the winners for a list of home and away teams.

    Parameters:
        home_teams (list): List of home teams.
        away_teams (list): List of away teams.
        games (DataFrame): The dataset containing game data.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        DataFrame: A DataFrame with the game predictions and predicted winners.
    """
    results = []

    for home, away in zip(home_teams, away_teams):
        prediction = determine_winner(games, home, away, model, scaler)
        
        results.append({
            "home_team": home,
            "home_prob": prediction["team1_prob"],
            "away_team": away,
            "away_prob": prediction["team2_prob"],
            "winner": prediction["winner"]
        })
    
    results_df = pd.DataFrame(results)
    return results_df

# Example
home_teams = ['OKC', 'LAL', 'PHI', 'ORL', 'CLE', 'CHA', 'NYK', 'MIA', 'ATL', 'MEM', 'CHI', 'DAL', 'GSW', 'DEN']
away_teams = ['WAS', 'DET', 'SAS', 'BOS', 'UTA', 'HOU', 'TOR', 'BKN', 'MIN', 'LAC', 'MIL', 'POR', 'IND', 'PHX']
model, scaler = train_model(games)

todays_games = predict_games(home_teams, away_teams, games, model, scaler)

### Predicting the outcome of the Christmas slate

In [26]:
# Christmas games
home_teams_christmas = ['NYK', 'DAL', 'BOS', 'GSW', 'PHX']
away_teams_christmas = ['SAS', 'MIN', 'PHI', 'LAL', 'DEN']
christmas = predict_games(home_teams_christmas, away_teams_christmas, games, model, scaler)
christmas

Unnamed: 0,home_team,home_prob,away_team,away_prob,winner
0,NYK,55.0,SAS,45.0,NYK
1,DAL,59.0,MIN,41.0,DAL
2,BOS,72.0,PHI,28.0,BOS
3,GSW,34.0,LAL,66.0,LAL
4,PHX,37.0,DEN,63.0,DEN
