### Import necessary libaries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nba_api.stats.endpoints import leaguegamefinder

### Get game logs from NBA teams only using nba_api

In [4]:
games = leaguegamefinder.LeagueGameFinder().get_data_frames()[0]
nba_teams = ['ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']
games = games[games['TEAM_ABBREVIATION'].isin(nba_teams)]


In [5]:
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
2,22024,1610612744,GSW,Golden State Warriors,22400604,2025-01-20,GSW vs. BOS,L,240,85,...,0.636,10,28,38,28,8,3,12,12,-40.0
3,22024,1610612741,CHI,Chicago Bulls,22400611,2025-01-20,CHI @ LAC,W,240,112,...,0.808,17,37,54,25,7,6,10,17,13.0
4,22024,1610612756,PHX,Phoenix Suns,22400603,2025-01-20,PHX @ CLE,L,242,92,...,0.769,1,33,34,22,3,5,17,20,-26.0
5,22024,1610612742,DAL,Dallas Mavericks,22400599,2025-01-20,DAL @ CHA,L,240,105,...,0.885,16,35,51,19,10,8,14,15,-5.0
7,22024,1610612763,MEM,Memphis Grizzlies,22400601,2025-01-20,MEM vs. MIN,W,241,108,...,0.794,13,34,47,19,9,4,13,22,2.0


### Convert date to appropriate data type

In [6]:
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])
games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14639 entries, 2 to 29999
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   SEASON_ID          14639 non-null  object        
 1   TEAM_ID            14639 non-null  int64         
 2   TEAM_ABBREVIATION  14639 non-null  object        
 3   TEAM_NAME          14639 non-null  object        
 4   GAME_ID            14639 non-null  object        
 5   GAME_DATE          14639 non-null  datetime64[ns]
 6   MATCHUP            14639 non-null  object        
 7   WL                 14631 non-null  object        
 8   MIN                14639 non-null  int64         
 9   PTS                14639 non-null  int64         
 10  FGM                14639 non-null  int64         
 11  FGA                14639 non-null  int64         
 12  FG_PCT             14636 non-null  float64       
 13  FG3M               14639 non-null  int64         
 14  FG3A       

In [7]:
team_games = games[games['TEAM_ABBREVIATION'] == 'ATL']
recent_games = team_games.nlargest(10, 'GAME_DATE')

In [8]:
recent_games

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
21,22024,1610612737,ATL,Atlanta Hawks,22400602,2025-01-20,ATL @ NYK,L,239,110,...,0.773,16,25,41,23,10,6,23,21,-9.0
59,22024,1610612737,ATL,Atlanta Hawks,22400587,2025-01-18,ATL @ BOS,W,265,119,...,0.786,13,42,55,27,9,10,17,17,4.0
137,22024,1610612737,ATL,Atlanta Hawks,22400563,2025-01-15,ATL @ CHI,W,240,110,...,0.682,14,37,51,28,11,5,9,15,16.0
167,22024,1610612737,ATL,Atlanta Hawks,22400556,2025-01-14,ATL vs. PHX,W,241,122,...,0.731,20,34,54,22,10,3,12,21,5.0
298,22024,1610612737,ATL,Atlanta Hawks,22400522,2025-01-09,ATL @ PHX,L,239,115,...,0.941,8,24,32,32,11,3,11,19,-8.0
342,22024,1610612737,ATL,Atlanta Hawks,22400506,2025-01-07,ATL @ UTA,W,240,124,...,0.8,12,32,44,35,6,11,10,22,3.0
431,22024,1610612737,ATL,Atlanta Hawks,22400486,2025-01-04,ATL @ LAC,L,240,105,...,0.765,8,35,43,30,11,4,22,12,-26.0
478,22024,1610612737,ATL,Atlanta Hawks,22400477,2025-01-03,ATL @ LAL,L,240,102,...,0.778,13,26,39,22,10,3,8,22,-17.0
526,22024,1610612737,ATL,Atlanta Hawks,22400461,2025-01-01,ATL @ DEN,L,240,120,...,0.667,12,32,44,34,10,5,13,16,-19.0
588,22024,1610612737,ATL,Atlanta Hawks,22400438,2024-12-29,ATL @ TOR,W,240,136,...,0.743,14,26,40,30,22,8,14,19,29.0


### General structure of stats that I want to use for performance analysis

In [9]:
record = recent_games['WL'].value_counts().to_dict()
wins = record.get('W', 0)
losses = record.get('L', 0)

stats = {
        'avg_plus_minus': recent_games['PLUS_MINUS'].mean(),
        'avg_points': recent_games['PTS'].mean(),
        'avg_points_allowed': recent_games['PTS'].mean() - recent_games['PLUS_MINUS'].mean(),
        'avg_fg_pct': recent_games['FG_PCT'].mean().round(3),
        'avg_fg3_pct': recent_games['FG3_PCT'].mean().round(3),
        'avg_ast': recent_games['AST'].mean(),
        'record': f"{wins}-{losses}"
    }

In [10]:
stats

{'avg_plus_minus': -2.2,
 'avg_points': 116.3,
 'avg_points_allowed': 118.5,
 'avg_fg_pct': 0.454,
 'avg_fg3_pct': 0.354,
 'avg_ast': 28.3,
 'record': '5-5'}

### Functions that get all stats from last 10 games and then create an itemized stats dataframe for the respective team

In [11]:
def last_n_games(games_df, team_abbrev, n=10):
  team_games = games_df[games_df['TEAM_ABBREVIATION'] == team_abbrev]
  return team_games.nlargest(n, 'GAME_DATE')

def get_team_performance(games_df, team_abbrev, n=10):
  recent_games = last_n_games(games_df, team_abbrev, n)

  record = recent_games['WL'].value_counts().to_dict()
  wins = record.get('W', 0)
  losses = record.get('L', 0)

  stats = {
        'avg_plus_minus': round(recent_games['PLUS_MINUS'].mean(), 2),
        'avg_points': round(recent_games['PTS'].mean(), 2),
        'avg_points_allowed': round(recent_games['PTS'].mean() - recent_games['PLUS_MINUS'].mean(), 2),
        'avg_fg_pct': round(recent_games['FG_PCT'].mean() * 100, 2),
        'avg_fg3_pct': round(recent_games['FG3_PCT'].mean() * 100, 2),
        'avg_ast': round(recent_games['AST'].mean(), 2),
        'record': f"{wins}-{losses}"
    }

  return stats

### Function that compares the performance of two teams side by side

In [12]:
def compare_teams(games_df, team1_abbrev, team2_abbrev, n=10):
  team1_stats = get_team_performance(games_df, team1_abbrev, n)
  team2_stats = get_team_performance(games_df, team2_abbrev, n)

  comparison = pd.DataFrame({
    f'{team1_abbrev}': [
        team1_stats['avg_plus_minus'],
        team1_stats['avg_points'],
        team1_stats['avg_points_allowed'],
        team1_stats['avg_fg_pct'],
        team1_stats['avg_fg3_pct'],
        team1_stats['avg_ast'],
        team1_stats['record']
    ],
    f'{team2_abbrev}': [
        team2_stats['avg_plus_minus'],
        team2_stats['avg_points'],
        team2_stats['avg_points_allowed'],
        team2_stats['avg_fg_pct'],
        team2_stats['avg_fg3_pct'],
        team2_stats['avg_ast'],
        team2_stats['record']
    ]
    }, index=['Plus/Minus', 'Points', 'Points Allowed', 'FG%', '3P%', 'Assists', 'Record'])
  
  return comparison

In [13]:
comparison = compare_teams(games, 'OKC', 'BOS', 10)
comparison

Unnamed: 0,OKC,BOS
Plus/Minus,15.4,5.9
Points,119.9,111.2
Points Allowed,104.5,105.3
FG%,50.04,45.51
3P%,41.03,34.87
Assists,27.7,24.7
Record,8-2,6-4


In [14]:
team1_stats = get_team_performance(games, 'OKC', 10)
team2_stats = get_team_performance(games, 'BOS', 10)

In [15]:
features = pd.DataFrame({
  'pts_diff': [team1_stats['avg_points'] - team2_stats['avg_points']],
  'plus_minus_diff': [team1_stats['avg_plus_minus'] - team2_stats['avg_plus_minus']],
  'fg_pct_diff': [team1_stats['avg_fg_pct'] - team2_stats['avg_fg_pct']],
  'fg_pct3_diff': [team1_stats['avg_fg3_pct'] - team2_stats['avg_fg3_pct']],
  'ast_diff': [team1_stats['avg_ast'] - team2_stats['avg_ast']]
})

In [16]:
features

Unnamed: 0,pts_diff,plus_minus_diff,fg_pct_diff,fg_pct3_diff,ast_diff
0,8.7,9.5,4.53,6.16,3.0


### Function that prepares the features for two team comparison

In [17]:
def prepare_ml_features(games_df, team1, team2, n_games=10):
    """Prepare features for ML model from team comparison stats"""
    team1_stats = get_team_performance(games_df, team1, n_games)
    team2_stats = get_team_performance(games_df, team2, n_games)
    
    features = pd.DataFrame({
        'pts_diff': [team1_stats['avg_points'] - team2_stats['avg_points']],
        'plus_minus_diff': [team1_stats['avg_plus_minus'] - team2_stats['avg_plus_minus']],
        'fg_pct_diff': [team1_stats['avg_fg_pct'] - team2_stats['avg_fg_pct']],
        'fg3_pct_diff': [team1_stats['avg_fg3_pct'] - team2_stats['avg_fg3_pct']],
        'ast_diff': [team1_stats['avg_ast'] - team2_stats['avg_ast']]
    })
    
    return features

### Function to train the model based on historical data (365 days)

In [18]:
def train_model(games_df, train_period_days=365):
    """Train ML model on historical matchups"""
    cutoff_date = games_df['GAME_DATE'].max() - pd.Timedelta(days=train_period_days)
    recent_games = games_df[games_df['GAME_DATE'] >= cutoff_date]
    
    features_list = []
    results = []
 
    unique_games = recent_games.drop_duplicates(subset=['GAME_ID'])
    
    for _, game in unique_games.iterrows():
        home_team = game['TEAM_ABBREVIATION']
        away_team_row = recent_games[
            (recent_games['GAME_ID'] == game['GAME_ID']) & 
            (recent_games['TEAM_ABBREVIATION'] != home_team)
        ]
        if away_team_row.empty:
            continue
        away_team = away_team_row['TEAM_ABBREVIATION'].iloc[0]

        historical_games = games_df[games_df['GAME_DATE'] < game['GAME_DATE']]
        if len(historical_games) < 10: 
            continue
            
        features = prepare_ml_features(historical_games, home_team, away_team)
        features_list.append(features)

        result = 1 if game['WL'] == 'W' else 0
        results.append(result)
    
    X = pd.concat(features_list)
    y = pd.Series(results)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_scaled, y)
    
    return model, scaler

### Function that predicts probability of a win for a given team against given opponent

In [19]:
def predict_game(games_df, team1, team2, model, scaler):
    """Predict outcome of game between two teams"""
    features = prepare_ml_features(games_df, team1, team2)
    features_scaled = scaler.transform(features)
    
    win_prob = model.predict_proba(features_scaled)[0][1]
    
    return win_prob

In [20]:
model, scaler = train_model(games)
win_probability = predict_game(games, 'BOS', 'OKC', model, scaler)
print(f"Win probability for BOS: {win_probability:.1%}")

Win probability for BOS: 13.0%


### Function that gives the probabilities for both teams to use for presenting game outcome

In [21]:
def predict_both_teams(games, team1, team2, model, scaler):
    """
    Predicts the win probabilities for both teams in a game.

    Parameters:
        games (DataFrame): The dataset containing game data.
        team1 (str): The first team's abbreviation.
        team2 (str): The second team's abbreviation.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        tuple: Win probabilities for team1 and team2 as percentages.
    """
    prob_team1 = predict_game(games, team1, team2, model, scaler)
    prob_team2 = 1 - prob_team1

    return prob_team1 * 100, prob_team2 * 100

### Function to predict winner of game

In [22]:
def determine_winner(games, team1, team2, model, scaler):
    """
    Determines the winner based on the win probabilities of two teams.

    Parameters:
        games (DataFrame): The dataset containing game data.
        team1 (str): The first team's abbreviation.
        team2 (str): The second team's abbreviation.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        dict: A dictionary with win probabilities for both teams and the predicted winner.
    """
    prob_team1, prob_team2 = predict_both_teams(games, team1, team2, model, scaler)
    
    if prob_team1 > prob_team2:
        winner = team1
    elif prob_team2 > prob_team1:
        winner = team2
    else:
        winner = np.random.choice([team1, team2])  # In case the probabilities are exactly equal

    return {
        "team1": team1,
        "team1_prob": prob_team1,
        "team2": team2,
        "team2_prob": prob_team2,
        "winner": winner
    }

In [23]:
model, scaler = train_model(games)
result = determine_winner(games, 'BOS', 'OKC', model, scaler)

print(f"Win probability for {result['team1']}: {result['team1_prob']:.1f}%")
print(f"Win probability for {result['team2']}: {result['team2_prob']:.1f}%")
print(f"Predicted winner: {result['winner']}")

Win probability for BOS: 13.0%
Win probability for OKC: 87.0%
Predicted winner: OKC


In [24]:
# 12/22 games
home_teams = ['SAC', 'TOR', 'NOP']
away_teams = ['IND', 'HOU', 'DEN']


### Function that goes through game slate for a given day

In [25]:
def predict_games(home_teams, away_teams, games, model, scaler):
    """
    Predicts the winners for a list of home and away teams.

    Parameters:
        home_teams (list): List of home teams.
        away_teams (list): List of away teams.
        games (DataFrame): The dataset containing game data.
        model: The trained model for predicting win probabilities.
        scaler: The scaler used to normalize the data.

    Returns:
        DataFrame: A DataFrame with the game predictions and predicted winners.
    """
    results = []

    for home, away in zip(home_teams, away_teams):
        prediction = determine_winner(games, home, away, model, scaler)
        
        results.append({
            "home_team": home,
            "home_prob": prediction["team1_prob"],
            "away_team": away,
            "away_prob": prediction["team2_prob"],
            "winner": prediction["winner"]
        })
    
    results_df = pd.DataFrame(results)
    return results_df

# Example
home_teams = ['OKC', 'LAL', 'PHI', 'ORL', 'CLE', 'CHA', 'NYK', 'MIA', 'ATL', 'MEM', 'CHI', 'DAL', 'GSW', 'DEN']
away_teams = ['WAS', 'DET', 'SAS', 'BOS', 'UTA', 'HOU', 'TOR', 'BKN', 'MIN', 'LAC', 'MIL', 'POR', 'IND', 'PHX']
model, scaler = train_model(games)

todays_games = predict_games(home_teams, away_teams, games, model, scaler)

In [26]:
# 12/23 games
todays_games

Unnamed: 0,home_team,home_prob,away_team,away_prob,winner
0,OKC,70.0,WAS,30.0,OKC
1,LAL,56.0,DET,44.0,LAL
2,PHI,45.0,SAS,55.0,SAS
3,ORL,14.0,BOS,86.0,BOS
4,CLE,27.0,UTA,73.0,UTA
5,CHA,36.0,HOU,64.0,HOU
6,NYK,25.0,TOR,75.0,TOR
7,MIA,70.0,BKN,30.0,MIA
8,ATL,44.0,MIN,56.0,MIN
9,MEM,69.0,LAC,31.0,MEM


### Predicting the outcome of the Christmas slate

In [27]:
# # Christmas games
# home_teams_christmas = ['NYK', 'DAL', 'BOS', 'GSW', 'PHX']
# away_teams_christmas = ['SAS', 'MIN', 'PHI', 'LAL', 'DEN']
# christmas = predict_games(home_teams_christmas, away_teams_christmas, games, model, scaler)
# christmas

## Today's Games

In [29]:
home_teams = ['OKC', 'LAL', 'IND', 'ORL', 'MIL', 'ATL', 'DEN', 'GSW', 'LAC']
away_teams = ['DAL', 'BOS', 'SAS', 'POR', 'MIA', 'TOR', 'SAC', 'CHI', 'WAS']
outcomes = predict_games(home_teams, away_teams, games, model, scaler)
outcomes

Unnamed: 0,home_team,home_prob,away_team,away_prob,winner
0,OKC,39.0,DAL,61.0,DAL
1,LAL,47.0,BOS,53.0,BOS
2,IND,80.0,SAS,20.0,IND
3,ORL,59.0,POR,41.0,ORL
4,MIL,66.0,MIA,34.0,MIL
5,ATL,64.0,TOR,36.0,ATL
6,DEN,46.0,SAC,54.0,SAC
7,GSW,34.0,CHI,66.0,CHI
8,LAC,57.0,WAS,43.0,LAC
