# This notebook contains code for predicting the Play-Off using the model selected in the `model_selection.ipynb` file

## Extra class for colourful output

In [1]:
class color:
    """Escape sequences for colouring and formatting printed text.

    Prefix a string with one of the attributes and append ``END`` to
    restore the default formatting.
    """

    # text colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # reset
    END = '\033[0m'

## Import data

The main dataset was downloaded from [Kaggle](https://www.kaggle.com/datasets/nathanlauga/nba-games/data). In addition, I added 2 extra files: one with the season schedule (`season_schedule.csv`) and one with the Play-Off teams for the given seasons (`play_off_teams.csv`).

In [2]:
import ast
from typing import Tuple, List, Dict, Optional

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

# Load the games table without the two columns with team IDs (TEAM_ID_home / TEAM_ID_away)
df_games = pd.read_csv('data/games.csv').drop(["TEAM_ID_home", "TEAM_ID_away"], axis=1)
# Keep games starting from season 2004. The explicit copy makes the filtered
# frame independent of the full table, so the assignment below is unambiguous
# (no SettingWithCopyWarning, no write into a temporary slice).
df_games = df_games.loc[df_games["SEASON"] >= 2004].copy()

scaler = MinMaxScaler()
cols_to_norm = ["HOME_TEAM_ID", "VISITOR_TEAM_ID"]
# Scale team IDs. Whole columns are replaced (instead of writing through .loc)
# so the scaled float values do not have to fit the dtype of the raw ID columns.
df_games[cols_to_norm] = scaler.fit_transform(df_games[cols_to_norm])
df_games

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-12-22,22200477,Final,0.103448,0.758621,2022,126.0,0.484,0.926,0.382,25.0,46.0,117.0,0.478,0.815,0.321,23.0,44.0,1
1,2022-12-22,22200478,Final,0.862069,0.931034,2022,120.0,0.488,0.952,0.457,16.0,40.0,112.0,0.561,0.765,0.333,20.0,37.0,1
2,2022-12-21,22200466,Final,0.068966,0.413793,2022,114.0,0.482,0.786,0.313,22.0,37.0,106.0,0.470,0.682,0.433,20.0,46.0,1
3,2022-12-21,22200467,Final,0.620690,0.965517,2022,113.0,0.441,0.909,0.297,27.0,49.0,93.0,0.392,0.735,0.261,15.0,46.0,1
4,2022-12-21,22200468,Final,0.000000,0.137931,2022,108.0,0.429,1.000,0.378,22.0,47.0,110.0,0.500,0.773,0.292,20.0,47.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26646,2014-10-06,11400007,Final,0.000000,0.103448,2014,93.0,0.419,0.821,0.421,24.0,50.0,87.0,0.366,0.643,0.375,17.0,43.0,1
26647,2014-10-06,11400004,Final,0.137931,0.931034,2014,81.0,0.338,0.719,0.381,18.0,40.0,85.0,0.411,0.636,0.267,17.0,47.0,0
26648,2014-10-06,11400005,Final,0.344828,0.206897,2014,98.0,0.448,0.682,0.500,29.0,45.0,95.0,0.387,0.659,0.500,19.0,43.0,1
26649,2014-10-05,11400002,Final,0.827586,0.724138,2014,99.0,0.440,0.771,0.333,21.0,30.0,94.0,0.469,0.725,0.385,18.0,45.0,1


#### Teams information

In [3]:
df_teams = pd.read_csv('data/teams.csv')
# Scale team IDs with a dedicated scaler: refitting the shared `scaler` here
# would silently discard its fit on the games table. The column is replaced as a
# whole so the scaled floats do not have to fit the dtype of the raw IDs.
df_teams[["TEAM_ID"]] = MinMaxScaler().fit_transform(df_teams[["TEAM_ID"]])
df_teams.head()

Unnamed: 0,LEAGUE_ID,TEAM_ID,MIN_YEAR,MAX_YEAR,ABBREVIATION,NICKNAME,YEARFOUNDED,CITY,ARENA,ARENACAPACITY,OWNER,GENERALMANAGER,HEADCOACH,DLEAGUEAFFILIATION
0,0,0.0,1949,2019,ATL,Hawks,1949,Atlanta,State Farm Arena,18729.0,Tony Ressler,Travis Schlenk,Lloyd Pierce,Erie Bayhawks
1,0,0.034483,1946,2019,BOS,Celtics,1946,Boston,TD Garden,18624.0,Wyc Grousbeck,Danny Ainge,Brad Stevens,Maine Red Claws
2,0,0.103448,2002,2019,NOP,Pelicans,2002,New Orleans,Smoothie King Center,,Tom Benson,Trajan Langdon,Alvin Gentry,No Affiliate
3,0,0.137931,1966,2019,CHI,Bulls,1966,Chicago,United Center,21711.0,Jerry Reinsdorf,Gar Forman,Jim Boylen,Windy City Bulls
4,0,0.172414,1980,2019,DAL,Mavericks,1980,Dallas,American Airlines Center,19200.0,Mark Cuban,Donnie Nelson,Rick Carlisle,Texas Legends


In [4]:
# Columns kept from the games table: the predictors followed by the target
useful_columns = [
    # team identifiers
    'HOME_TEAM_ID',
    'VISITOR_TEAM_ID',
    # statistics of the home team
    'FG_PCT_home',
    'FT_PCT_home',
    'FG3_PCT_home',
    'AST_home',
    'REB_home',
    # statistics of the visiting team
    'FG_PCT_away',
    'FT_PCT_away',
    'FG3_PCT_away',
    'AST_away',
    'REB_away',
    # target (must stay last: code elsewhere slices it off with [:-1])
    'HOME_TEAM_WINS',
]

#### Seasons dates

In [5]:
# Season calendar, one row per season:
#   SEASON: start year of the season, i.e. season 2004 is season 2004-2005
#   START:  beginning date of regular matches
#   END:    beginning date of play-off
season_schedule = pd.read_csv("data/season_schedule.csv")
season_schedule.head()

Unnamed: 0,SEASON,START,END
0,2004,2004-11-02,2005-04-23
1,2005,2005-11-01,2006-04-22
2,2006,2006-10-31,2007-04-21
3,2007,2007-10-30,2008-04-19
4,2008,2008-10-28,2009-04-18


### Train ML model for given season

In [6]:
def filter_season_games(season_year: int) -> pd.DataFrame:
    """Return all regular season games of the given season.

    A game belongs to the season when its ``GAME_DATE_EST`` lies in the
    half-open interval ``[START, END)`` taken from ``season_schedule``.

    Args:
        season_year: start year of the season (value of the ``SEASON`` column
            in ``season_schedule``).

    Returns:
        An independent copy of the matching rows of ``df_games`` restricted to
        ``useful_columns``, so callers may modify it freely.

    Raises:
        IndexError: if ``season_schedule`` has no row for ``season_year``.
    """
    # look the season up once instead of filtering the schedule twice
    schedule_row = season_schedule.loc[season_schedule["SEASON"] == season_year]
    if schedule_row.empty:
        raise IndexError(f"No entry for season {season_year} in season_schedule")
    season_start = schedule_row["START"].iloc[0]
    season_end = schedule_row["END"].iloc[0]

    # NOTE: both tables are read without date parsing, so this compares the
    # dates as strings; that is order-preserving for the YYYY-MM-DD format.
    in_regular_season = ((df_games["GAME_DATE_EST"] >= season_start)
                         & (df_games["GAME_DATE_EST"] < season_end))

    # .copy() detaches the result from df_games: callers assign into it
    return df_games.loc[in_regular_season, useful_columns].copy()

In [7]:
def prepare_model(season_year: int) -> Tuple[LogisticRegression, MinMaxScaler]:
    """Train a logistic regression on the regular season games of one season.

    Args:
        season_year: start year of the season to train on.

    Returns:
        A ``(model, scaler)`` tuple: the fitted ``LogisticRegression`` and the
        ``MinMaxScaler`` fitted on the ``cols_to_norm`` columns of that season.
        The same scaler has to be applied to any data passed to the model.
    """
    id_scaler = MinMaxScaler()
    # work on an explicit copy so the in-place scaling below never touches
    # (or warns about touching) the frame the games were filtered from
    regular_season = filter_season_games(season_year).copy()
    regular_season.loc[:, cols_to_norm] = id_scaler.fit_transform(regular_season.loc[:, cols_to_norm])

    # split data into predictors and target
    X = regular_season.drop(["HOME_TEAM_WINS"], axis=1)
    y = regular_season["HOME_TEAM_WINS"]

    # train model
    model = LogisticRegression(max_iter=1000)
    model.fit(X, y)

    return model, id_scaler


### Compute average data for each team for a given season

In [8]:
def compute_statistics(season_year: int=2018) -> Optional[pd.DataFrame]:
    """Compute the average regular season performance of every team.

    For each team in ``df_teams`` the home-side statistics of its home games
    and the away-side statistics of its away games are pooled and averaged.

    Args:
        season_year: start year of the season, expected within 2004..2022.

    Returns:
        A new DataFrame with the columns ``TEAM_ID``, ``ABBREVIATION``,
        ``FG_PCT``, ``FT_PCT``, ``FG3_PCT``, ``AST`` and ``REB`` (NaN averages
        for a team without games in that season), or ``None`` after printing a
        message when ``season_year`` is outside the supported range.
    """
    # the original check `2004 > season_year > 2022` could never be true
    if not 2004 <= season_year <= 2022:
        print("NO DATA FOR THIS SEASON")
        return None

    season_data = filter_season_games(season_year) # get games for given season

    # team ID and abbreviation from the teams table; copied so that df_teams
    # itself is never modified
    teams_data = df_teams[["TEAM_ID", "ABBREVIATION"]].copy()

    # columns to be extracted from season_data
    home_features = ['FG_PCT_home', 'FT_PCT_home',
                     'FG3_PCT_home', 'AST_home', 'REB_home']
    away_features = ['FG_PCT_away', 'FT_PCT_away',
                     'FG3_PCT_away', 'AST_away', 'REB_away']
    # "_home" and "_away" are both 5 characters long, so f[:-5] strips either suffix
    home_renaming = {f: f[:-5] for f in home_features}
    away_renaming = {f: f[:-5] for f in away_features}

    # collect one Series of averages per team, then build the table in one go
    team_averages = []
    for team_id in teams_data["TEAM_ID"]:
        # Get data when given team played at home
        home_data = season_data.loc[season_data["HOME_TEAM_ID"] == team_id, home_features]
        home_data = home_data.rename(columns=home_renaming)

        # Get data when given team played away
        away_data = season_data.loc[season_data["VISITOR_TEAM_ID"] == team_id, away_features]
        away_data = away_data.rename(columns=away_renaming)

        # Combine subtables and average each statistic
        team_averages.append(pd.concat([home_data, away_data]).mean())

    averages = pd.DataFrame(team_averages)
    averages.index = teams_data.index # align the averages with their teams

    return pd.concat([teams_data, averages], axis=1)

print("This is the table with average performance of each team in given season")
compute_statistics().head()

This is the table with average performance of each team in given season


Unnamed: 0,TEAM_ID,ABBREVIATION,FG_PCT,FT_PCT,FG3_PCT,AST,REB
0,0.0,ATL,0.45178,0.751817,0.350976,25.829268,46.097561
1,0.034483,BOS,0.465829,0.806378,0.363012,26.280488,44.54878
2,0.103448,NOP,0.474561,0.757463,0.344073,27.02439,47.292683
3,0.137931,CHI,0.454183,0.788293,0.349183,21.902439,42.890244
4,0.172414,DAL,0.447659,0.741524,0.341451,23.390244,45.317073


### Simulate one play-off series for 2 given teams

In [9]:
# True: the team that starts at home hosts the game, False: it plays away
home_visitor_order = [True, True, False, False, True, False, True]
scale = 1/4
# offsets randomly added to the average statistics of a team in every simulated game
random_percent_list = np.linspace(-0.3, 0.3, 100)
random_ast_reb_list = np.linspace(-10, 10, 100)


def _sample_game_features(team_stats: pd.DataFrame, side: str) -> Dict[str, float]:
    """Draw one noisy set of game statistics for a team.

    Args:
        team_stats: rows of the season statistics table for one team; only
            the first row is read.
        side: ``"home"`` or ``"away"``, used as column-name suffix.

    Returns:
        Mapping ``<STAT>_<side>`` -> sampled value for FG_PCT, FT_PCT, FG3_PCT,
        AST and REB.
    """
    features = {}
    for stat in ("FG_PCT", "FT_PCT", "FG3_PCT"):
        noisy = team_stats[stat].iloc[0] + np.random.choice(random_percent_list)
        # a shooting percentage has to stay within [0, 1]
        features[f"{stat}_{side}"] = min(max(noisy, 0), 1)
    for stat in ("AST", "REB"):
        features[f"{stat}_{side}"] = team_stats[stat].iloc[0] + np.random.choice(random_ast_reb_list)
    return features


def simulate_play_off_round(team_pair: List[str],
                            ml_model: LogisticRegression,
                            season_statistics: pd.DataFrame,
                            scaler: MinMaxScaler) -> Optional[str]:
    """Simulate one play-off series (first to 4 wins out of up to 7 games).

    Args:
        team_pair: abbreviations of the two teams; the list is not modified.
        ml_model: fitted model whose ``predict`` returns 1 when the home team wins.
        season_statistics: per-team averages with the columns ``ABBREVIATION``,
            ``TEAM_ID``, ``FG_PCT``, ``FT_PCT``, ``FG3_PCT``, ``AST``, ``REB``.
        scaler: scaler fitted together with ``ml_model``; it is applied to the
            ``cols_to_norm`` columns of the simulated games.

    Returns:
        Abbreviation of the team that reaches 4 wins first, or ``None`` (after
        printing a message) when both entries of ``team_pair`` are the same team.
    """
    # randomly set home and visitor teams; shuffle a copy so that the caller's
    # list keeps its order
    shuffled_pair = list(team_pair)
    np.random.shuffle(shuffled_pair)
    team1, team2 = shuffled_pair # team1 starts at home
    if team1 == team2:
        print("YOU ENTERED THE SAME TEAMS")
        return None

    # extract corresponding average performance for each team
    team1_data = season_statistics.loc[season_statistics["ABBREVIATION"] == team1, :]
    team2_data = season_statistics.loc[season_statistics["ABBREVIATION"] == team2, :]

    # build one row per game, switching home and visitor teams according to the
    # schedule and applying variability to the team statistics
    game_rows = []
    for team1_at_home in home_visitor_order:
        if team1_at_home:
            home_team, away_team = team1_data, team2_data
        else:
            home_team, away_team = team2_data, team1_data

        row = {"HOME_TEAM_ID": home_team["TEAM_ID"].iloc[0]}
        row.update(_sample_game_features(home_team, "home"))
        row["VISITOR_TEAM_ID"] = away_team["TEAM_ID"].iloc[0]
        row.update(_sample_game_features(away_team, "away"))
        game_rows.append(row)

    # all predictor columns in the order the model was trained with (target excluded)
    games = pd.DataFrame(game_rows, columns=useful_columns[:-1])

    games.loc[:, cols_to_norm] = scaler.transform(games.loc[:, cols_to_norm])

    # predict games results
    results = ml_model.predict(games)

    # iterate over predicted results and count number of victories for each team
    team1_wins, team2_wins = 0, 0
    for result, team1_at_home in zip(results, home_visitor_order):
        # team1 wins when the home team wins at team1's home,
        # or when the home team loses at team2's home
        if (result == 1) == team1_at_home:
            team1_wins += 1
        else:
            team2_wins += 1
        # if any of the teams has achieved 4 victories return its name
        if team1_wins == 4:
            return team1
        if team2_wins == 4:
            return team2
    

In [10]:
num_of_sim = 101

def compute_probability_to_win_round(team_pair: List[str],
                                     model: LogisticRegression,
                                     season_statistics: pd.DataFrame,
                                     season_to_simulate: int,
                                     scaler: MinMaxScaler,
                                     display_results=True) -> Dict[str, float]:
    """Estimate how often each team of a pair wins a play-off round.

    The round is simulated ``num_of_sim`` times with ``simulate_play_off_round``.

    Returns:
        Mapping team -> share of simulated rounds it won, ordered by that
        share in descending order (the first key is the likelier winner).
    """
    win_share = dict.fromkeys(team_pair, 0) # share of won rounds for each team

    # every simulated round contributes an equal fraction to its winner
    for _ in range(num_of_sim):
        round_winner = simulate_play_off_round(team_pair, model, season_statistics, scaler)
        win_share[round_winner] += 1 / num_of_sim

    # Print results if requested
    if display_results:
        print(f"\nSeason {season_to_simulate}-{season_to_simulate+1}")
        for team, share in win_share.items():
            print(f"{team} won {share*100:.1f}% of rounds")

    # Rank teams by their share of victories, so the winner comes first
    ranking = list(win_share.items())
    ranking.sort(key=lambda entry: entry[1], reverse=True)
    return dict(ranking)

# Simulate single play-off round for 2 teams in 2 different seasons
year_list = [2015, 2019]
team_pair = ["LAL", "ATL"]
for season_to_simulate in year_list:
    model, scaler = prepare_model(season_to_simulate)
    compute_probability_to_win_round(team_pair=team_pair,
                                     model=model,
                                     season_statistics=compute_statistics(season_to_simulate),
                                     season_to_simulate=season_to_simulate,
                                     scaler=scaler)


Season 2015-2016
LAL won 7.9% of rounds
ATL won 92.1% of rounds

Season 2019-2020
ATL won 26.7% of rounds
LAL won 73.3% of rounds


#### We can see that different teams were stronger in different seasons. Let's compare this with the winning percentages from the real data

In [11]:
df_standing = pd.read_csv("data/ranking.csv") # read table with teams standing
# keep only specific columns; copied so the frame can be modified safely
df_standing = df_standing.loc[:, ["TEAM_ID", "STANDINGSDATE", "TEAM", "W_PCT"]].copy()
# Scale team IDs with a fresh scaler instead of reusing whatever object the
# name `scaler` happens to be bound to after the cells above.
# NOTE(review): the merge below matches scaled IDs, which assumes this table
# and the teams table span the same raw TEAM_ID range - confirm.
df_standing[["TEAM_ID"]] = MinMaxScaler().fit_transform(df_standing[["TEAM_ID"]])
df_standing = pd.merge(df_standing, df_teams.loc[:, ["TEAM_ID", "ABBREVIATION"]], on="TEAM_ID") # add column with corresponding team abbreviation

print("Winning percentage")
# get team winning percentage for each season
for year in year_list:
    print(f"\nSeason {year}-{year+1}")
    # standings on the END date of the season schedule (the date the play-off begins)
    season_end = season_schedule.loc[season_schedule["SEASON"] == year]["END"].iloc[0]
    cur_season_data = df_standing.loc[df_standing["STANDINGSDATE"] == season_end]

    for t in team_pair:
        w_pct = cur_season_data.loc[cur_season_data["ABBREVIATION"] == t]["W_PCT"].iloc[0]
        print(f"{t} won {w_pct*100:.1f}% of matches")


Winning percentage

Season 2015-2016
ATL won 58.5% of matches
LAL won 20.7% of matches

Season 2019-2020
ATL won 29.9% of matches
LAL won 74.3% of matches


#### We can see that in season 2015-2016 `LAL` won only 20.7% of their matches while `ATL` won 58.5% of theirs, so `ATL` was probably the stronger team that season, which is consistent with our prediction above. In season 2019-2020 the winning percentages are the other way around, which means `LAL` was the stronger team that season, again in agreement with our prediction.

### Now let us simulate the entire play-off series for desired season

In [12]:
# Table with teams participating in play-off in each season:
#   SEASON:     start year of the season, i.e. season 2004 is season 2004-2005
#   EAST_TEAMS: list of east teams participating in play-off
#   WEST_TEAMS: list of west teams participating in play-off
#   WINNER:     team that became champion that season
#   RUNNER_UP:  team that was in final round that season
play_off_teams = pd.read_csv('data/play_off_teams.csv')
play_off_teams.head()

Unnamed: 0,SEASON,EAST_TEAMS,WEST_TEAMS,WINNER,RUNNER_UP
0,2004,"['DET', 'PHI', 'IND', 'BOS', 'MIA', 'BKN', 'WA...","['DAL', 'HOU', 'PHX', 'MEM', 'SAS', 'DEN', 'OK...",SAS,DET
1,2005,"['CLE', 'WAS', 'DET', 'MIL', 'MIA', 'CHI', 'BK...","['DAL', 'MEM', 'SAS', 'SAC', 'LAC', 'DEN', 'PH...",MIA,DAL
2,2006,"['CHI', 'MIA', 'DET', 'ORL', 'CLE', 'WAS', 'BK...","['GSW', 'DAL', 'UTA', 'HOU', 'PHX', 'LAL', 'SA...",SAS,CLE
3,2007,"['BOS', 'ATL', 'CLE', 'WAS', 'DET', 'PHI', 'OR...","['LAL', 'DEN', 'UTA', 'HOU', 'CHA', 'DAL', 'SA...",BOS,LAL
4,2008,"['ATL', 'MIA', 'CLE', 'DET', 'BOS', 'CHI', 'OR...","['DAL', 'SAS', 'DEN', 'CHA', 'HOU', 'POR', 'LA...",LAL,ORL


In [13]:
def _check_bracket_size(teams: List[str], conference: str) -> None:
    """Raise ValueError unless the number of teams is a power of two (1, 2, 4, 8, ...)."""
    n_teams = len(teams)
    # n & (n - 1) == 0 holds exactly for powers of two; anything else would
    # leave a team without an opponent in some round
    if n_teams < 1 or n_teams & (n_teams - 1):
        raise ValueError(f"{conference} bracket needs 1, 2, 4, 8, ... teams, got {n_teams}")


def _simulate_conference(teams: List[str],
                         conference: str,
                         model: LogisticRegression,
                         season_statistics: pd.DataFrame,
                         season: int,
                         scaler: MinMaxScaler,
                         display_all_results: bool) -> str:
    """Play one conference bracket round by round and return its last remaining team."""
    if display_all_results:
        print(f"\n{conference}:")

    # iterate over teams in the conference until one team remains
    while len(teams) > 1:
        # neighbouring teams in the list meet each other: (0, 1), (2, 3), ...
        round_results = [compute_probability_to_win_round([teams[i], teams[i+1]],
                                                          model,
                                                          season_statistics,
                                                          season,
                                                          scaler,
                                                          display_results=False)
                         for i in range(0, len(teams), 2)]
        # results are ordered with the winner first, so keep the first key of each
        teams = [next(iter(result)) for result in round_results]
        # Display results if requested
        if display_all_results:
            for result in round_results:
                winner, loser = list(result.items())[:2]
                print(f"{color.GREEN}{winner[0]}{color.END}({winner[1]:.2f}) defeated {color.RED}{loser[0]}{color.END}({loser[1]:.2f})")
            print()

    return teams[0]


def simulate_entire_play_off(season: int,
                             east_teams: List[str],
                             west_teams: List[str],
                             real_winner: str,
                             runner_up: str,
                             display_min_results: bool=False,
                             display_all_results: bool=False,
                             return_data: bool=True) -> Optional[Tuple[Tuple[str, float], Tuple[str, float]]]:
    """Simulate the entire play-off of one season.

    Args:
        season: start year of the season; the model is trained on it.
        east_teams: qualified east teams, ordered so that neighbours
            (0-1, 2-3, ...) meet in the first round.
        west_teams: qualified west teams, ordered the same way.
        real_winner: actual champion, only used for printing.
        runner_up: actual losing finalist, only used for printing.
        display_min_results: print the header and the result of the finals.
        display_all_results: additionally print the result of every round.
        return_data: whether to return the predicted finalists.

    Returns:
        ``(winner, loser)`` of the simulated finals, each a ``(team, share)``
        pair where share is the fraction of simulated final rounds the team
        won, or ``None`` when ``return_data`` is False.

    Raises:
        ValueError: if a conference list does not contain 1, 2, 4, 8, ... teams.
    """
    # fail early, before any model is trained
    _check_bracket_size(east_teams, "EAST")
    _check_bracket_size(west_teams, "WEST")

    best_model, scaler = prepare_model(season) # train model on data for given season
    season_statistics = compute_statistics(season) # compute average performance of each team in given season

    # Print results if requested
    if display_min_results or display_all_results:
        print(f"Simulating {season}-{season+1} Play-Off")

    east_champion = _simulate_conference(east_teams, "EAST", best_model, season_statistics,
                                         season, scaler, display_all_results)
    west_champion = _simulate_conference(west_teams, "WEST", best_model, season_statistics,
                                         season, scaler, display_all_results)

    # Simulate final round between winner of east and west conferences
    final_result = compute_probability_to_win_round([east_champion, west_champion], best_model,
                                                    season_statistics, season, scaler,
                                                    display_results=False)
    # Store winner team and other finalist
    winner, loser = list(final_result.items())[:2]
    # Display results if requested
    if display_min_results or display_all_results:
        print(f"According to the prediction {color.GREEN}{winner[0]}{color.END}({winner[1]:.2f}) defeated {color.RED}{loser[0]}{color.END}({loser[1]:.2f}) in finals")
        print(f"While in real life {color.GREEN}{real_winner}{color.END} defeated {color.RED}{runner_up}{color.END} in finals\n")

    if return_data:
        return winner, loser
    return None

In [14]:
real_winners, real_losers = [], [] # lists to store actual winners and losers in final round
pred_winners, pred_losers = [], [] # lists to store predicted winners and losers in final round
incorrect_seasons = [] # list for seasons with wrong winner predictions
# Iterate over each season and simulate entire play-off
# (total= lets tqdm show a real progress bar instead of a bare counter)
for _, season_row in tqdm(play_off_teams.iterrows(), total=len(play_off_teams)):
    # get data from table by column name, independent of the column order
    season_year = int(season_row["SEASON"])
    real_winner, real_loser = season_row["WINNER"], season_row["RUNNER_UP"]

    # the team lists are stored as text like "['DET', 'PHI', ...]";
    # literal_eval parses them without depending on the exact quoting/spacing
    east_teams = ast.literal_eval(season_row["EAST_TEAMS"])
    west_teams = ast.literal_eval(season_row["WEST_TEAMS"])

    # Save real winner and loser for current season
    real_winners.append(real_winner)
    real_losers.append(real_loser)

    # Simulate entire play-off and store predicted winner and loser
    # (each of them is a (team, share) pair, hence the [0])
    pred_winner, pred_loser = simulate_entire_play_off(season_year, east_teams, west_teams, real_winner, real_loser)
    pred_winners.append(pred_winner[0])
    pred_losers.append(pred_loser[0])

    # remember seasons whose champion was predicted incorrectly
    if pred_winner[0] != real_winner:
        incorrect_seasons.append(season_year)

18it [01:54,  6.34s/it]


#### Let us check how many correct results were given

In [15]:
# Share of seasons in which the predicted champion matches the real one
winners_accuracy = accuracy_score(real_winners, pred_winners)
print(f"Winners accuracy: {winners_accuracy}")
# Share of seasons in which the predicted losing finalist matches the real one
losers_accuracy = accuracy_score(real_losers, pred_losers)
print(f"Losers accuracy: {losers_accuracy}")

Winners accuracy: 0.2777777777777778
Losers accuracy: 0.05555555555555555


## Conclusion:
We can see that fewer than a third of the champions were predicted correctly (accuracy $\approx 0.28$, *i.e.* 5 of 18), while the other finalists were predicted even worse (1 of 18). Nevertheless, if we think about it, there are many variables that can affect a match outcome that we didn't consider. For example, some players may be injured and unable to play, or a lead player may be disqualified from the match because of a high number of fouls, *etc.* This means that the designed model should work correctly in an ideal scenario, but there are more parameters to be taken into account. Therefore, the model can be extended: since the downloaded dataset provides information about each player in each game, we can add parameters like injuries or the number of fouls to the model, which should potentially improve performance.