In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn import svm # for Discriminator
from sklearn.model_selection import train_test_split # for train-test split 
from sklearn.preprocessing import StandardScaler # for feature scaling
from sklearn.model_selection import GridSearchCV # for fine-tuning
from sklearn.metrics import make_scorer, balanced_accuracy_score # for evaluation
from sklearn.pipeline import make_pipeline # for prediction

In [None]:
# for Generator
from scipy import stats # for sampling
from fitter import Fitter # for fitting the best distribution
import copy # for copying nested dictionaries

In [None]:
import matplotlib.pyplot as plt  # for visualization 
import seaborn as sns  # for coloring 

# Global look of every figure in this notebook: dark theme, wide default canvas.
plt.style.use('dark_background')
from pylab import rcParams
rcParams['figure.figsize'] = (10, 5)  # (width, height) in inches

# Predicting NBA playoff results with simulation

## Background:

1. NBA playoff is a best-of-seven elimination tournament.
2. There are 16 competing teams; 8 initial pairs.
3. Each pair plays up to 7 games; the first team to win 4 games advances.
4. A total of 4 rounds are required to win the championship 🏆.

## Our Goal: 

Our goal is to predict the probability of winning for each team and at each round of the playoff, in order to see who may win the championship.

## Workflow:

1. Data cleaning
2. Fit a win/loss Discriminator(D)
3. Fit a feature Generator(G)
4. Run n_simulations with both D and G according to the rules of playoff
5. Visualization

# 1. Data Cleaning

In [None]:
# Load the games table of the Kaggle "nba-games" dataset and preview its first rows.
df = pd.read_csv("/kaggle/input/nba-games/games.csv")
df.head() 

In [None]:
# Order the games chronologically and keep only 2004 onwards
# (rows before 2004 contain NaN); dates are compared as ISO-formatted strings.
df = (
    df.sort_values(by='GAME_DATE_EST')
      .loc[lambda games: games['GAME_DATE_EST'] >= "2004-01-01"]
      .reset_index(drop=True)
)
# check null: True means at least one missing value is still present
df.isnull().values.any()

In [None]:
# replace Team ID with Names
# teams.csv is the lookup table; its TEAM_ID and NICKNAME columns are the ones used in the next cell
df_names = pd.read_csv('/kaggle/input/nba-games/teams.csv')
df_names.head()

In [None]:
# Two columns hold team IDs: 'HOME_TEAM_ID' and 'VISITOR_TEAM_ID'.
# Both are replaced by the team nickname.

df_names = df_names[['TEAM_ID', 'NICKNAME']]

# TEAM_ID -> NICKNAME lookup. A duplicated ID would make the translation
# ambiguous, so fail loudly instead of silently picking one.
assert df_names['TEAM_ID'].is_unique, "duplicate TEAM_ID in teams.csv"
id_to_nickname = df_names.set_index('TEAM_ID')['NICKNAME']

for id_column in ('HOME_TEAM_ID', 'VISITOR_TEAM_ID'):
    # Only translate numeric IDs: when this cell is re-run the column already
    # holds nicknames and mapping it again would turn every value into NaN.
    if pd.api.types.is_numeric_dtype(df[id_column]):
        # .map aligns on df's own index, so the result cannot shift rows
        # (unknown IDs become NaN, exactly like the former left merge)
        df[id_column] = df[id_column].map(id_to_nickname)

In [None]:
# final dataframe: HOME_TEAM_ID / VISITOR_TEAM_ID now hold team nicknames instead of numeric IDs
df.head()

# 2. Fitting an SVM win/loss discriminator

In [None]:
# The goal is to predict the 2019-2020 NBA playoffs, which are played from 2020-08 on,
# so every game dated 2020-08-01 or later is held out of the data.
before_playoffs = df['GAME_DATE_EST'] < '2020-08-01'
df = df[before_playoffs].reset_index(drop=True)

In [None]:
# every column name available in the games table
feature_list = df.columns.tolist()
feature_list

In [None]:
# ✍ There are two ways to predict the win/loss of a game:
#
# 1. Use a single feature (points): the team with more points wins.
# 2. Use features other than points and let a classifier predict the
#    outcome from them.
#
# This notebook uses option (2) because it offers a better range of
# uncertainty for simulation.

box_score_stats = ['FG_PCT', 'FT_PCT', 'FG3_PCT', 'AST', 'REB']

# the five home columns first, followed by the five matching away columns
selected_features = [
    '{}_{}'.format(stat, side)
    for side in ('home', 'away')
    for stat in box_score_stats
]

# check the features we selected
X = df[selected_features]
X.head()

In [None]:
# check the targets
# HOME_TEAM_WINS is the label the Discriminator learns
# (presumably 1 when the home team won and 0 otherwise; confirm against the dataset)
y = df['HOME_TEAM_WINS']
y.head()

In [None]:
# scikit-learn is fed plain numpy arrays: features and targets are converted together
X, y = X.to_numpy(), y.to_numpy()

## Fitting SVM 

In [None]:
# Hold out 30% of the games for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"X shape {X_train.shape} y shape {y_train.shape}")

In [None]:
# Standardize the training features. The scaler learns its statistics from the
# training split only and is reused later on.

scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)

In [None]:
%%time 

# train SVM

clf = svm.SVC(kernel='linear') # initialize a model
clf.fit(X_train, y_train) # fit(train) it with the training data and targets

# check test score
# The model was fitted on standardized features, so the held-out rows must go
# through the same, already fitted scaler before prediction. X_test itself is
# left untouched, so re-running this cell never scales it twice.
y_pred = clf.predict(scaler.transform(X_test))
print('balanced accuracy score:', balanced_accuracy_score(y_test, y_pred))

In [None]:
%%time 

# fine-tuning hyperparameters

scoring = make_scorer(balanced_accuracy_score)
# Only C is searched. gamma is the coefficient of the 'rbf', 'poly' and
# 'sigmoid' kernels and has no effect on a linear kernel, so listing three
# gamma values only made the search fit every C three times for identical models.
param_grid = {'C': [0.1, 1, 10]}

grid = GridSearchCV(svm.SVC(kernel='linear'), param_grid, scoring = scoring, refit=True, verbose=2)
grid.fit(X_train, y_train)

In [None]:
# print the best model's hyperparameters
# (GridSearchCV was created with refit=True, so best_estimator_ is a fitted model)
Dis = grid.best_estimator_
print(Dis) 

# 3. Fitting a Generator

In [None]:
# As before, the 2019-2020 playoff games stay held out for the real test.
# More data generally helps fitting, but for a time-series problem priority goes to
# the most recent games because they best reflect a team's current ability.
# Since the target is the 2019-2020 playoff, only that regular season (started Oct 2019) is fitted.

df_ = df.loc[df['GAME_DATE_EST'] > '2019-10-01'].reset_index(drop=True)
df_.head()

In [None]:
# Candidate distributions (named as in scipy.stats) that are tried when fitting each feature.
# With few data points the empirical distribution is less "normal", so a wider
# range of candidate shapes is offered.

selected_distributions = [
    'norm','t', 'f', 'chi', 'cosine', 'alpha', 
    'beta', 'gamma', 'dgamma', 'dweibull',
    'maxwell', 'pareto', 'fisk']


In [None]:
unique_teams = df['HOME_TEAM_ID'].unique() # every team that appears as a host

# Whether a team hosted or visited is ignored here: its own five statistics
# are pooled over all of its games in df_.
home_columns, away_columns = selected_features[:5], selected_features[5:]
away_to_home = dict(zip(away_columns, home_columns))

# team name -> 2-D array with one row per game and one column per statistic
all_team_sim_data = {}

for team_name in unique_teams:

    # games hosted by the team: its stats are in the first five (home) columns
    as_host = df_.loc[df_['HOME_TEAM_ID'] == team_name, home_columns]
    # games visited by the team: its stats are in the last five (away) columns,
    # relabelled so that both halves share the same column names
    as_guest = df_.loc[df_['VISITOR_TEAM_ID'] == team_name, away_columns].rename(columns=away_to_home)

    # hosted games first, then visited games, as a numpy array
    all_team_sim_data[team_name] = pd.concat([as_host, as_guest], axis = 0).to_numpy()

In [None]:
%%time

# data format:
#   team name => list with one entry per feature => dictionary holding the
#   name of the best-fitting distribution and its parameters
#   e.g.,
#   megadata = {
      #'Timberwolves': [{'beta': (0.23, 0.3, 0.3, 0.4)}, {'norm': (0.23, 0.3,)}, ..],
      #'Warriors':[{}, {},...]
      #  }

megadata = {} # everything the Generator will rely on
for team_name in unique_teams:

    team_samples = all_team_sim_data[team_name]
    fitted_features = []

    # one fit per feature column; each team has 5 features
    for column in range(5):
        fitter = Fitter(team_samples[:, column])
        fitter.distributions = selected_distributions # restrict the search to the selected distributions (faster)
        fitter.fit()
        # keep only the candidate with the smallest sum of squared errors
        fitted_features.append(fitter.get_best(method='sumsquare_error'))

    megadata[team_name] = fitted_features

print('Features for all teams have been fitted!')

# 4. Simulation

In [None]:

# Fitted parameters the Generator samples from. megadata is nested
# (dict -> list -> dict), so a deep copy is taken: a shallow .copy() would
# still share the inner lists and dictionaries with megadata.
DATA = copy.deepcopy(megadata)

# distribution name -> scipy sampling function (<dist>.rvs).
# Derived from selected_distributions so that every distribution the fitting
# step may return is guaranteed to have a sampler; a hand-written duplicate
# of that list can drift and fail with a KeyError at simulation time.
GEN = {name: getattr(stats, name).rvs for name in selected_distributions}

In [None]:
# feature scaler + fine-tuned SVM
# Chaining the two lets DIS.predict() take raw (unscaled) feature rows; both
# steps were fitted in earlier cells and the pipeline is only used for prediction.
DIS = make_pipeline(scaler, Dis)

In [None]:
class Game:

    '''
    A game between two teams:

    - feature values are sampled by the Generator (module-level DATA and GEN)
    - win/loss is predicted by the Discriminator (module-level DIS)
    '''

    def __init__(self, random_state = None):

        '''
        random_state: None (default) draws fresh samples on every call, which
            is what the simulations need. An int seed, or a numpy
            RandomState/Generator, makes the successive draws of this
            instance reproducible.
        '''

        self.random_state = random_state # keep this to None for making simulations

        # An int seed is converted ONCE into a shared RandomState. Passing the
        # raw int to every rvs() call would restart the random stream each
        # time, so all features of all games would come from the same draws.
        if random_state is None or isinstance(
                random_state, (np.random.RandomState, np.random.Generator)):
            self._rng = random_state
        else:
            self._rng = np.random.RandomState(random_state)

    def predict(self, team1, team2, num_games = 1):

        '''
        Predict the win or loss of n game(s) played by two teams.

        team1, team2: keys of DATA. team1's sampled features fill the first
            columns and team2's the last ones, i.e. team1 takes the "home"
            position of the Discriminator's inputs.
        num_games: number of games to sample, at least 1.

        Returns a list with one Discriminator prediction per game.
        '''

        assert num_games >= 1, "at least one game must be played"

        rng = getattr(self, '_rng', None)
        # one array of num_games samples per feature: team1's features, then team2's
        features = [
            self.sampling(feature_paras, size = num_games, random_state = rng)
            for feature_paras in list(DATA[team1]) + list(DATA[team2])
        ]

        # transpose to one row per game, one column per feature
        features = np.array(features).T
        win_loss = DIS.predict(features)

        return list(win_loss) # a list of win/loss from num_games

    def sampling(self, dic, size = 1, random_state = None):

        '''
        Generate feature values used for making win/loss prediction.

        dic: single-entry dictionary {distribution name: parameters}. The
            parameters may be a positional sequence, e.g. (loc, scale), or a
            dictionary of named parameters, e.g. {'loc': .., 'scale': ..}
            (newer fitter releases return the latter from get_best()).
        size: number of values to draw.
        random_state: forwarded to the scipy sampler.

        Returns whatever the sampler returns for the requested size.
        '''

        dis_name, paras = list(dic.items())[0] # distribution type and its parameters
        draw = GEN[dis_name]

        if isinstance(paras, dict):
            return draw(**paras, size = size, random_state = random_state)
        return draw(*paras, size = size, random_state = random_state)

In [None]:
class FinalTournament(Game):

    '''
    Best-of-7 elimination, 16 teams, 4 rounds in total to win championship.

    Attributes:
    - rounds: {round number: {team: number of series won in that round}},
      accumulated over all simulated playoffs
    - rounds_probs: same layout as rounds, with each count divided by the
      total number of series won in that round (filled by _compute_probs)
    '''

    def __init__(self, n_games_per_group = 7, winning_threshold = 4, random_state = None):

        '''
        n_games_per_group: number of games simulated for each pair of teams
        winning_threshold: wins the first team of a pair needs to advance
        random_state: passed on to Game.__init__
        '''

        self.n_games_per_group  = n_games_per_group
        self.winning_threshold = winning_threshold
        self.team_list = None
        self.rounds = {} # keep track the number of times a team wins at each round
        self.rounds_probs = {} # defined up front so reading it before a simulation does not fail
        super().__init__(random_state)


    def simulate(self, group_list, n_simulation = 1, probs = True):

        '''
        Simulate the entire playoff n times and record the accumulated wins.

        group_list: first-round pairs, each a (team1, team2) sequence
        n_simulation: number of complete playoffs to play
        probs: when True, rounds_probs is recomputed after the last playoff
        '''

        # start from a clean slate and register every competing team
        self.rounds = {}
        self.team_list = [i[0] for i in group_list] + [i[1] for i in group_list]

        for _ in range(n_simulation):
            self.one_time_simu(group_list)
        if probs:
            self.rounds_probs = self._compute_probs()


    def one_time_simu(self, group_list, verbose = False, probs = False):

        '''
        Simulate the entire playoff once and add its winners to self.rounds.

        Returns the champion, i.e. the winner of the last round.
        '''

        # update the list of teams if haven't done so
        if self.team_list is None:
            self.team_list = [i[0] for i in group_list] + [i[1] for i in group_list]

        round_number = 0
        while True:
            all_group_winners, group_list = self.play_round(group_list)

            # first visit of this round: every known team starts with 0 wins
            if round_number not in self.rounds:
                self.rounds[round_number] = {team: 0 for team in self.team_list}
            round_stats = self.rounds[round_number]

            # if a team wins, record + 1; winners missing from team_list are
            # deliberately left out of the statistics
            for winner in all_group_winners:
                if winner in round_stats:
                    round_stats[winner] += 1

            if verbose:
                print('{} round played'.format(round_number))
            if probs:
                self.rounds_probs = self._compute_probs()

            # play_round returns the champion instead of a list of pairs once the final has been played
            if not isinstance(group_list, list):
                return group_list
            round_number += 1


    def play_round(self, group_list):

        '''
        Play a round of games based on a list of paired teams.

        Returns (winners, next_pairs). Adjacent winners are paired for the
        next round (1st with 2nd, 3rd with 4th, ...). When the round consists
        of a single pair, the second element is the winner itself.

        Raises ValueError for an empty round or when several winners cannot
        be paired because their number is odd.
        '''

        # play each group and get the group winner
        all_group_winners = [
            self.play_n_games(group[0], group[1]) for group in group_list
        ]

        if not all_group_winners:
            raise ValueError("group_list must contain at least one pair of teams")
        if len(all_group_winners) == 1:
            return all_group_winners, all_group_winners[0]
        if len(all_group_winners) % 2:
            raise ValueError(
                "cannot pair {} winners for the next round".format(len(all_group_winners)))

        new_group_list = [
            [all_group_winners[index], all_group_winners[index + 1]]
            for index in range(0, len(all_group_winners), 2)
        ]
        return all_group_winners, new_group_list


    def play_n_games(self, team1, team2):

        '''
        Simulate data, and then use our classifier to predict win/loss.

        All n_games_per_group games are simulated at once with team1 in the
        home position; team1 advances when it wins at least winning_threshold
        of them, otherwise team2 does.
        '''

        # self.predict (instead of a throw-away Game()) keeps this instance's random_state in play
        result = self.predict(team1, team2, self.n_games_per_group)

        # Counting the wins over all games is sufficient: wins are never
        # negative, so a sweep of the first games already satisfies this test.
        if sum(result) >= self.winning_threshold:
            return team1 # home team wins
        return team2 # visitor team wins


    def _compute_probs(self):

        '''prob = wins for a team / sum of wins for all teams at a particular round'''

        rounds_probs = copy.deepcopy(self.rounds) # self.rounds keeps the raw counts
        for round_stats in rounds_probs.values():
            total = np.sum(list(round_stats.values()))
            for team, wins in round_stats.items():
                # a round without any recorded win yields 0.0 instead of dividing by zero
                round_stats[team] = wins / total if total else 0.0

        return rounds_probs

In [None]:
# the below roster is based on 2019-2020 NBA playoffs
# https://en.wikipedia.org/wiki/2019%E2%80%9320_NBA_season
#
# The order matters: FinalTournament.play_round pairs the winners of adjacent
# groups (A with B, C with D, ...), so this list defines the whole bracket.
# The first team of each pair is passed as team1, which play_n_games treats as the home team.

group_list = [
     # Eastern Conference
     ('Bucks', 'Magic'),  # group A 
     ('Pacers', 'Heat'), # group B
    
     ('Celtics', '76ers'), # group C
     ('Raptors', 'Nets'), # group D
    
     # Western Conference
     ('Lakers','Trail Blazers'),  # group E
     ('Rockets','Thunder'), # group F
    
     ('Nuggets', 'Jazz'), # group G
     ('Clippers', 'Mavericks')] # group H

In [None]:
%%time

# initiate a playoff with the default format (7 games per pair, 4 wins to advance)
playoff = FinalTournament()
# simulate the playoff 5,000 times; the win counts end up in playoff.rounds
# and the derived probabilities in playoff.rounds_probs
playoff.simulate(group_list, n_simulation = 5000)

In [None]:
# see the winning probabilities from 5,000 playoffs
# layout: {round number: {team: share of that round's series wins}}
playoff.rounds_probs

# 5. Visualization & Analysis

In [None]:
def plotting(rounds_data):

    '''
    Draw a stacked-area chart with one band per team across the rounds.

    rounds_data: dictionary {round number: {team name: value}}, such as
        playoff.rounds (win counts) or playoff.rounds_probs (probabilities)

    Returns the matplotlib figure that was drawn.
    '''

    round_numbers = list(rounds_data.keys()) # x axis, also used as tick positions
    per_round = list(rounds_data.values())
    team_names = list(per_round[0].keys()) # legend labels, taken from the first round

    # 2-D array of (n_teams, n_rounds): one row per team, one column per round
    values = np.array([list(one_round.values()) for one_round in per_round]).T

    # at least 16 different colors are needed, one for each team:
    # two 10-color palettes are joined
    palette = (sns.color_palette('tab10', n_colors = 10)
               + sns.color_palette("pastel", n_colors = 10))

    fig = plt.figure()
    plt.stackplot(round_numbers, values, labels = team_names, colors = palette)
    plt.legend(bbox_to_anchor=(1.1, 1.1), loc = 'upper left', fontsize=13)
    plt.xticks(round_numbers, fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlabel('Round Number', fontsize = 15)
    plt.title('Winning probabilities by all Teams & Rounds', pad = 20, fontsize = 24)
    plt.tight_layout()
    plt.show()

    return fig

In [None]:
# sanity check on the raw win counts: a team's wins should shrink from one round to the next
fig = plotting(playoff.rounds)

In [None]:
# plot the results: probabilities of winning for all teams at each round
# (within a round, the probabilities of all teams add up to 1)
fig = plotting(playoff.rounds_probs)

### ☝️ Observations

1. Based on overall winning probabilities, the top 3 teams are: **Bucks > Lakers > Nuggets**.

2. Based on winning probabilities in the final round, the top 3 teams are: **Bucks > Lakers > Heat**.

3. **Bucks** has a much higher overall chance of winning, if it can get through the early rounds.

4. If we were the **Lakers**, we would avoid the **Bucks**; if we were the **Heat** or **Nuggets**, we would avoid both the **Bucks** and **Lakers**.

### 🌗 Results Comparison (actual 2019-2020 NBA playoff results)

In the actual 2019-2020 playoffs, the **Bucks** lost to the **Toronto Raptors** narrowly (3-4) in the 1st round. The **Lakers**, as we predicted, beat both the **Nuggets** and the **Heat** in the last two rounds and became the champions. These results are mostly consistent with our model's predictions given the above observations. And since our model has not seen the 2019-2020 playoff data, it didn't "cheat" to get these results.

### 🤔 Debug: Why were the Raptors able to beat the Bucks?

From our observations, although the Bucks' chances of winning in the early rounds are lower than in the later rounds, they are still much better than the Raptors'. So how did the Bucks lose? One hypothesis is that a strong team (e.g., the Bucks) may become weaker when facing a particular type of opponent (e.g., the Raptors). To confirm this hypothesis, we can check the performance of the Bucks when up against the Raptors in our data.

In [None]:
# extract games played by Bucks and Raptors, split by who hosted

bucks_hosting = (df['HOME_TEAM_ID'] == 'Bucks') & (df['VISITOR_TEAM_ID'] == 'Raptors')
raptors_hosting = (df['HOME_TEAM_ID'] == 'Raptors') & (df['VISITOR_TEAM_ID'] == 'Bucks')

df_br_1 = df[bucks_hosting].reset_index(drop=True)   # Bucks at home
df_br_2 = df[raptors_hosting].reset_index(drop=True) # Bucks away


In [None]:
# Head-to-head record of the Bucks against the Raptors. When the Bucks are the
# visitors, their wins are the games the home team did NOT win.
home_games, away_games = len(df_br_1), len(df_br_2)

print('Bucks won {} out of {} when being the home team'.format(
    df_br_1['HOME_TEAM_WINS'].sum(), home_games))
print('Bucks won {} out of {} when being the away team'.format(
    away_games - df_br_2['HOME_TEAM_WINS'].sum(), away_games))

In [None]:
# Bucks against Raptors: overall share of games won by the Bucks.
# The counts are computed from the data instead of being copied by hand from
# the previous cell's output, so the figure stays correct when the data changes.
bucks_home_wins = df_br_1['HOME_TEAM_WINS'].sum()
bucks_away_wins = len(df_br_2) - df_br_2['HOME_TEAM_WINS'].sum() # games the hosting Raptors did not win
games_played = len(df_br_1) + len(df_br_2)

print("Bucks' chance of winning against Raptors: ", (bucks_home_wins + bucks_away_wins) / games_played)

☝️ The above explains why the Bucks lost: they have slightly less than an even chance of winning against the Raptors. In sum, when we train the win/loss classifier, we should also take into account which teams are competing!

## 🙂 Improvements

1. **Use more and better features**: Powerful simulations usually have more variables to cover more scenarios of the playoff. Using player level statistics is one way to go.

2. **Use better win/loss model**: Our trained SVM Discriminator only has about 70%+ accuracy, it is sufficient but nowhere near perfect. Improvements can be achieved by adding more data features or using a more powerful model such as Neural Network or ensemble model.

3. **Do better at distribution fitting**: In this project, we used only a handful of distributions to speed up the fitting. To reduce the fitting errors, we can use more complex distributions. The lower the fitting error, the more realistic the samples we can generate.

4. **Run more simulations**: Should we have more variables to account for in our win/loss Discriminator, we need to run more simulations to get the precise probabilities.