In [4]:
import numpy as np
import pandas as pd
import os

In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split # for train-test split 
from sklearn.preprocessing import StandardScaler # for feature scaling
from sklearn.model_selection import GridSearchCV # for fine-tuning
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, confusion_matrix, accuracy_score, roc_auc_score # for evaluation
from sklearn.pipeline import make_pipeline # for prediction

In [6]:
# for Generator
from scipy import stats # for sampling
from fitter import Fitter # for fitting the best distribution
import copy # for copying nested dictionaries

In [7]:
import matplotlib.pyplot as plt  # for visualization 
import seaborn as sns  # for coloring 

# set style of graphs
plt.style.use('dark_background')
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5

In [8]:
train_data = pd.read_csv('dataset/train_data.csv')

In [9]:
feature_list = list(train_data.columns)
feature_list

['GAME_DATE_EST',
 'GAME_ID',
 'GAME_STATUS_TEXT',
 'HOME_TEAM_ID',
 'VISITOR_TEAM_ID',
 'SEASON',
 'TEAM_ID_home',
 'PTS_home',
 'FG_PCT_home',
 'FT_PCT_home',
 'FG3_PCT_home',
 'AST_home',
 'REB_home',
 'TEAM_ID_away',
 'PTS_away',
 'FG_PCT_away',
 'FT_PCT_away',
 'FG3_PCT_away',
 'AST_away',
 'REB_away',
 'HOME_TEAM_WINS']

In [10]:
#✍To predict win/loss of a game, we can use one of the two ways:

#1. Select only one feature (points), the win/loss prediction is just based on which team has the higher point.
#2. Select features other than points, the win/loss is then based on the prediction of a classifier which takes those features as inputs.

# In this notebook, we will use option (2) as it offers better range of uncertainty for simulation.

selected_features = [
    'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home',
    'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away',
    ]

# check the features we selected
X = train_data[selected_features]
X.head()

Unnamed: 0,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
0,0.409,0.929,0.308,32.0,56.0,0.372,0.737,0.375,22.0,31.0
1,0.446,0.611,0.4,30.0,58.0,0.403,0.818,0.381,20.0,36.0
2,0.47,0.8,0.333,25.0,38.0,0.488,0.724,0.385,20.0,44.0
3,0.389,0.947,0.238,26.0,54.0,0.395,0.895,0.364,20.0,34.0
4,0.466,0.792,0.5,29.0,42.0,0.43,0.75,0.45,15.0,37.0


In [11]:
y = train_data['HOME_TEAM_WINS']
y.head()

0    1
1    1
2    0
3    1
4    1
Name: HOME_TEAM_WINS, dtype: int64

In [12]:
X = X.to_numpy()
y = y.to_numpy()

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("X shape", X_train.shape, "y shape", y_train.shape)

X shape (16849, 10) y shape (16849,)


##### Utility function

In [14]:
def evaluate(y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(f"accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"AUC: {roc_auc_score(y_test, y_pred)}")
    print(f"MSE: {mean_squared_error(y_test, y_pred)}")
    print(f"MAE: {mean_absolute_error(y_test, y_pred)}")

### 1. Fitting SVM

In [None]:
# for SVM a scaler it can improve accuracy of the model

scaler = StandardScaler() # initialize an instance 
X_train_svm = scaler.fit_transform(X_train)

In [None]:
%%time 

# train SVM

support_vector = SVC() # initialize a model
support_vector.fit(X_train_svm, y_train) # fit(train) it with the training data and targets

# check test score 
y_pred = support_vector.predict(X_test) 

In [None]:
evaluate(y_test, y_pred)

#### Grid search SVM

In [None]:
%%time 

# fine-tuning hyperparameters
#param_grid_svm = {'C': [0.1, 1, 10],
#              'gamma': [1, 0.5, 0.1, 0.01, 0.001, 0.0001],
#              'kernel': ['linear', 'poly', 'sigmoid', 'rbf']
#            }
param_grid_svm = {'C': [0.1],
              'gamma': [0.1],
              'kernel': ['linear']
            }

grid_search_svm = GridSearchCV(estimator=support_vector, param_grid=param_grid_svm, cv=10, verbose=2, scoring='accuracy', 
                            n_jobs = -1, return_train_score=True)

In [None]:
grid_search_svm.fit(X_train_svm, y_train)

In [None]:
grid_search_svm.best_params_

# {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}

In [None]:
%%time 

# train SVM

support_vector = SVC(C=0.1, gamma=1, kernel='linear') # initialize a model
support_vector.fit(X_train_svm, y_train) # fit(train) it with the training data and targets

# check test score 
y_pred = support_vector.predict(X_test) 

In [None]:
evaluate(y_test, y_pred)

### 2. Fitting Random Forest

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state = 42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)

# Fit the random search model
rf_random.fit(X_train, y_train);

In [None]:
rf_random.best_params_

#### Default model

In [None]:
#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=10, random_state=42)

# clf = RandomForestClassifier(n_estimators=20, random_state=0)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

evaluate(y_test, y_pred)

#### Evaluate the Best Random Search Model

In [None]:
#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=2000, min_samples_split=2, min_samples_leaf=4, max_features='auto', max_depth=30, bootstrap=False)

# clf = RandomForestClassifier(n_estimators=20, random_state=0)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [None]:
evaluate(y_test, y_pred)

#### Grid Search

In [None]:
# best param:

#{'bootstrap': True,
# 'max_depth': 30,
# 'max_features': 'auto',
# 'min_samples_leaf': 2,
# 'min_samples_split': 2,
# 'n_estimators': 500}

In [None]:
#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=500, min_samples_split=2, min_samples_leaf=2, max_features='auto', max_depth=30, bootstrap='True')

# clf = RandomForestClassifier(n_estimators=20, random_state=0)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
# another round

param_grid_rf2 = {
    'n_estimators': [100, 200, 300, 400],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [2],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, 30, 50],
    'bootstrap': [True, False]
    }

# Create a base model
rf = RandomForestClassifier(random_state=42)

grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf2, cv=10, verbose=2, scoring='accuracy', 
                            n_jobs = -1, return_train_score=True)

In [None]:
grid_search_rf.fit(X_train, y_train)

In [None]:
grid_search_rf.best_params_

In [None]:
# best param:

#{'bootstrap': True,
# 'max_depth': 30,
# 'max_features': 'auto',
# 'min_samples_leaf': 2,
# 'min_samples_split': 2,
# 'n_estimators': 400}

In [None]:
#Create a Gaussian Classifier
random_forest = RandomForestClassifier(n_estimators=400, min_samples_split=2, min_samples_leaf=2, max_features='auto', max_depth=30, bootstrap='True')

# clf = RandomForestClassifier(n_estimators=20, random_state=0)

#Train the model using the training sets y_pred=clf.predict(X_test)
random_forest.fit(X_train,y_train)

y_pred=random_forest.predict(X_test)

In [None]:
evaluate(y_test, y_pred)

In [None]:
Dis_rf = grid_search_rf.best_estimator_
print(Dis_rf)

### 3. Naive Bayes

#### Gaussian Process

In [None]:
gauss_nb = GaussianNB()

In [None]:
gauss_nb.fit(X_train, y_train)
y_pred = gauss_nb.predict(X_test)

In [None]:
evaluate(y_test, y_pred)

#### Grid search Naive Bayes

In [None]:
param_grid = {
    'var_smoothing': np.logspace(0,-9, num=100)
    }

In [None]:
grid_search_nb = GridSearchCV(estimator=gauss_nb, param_grid=param_grid, cv=10, verbose=2, scoring='accuracy', 
                            n_jobs = -1, return_train_score=True)

In [None]:
grid_search_nb.fit(X_train, y_train)

In [None]:
grid_search_nb.best_params_

#### Optimization

In [15]:
gauss_nb = GaussianNB(var_smoothing=1.519911082952933e-07)

In [16]:
gauss_nb.fit(X_train, y_train)
y_pred = gauss_nb.predict(X_test)

In [17]:
evaluate(y_test, y_pred)

[[2353  612]
 [ 608 3649]]
              precision    recall  f1-score   support

           0       0.79      0.79      0.79      2965
           1       0.86      0.86      0.86      4257

    accuracy                           0.83      7222
   macro avg       0.83      0.83      0.83      7222
weighted avg       0.83      0.83      0.83      7222

accuracy: 0.8310717252838549
AUC: 0.8253841604404372
MSE: 0.1689282747161451
MAE: 0.1689282747161451


In [18]:
Dis_nb = grid_search_nb.best_estimator_
print(Dis_nb)

NameError: name 'grid_search_nb' is not defined

### 3. Fitting a Generator

In [19]:
# Like before, we had held out data from 2019-2020 playoff for real testing
# Though large data is essential for fitting, for time-series problems, we give priority to the recent data most reflective of team's recent ability.
# Since we aim to predict 2019-2020 playoff, here we will just fit the data from that regular session which starts in Oct, 2019.

df_ = train_data.loc[train_data['SEASON'] > 2019].reset_index(drop=True)
df_.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2020-12-11,12000005,Final,Trail Blazers,Kings,2020,1610612757,127.0,0.511,0.727,...,22.0,55.0,1610612758,102.0,0.368,0.727,0.32,24.0,44.0,1
1,2020-12-11,12000004,Final,Lakers,Clippers,2020,1610612747,87.0,0.357,0.8,...,15.0,46.0,1610612746,81.0,0.357,0.647,0.278,15.0,55.0,1
2,2020-12-11,12000003,Final,Bulls,Rockets,2020,1610612741,104.0,0.413,0.73,...,20.0,56.0,1610612745,125.0,0.434,0.714,0.462,25.0,47.0,0
3,2020-12-11,12000002,Final,Pistons,Knicks,2020,1610612765,84.0,0.325,0.88,...,17.0,41.0,1610612752,90.0,0.458,0.474,0.217,23.0,54.0,0
4,2020-12-11,12000001,Final,Hawks,Magic,2020,1610612737,112.0,0.37,0.829,...,20.0,61.0,1610612753,116.0,0.457,0.69,0.286,26.0,49.0,0


In [20]:
selected_distributions = [
    'norm','t', 'f', 'chi', 'cosine', 'alpha', 
    'beta', 'gamma', 'dgamma', 'dweibull',
    'maxwell', 'pareto', 'fisk']

In [21]:
unique_teams = train_data['HOME_TEAM_ID'].unique() # extract all the unique teams

# Since we don't care about whether the team was a host or visitor in each game, 
# we can just combine the features for all games.

# Get all the data for teams
all_team_sim_data = {}

for team_name in unique_teams:
    
    # find games where the team is either the host or guest
    df_team = df_.loc[(df_['HOME_TEAM_ID'] == team_name) | (df_['VISITOR_TEAM_ID'] == team_name)]
    # it is home team, select the first 5 features
    df_1 = df_team.loc[df_team['HOME_TEAM_ID'] == team_name][selected_features[:5]]
    # it is guest team, select the first 5 features
    df_0 = df_team.loc[df_team['VISITOR_TEAM_ID'] == team_name][selected_features[5:]]

    # combine them
    df_0.columns = df_1.columns # before concating, match the column names
    df_s = pd.concat([df_1, df_0], axis = 0)
    
    # convert the pandas.DataFrame to numpy array
    all_team_sim_data[team_name] = df_s.to_numpy()



In [22]:
# data format:
#   team_name => list of feature distributions => dictionary with distribution name and parameters
#   e.g.,
#   megadata = {
      #'Timberwolves': [{'beta': (0.23, 0.3, 0.3, 0.4)}, {'nor': (0.23, 0.3,)}, ..], 
      #'Warriors':[{}, {},...]
      #  }
    
megadata = {} # store the data that our Generator will rely on
for team_name in unique_teams:
    
    feature_dis_paras = []
    data = all_team_sim_data[team_name]
    
    # 5 features for each team
    for i in range(5): 
        f = Fitter(data[:, i]) # initalize a Fitter instance
        f.distributions = selected_distributions # use only the selected distributions (faster)
        f.fit() # do the fitting 
        best_paras = f.get_best(method='sumsquare_error') # get the best fitted paras
        feature_dis_paras.append(best_paras)
        
    megadata[team_name] = feature_dis_paras
    
# print('Features for all teams have been fitted!')

### 4. Simulation

In [49]:
DATA = megadata.copy() # data that Generator must rely on

GEN = {
 'alpha': stats.alpha.rvs,
 'beta': stats.beta.rvs,
 'chi': stats.chi.rvs,
 'cosine': stats.cosine.rvs,
 'dgamma': stats.dgamma.rvs,
 'dweibull':stats.dweibull.rvs,
 'f':stats.f.rvs,
 'fisk':stats.fisk.rvs,
 'gamma': stats.gamma.rvs,
 'maxwell':stats.maxwell.rvs,
 'norm':stats.norm.rvs,
 'pareto':stats.pareto.rvs,
 't':stats.t.rvs,
}

In [24]:
# feature scaler + fine-turned SVM 
# DIS = make_pipeline(scaler, support_vector)
#DIS = make_pipeline(random_forest)
DIS = make_pipeline(gauss_nb)

Process: 

1. sampling: "generate feature values used for making win/loss prediction"
2. predict: "predict the win or loss of  n game(s) played by two tems"

In [55]:
class Game:
    
    '''
    
    A game between two teams:
    
    - feature values sampled from Generator
    - win/loss predicted by Discriminator
    
    '''
    
    def __init__ (self, random_state = None):
        
        self.random_state = random_state # keep this to None for making simulations 
    
    def predict(self, team1, team2, num_games = 1):
        
        ''' predict the win or loss of  n game(s) played by two tems'''
        
        assert num_games >= 1, "at least one game must be played"
        # output numpy array
        team_1_feature_data = DATA[team1]
        team_2_feature_data = DATA[team2]
        
        features = []
        for feature_paras_1 in team_1_feature_data:
            sample_1 = self.sampling(feature_paras_1, size = num_games) # gives a list if num_games> 1
            features.append(sample_1) 
            
        for feature_paras_2 in team_2_feature_data:
            sample_2 = self.sampling(feature_paras_2, size = num_games) # gives a list if num_games> 1
            features.append(sample_2)
            
        features = np.array(features).T 
        win_loss = DIS.predict(features)
        # print(f"team_1: {team1}, team_2: {team2}, win_loss: {list(win_loss)}")
        
        return list(win_loss) # a list of win/loss from num_games
    
    
    def sampling(self, dic, size = 1, random_state = None):
        
        '''generate feature values used for making win/loss prediction'''
                        
        dis_name = list(dic.keys())[0] # get the type
        paras = list(dic.values())[0] # get the paras
    
        # get sample
        sample = GEN[dis_name](*paras, size = size,  random_state =  random_state)

        print(sample)
            
        return sample

Process:

1. play_n_games: "simulate data, and then use our classifier to predict win/loss"
2. play_round: "play a round of games based of a list of paired teams"
3. one_time_sim: "simulate the entire playoff once and also record the accumulated wins"
4. simulate: "simulate the entire playoff n times and also record the accumulated wins"

In [56]:
class FinalTournament(Game):
    
    ''' Best-of-7 elimination, 16 teams, 4 rounds in total to win championship '''
    
    def __init__(self, n_games_per_group = 7, winning_threshold = 4, random_state = None):

        self.n_games_per_group  = n_games_per_group
        self.winning_threshold = winning_threshold
        self.team_list = None
        self.rounds = {} # keep track the number of times a team wins at each round 
        super().__init__(random_state)
        
    
    def simulate(self, group_list, n_simulation = 1, probs = True):
        
        ''' simulate the entire playoff n times and also record the accumulated wins'''
             
        # update the list of teams
        self.rounds = {}
        self.team_list = [i[0] for i in group_list] + [i[1] for i in group_list]
        
        for i in range(n_simulation):
            # print(f"epoch number: {i}")
            cham = self.one_time_simu(group_list)
        if probs:
            self.rounds_probs = self._compute_probs()
            
    
    def one_time_simu(self, group_list, verbose = False, probs = False):
        
        ''' simulate the entire playoff once and also record the accumulated wins'''
        
        # update the list of teams if haven't done so
        if self.team_list == None: 
            self.team_list = [i[0] for i in group_list] + [i[1] for i in group_list]
        round_number, done = 0, 0
        while not done: 
            all_group_winners, group_list = self.play_round(group_list)
            # retrive round stats
            try:
                updated_round_stats = self.rounds[round_number]
            except KeyError:
                updated_round_stats = {}
                for team in self.team_list:
                    updated_round_stats[team] = 0
            # if a team wins, record + 1 
            for winner in all_group_winners:
                try: 
                    updated_round_stats[winner] += 1
                except KeyError:
                    pass     
            self.rounds[round_number] = updated_round_stats
            if verbose:
                print('{} round played'.format(round_number))
            if probs:
                self.rounds_probs = self._compute_probs()
            if type(group_list) != list: # if it becomes the final
                done = 1
            round_number += 1
            
        return group_list

        
    def play_round(self, group_list):
        
        '''play a round of games based of a list of paired teams'''
        
        all_group_winners = [] 
        # play each group and get the group winner
        for group in group_list:
            winner = self.play_n_games(group[0], group[1])
            all_group_winners.append(winner)
        
        if len(all_group_winners) > 1:
            new_group_list = []         
            for index in range(0, len(all_group_winners), 2):
                # first winner, second winner
                new_group = [all_group_winners[index], all_group_winners[index + 1]]
                new_group_list.append(new_group)
                
            return all_group_winners, new_group_list
        else:  
            return all_group_winners, winner
        
        
    def play_n_games(self, team1, team2):
        
        
        '''simulate data, and then use our classifier to predict win/loss'''
        result = Game().predict(team1, team2, self.n_games_per_group)
        if sum(result[:4]) == self.winning_threshold or sum(result) >= self.winning_threshold:
            winner = team1 # home team wins
        else:
            winner = team2 # visitor team wins
            
        return winner
    
    
    def _compute_probs(self):
        
        '''prob = wins for a team / sum of wins for all teams at a particular round'''
        
        rounds_probs = copy.deepcopy(self.rounds)
        for round_number, round_stats in rounds_probs.items():
            m = np.sum(list(round_stats.values()))
            for k, v in rounds_probs[round_number].items():
                rounds_probs[round_number][k] = v / m
                
        return rounds_probs

In [27]:
#2021
group_list_2021 = [
     # Eastern Conference
     ('76ers', 'Wizards'),  # group A 1 
     ('Knicks', 'Hawks'), # group B 4 
    
     ('Bucks', 'Heat'), # group C 3 
     ('Nets', 'Celtics'), # group D 2
    
     # Western Conference
     ('Jazz','Grizzlies'),  # group E 1 
     ('Clippers','Mavericks'), # group F 4 
    
     ('Nuggets', 'Trail Blazers'), # group G 3 
     ('Suns', 'Lakers')] # group H 2

In [26]:
#2022
group_list_2022 = [
     # Eastern Conference
     ('Heat', 'Hawks'),  # group A 1 
     ('76ers', 'Raptors'), # group B 4 
    
     ('Bucks', 'Bulls'), # group C 3 
     ('Celtics', 'Nets'), # group D 2
    
     # Western Conference
     ('Suns','Pelicans'),  # group E 1 
     ('Mavericks','Jazz'), # group F 4 
    
     ('Warriors', 'Nuggets'), # group G 3 
     ('Grizzlies', 'Timberwolves')] # group H 2

In [35]:
def ComputeErrorSimulation(dict_obj):
    correct = 0
    error = 0
    # round 0
    if(dict_obj[0]['76ers'] < dict_obj[0]['Wizards']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Knicks'] > dict_obj[0]['Hawks']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Bucks'] < dict_obj[0]['Heat']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Nets'] < dict_obj[0]['Celtics']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Jazz'] < dict_obj[0]['Grizzlies']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Clippers'] < dict_obj[0]['Mavericks']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Nuggets'] < dict_obj[0]['Trail Blazers']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Suns'] < dict_obj[0]['Lakers']):
        error = error + 1
    else: correct = correct + 1

    # round 1
    if(dict_obj[0]['76ers'] > dict_obj[0]['Hawks']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Bucks'] < dict_obj[0]['Nets']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Jazz'] > dict_obj[0]['Clippers']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Nuggets'] > dict_obj[0]['Suns']):
        error = error + 1
    else: correct = correct + 1

    # round 2
    if(dict_obj[0]['Clippers'] > dict_obj[0]['Suns']):
        error = error + 1
    else: correct = correct + 1
    if(dict_obj[0]['Hawks'] > dict_obj[0]['Bucks']):
        error = error + 1
    else: correct = correct + 1

    # round 3
    if(dict_obj[0]['Bucks'] < dict_obj[0]['Suns']):
        error = error + 1
    else: correct = correct + 1

    return 100*error/15
 

In [45]:
def ComputeSimulation(group_list):
    simulations = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
    threshold = 20
    result = {}
    for i in simulations:
        playoff.simulate(group_list_2021, n_simulation = i)
        result = playoff.rounds_probs
        error = ComputeErrorSimulation(result)
        if threshold < error and threshold > 7:
            return result
    return result

In [52]:
%%time

# initiate a playoff
playoff = FinalTournament()

# simulate the playoff 5,000 times

playoff.simulate(group_list_2021, n_simulation = 5000)


[0, 0, 1, 1, 1, 0, 0]
[1, 1, 0, 1, 0, 0, 1]
[0, 0, 1, 1, 0, 0, 1]
[1, 0, 1, 1, 1, 0, 0]
[1, 0, 1, 0, 0, 0, 0]
[1, 0, 1, 0, 1, 0, 1]
[0, 1, 1, 1, 1, 0, 1]
[1, 1, 1, 1, 1, 1, 0]
[0, 0, 0, 0, 1, 0, 0]
[0, 1, 0, 1, 0, 0, 1]
[1, 0, 1, 1, 1, 0, 1]
[0, 1, 1, 0, 0, 0, 0]
[1, 1, 0, 0, 0, 0, 0]
[0, 0, 1, 0, 0, 0, 1]
[1, 1, 1, 1, 1, 0, 1]
[0, 0, 0, 0, 0, 1, 1]
[0, 0, 1, 0, 0, 0, 0]
[0, 1, 1, 1, 1, 0, 1]
[1, 0, 1, 0, 1, 0, 1]
[1, 1, 0, 1, 1, 1, 1]
[1, 1, 0, 0, 1, 1, 1]
[1, 1, 1, 1, 0, 0, 1]
[0, 1, 1, 1, 1, 1, 1]
[1, 1, 0, 0, 0, 1, 0]
[0, 0, 0, 1, 0, 1, 1]
[0, 0, 0, 0, 1, 1, 1]
[0, 0, 0, 1, 0, 1, 1]
[1, 0, 0, 0, 0, 0, 0]
[1, 1, 0, 1, 0, 0, 0]
[0, 1, 0, 0, 1, 1, 1]
[0, 0, 1, 1, 0, 1, 0]
[1, 0, 1, 0, 0, 0, 1]
[1, 1, 0, 1, 1, 0, 0]
[1, 1, 1, 1, 1, 0, 1]
[1, 1, 0, 0, 1, 1, 1]
[1, 1, 0, 1, 0, 1, 1]
[1, 0, 0, 1, 0, 0, 1]
[1, 1, 0, 1, 1, 0, 1]
[1, 1, 1, 0, 0, 1, 0]
[1, 1, 1, 0, 0, 1, 0]
[0, 1, 0, 0, 0, 0, 0]
[1, 0, 0, 1, 0, 0, 1]
[0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 0, 0, 0, 0]
[0, 0, 1, 1, 0, 0, 0]
[1, 0, 0, 

In [44]:
playoff.rounds_probs

{0: {'76ers': 0.072925,
  'Knicks': 0.0494,
  'Bucks': 0.109475,
  'Nets': 0.114225,
  'Jazz': 0.07425,
  'Clippers': 0.10685,
  'Nuggets': 0.120275,
  'Suns': 0.1098,
  'Wizards': 0.052075,
  'Hawks': 0.0756,
  'Heat': 0.015525,
  'Celtics': 0.010775,
  'Grizzlies': 0.05075,
  'Mavericks': 0.01815,
  'Trail Blazers': 0.004725,
  'Lakers': 0.0152},
 1: {'76ers': 0.0991,
  'Knicks': 0.02285,
  'Bucks': 0.1353,
  'Nets': 0.10385,
  'Jazz': 0.08305,
  'Clippers': 0.10225,
  'Nuggets': 0.15295,
  'Suns': 0.09265,
  'Wizards': 0.07035,
  'Hawks': 0.0577,
  'Heat': 0.00845,
  'Celtics': 0.0024,
  'Grizzlies': 0.0585,
  'Mavericks': 0.0062,
  'Trail Blazers': 0.00075,
  'Lakers': 0.00365},
 2: {'76ers': 0.0564,
  'Knicks': 0.0058,
  'Bucks': 0.2057,
  'Nets': 0.1536,
  'Jazz': 0.0632,
  'Clippers': 0.0945,
  'Nuggets': 0.1857,
  'Suns': 0.1081,
  'Wizards': 0.0426,
  'Hawks': 0.0269,
  'Heat': 0.0067,
  'Celtics': 0.0023,
  'Grizzlies': 0.0454,
  'Mavericks': 0.0013,
  'Trail Blazers': 0.0,
 

### 5. Visualization & Analysis

In [None]:
def plotting(rounds_data):
    
    rounds_stats = list(rounds_data.values())
    team_names = list(rounds_stats[0].keys())
    
    # x is number of rounds used for labels, y is a 2-D array of (n_teams, n_rounds) used for data
    x = list(rounds_data.keys())
    y = np.array([list(r.values()) for r in rounds_stats]).T 
    
    # we need at least 16 different colors, one for each team
    c_1 =  sns.color_palette('tab10', n_colors = 10)
    c_2 =  sns.color_palette("pastel", n_colors = 10)
    color_map = c_1 + c_2 
    
    fig = plt.figure()
    plt.stackplot(x, y, labels = team_names, colors = color_map) 
    plt.legend(bbox_to_anchor=(1.1, 1.1), loc = 'upper left', fontsize=13)
    plt.xticks(x, fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlabel('Round Number', fontsize = 15)
    plt.title('Winning probabilities by all Teams & Rounds', pad = 20, fontsize = 24)
    plt.tight_layout()
    plt.show()
    
    return fig

In [None]:
# check that a team's wins should get less and less in later rounds
fig = plotting(playoff.rounds)

In [None]:
# plot the results: probabilities of winning for all teams at each round
fig = plotting(playoff.rounds_probs)