### IMPORTS

In [140]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import GradientBoostingClassifier

## DATA LOADING

In [141]:
# Set pandas options to display all columns
pd.set_option('display.max_columns', None)

# Set pandas options to display all rows
pd.set_option('display.max_rows', None)

teams = pd.read_csv("../../datasets/teams.csv")

### INITIAL DATA CLEANING

In [142]:
# Converting Year to int
teams['year'] = teams['year'].astype(int)

# Mapping playoff column to 0 and 1
teams['playoff'] = teams['playoff'].map({'Y': 1, 'N': 0})

# Now we need to shift the playoff column from a year to the next so the playoff is predicted by the year before
teams.sort_values(['franchID', 'year'])
teams['playoff'] = teams.groupby('franchID')['playoff'].shift(-1)
teams.loc[teams['franchID'] != teams['franchID'].shift(-1), 'playoff'] = np.nan


# Checking if there are collumns with all rows the same and dropping them
cols_with_same_values = []
for col in teams.columns:
    if len(teams[col].unique()) == 1:
        cols_with_same_values.append(col)

teams = teams.drop(cols_with_same_values, axis=1)

# Check the number of nulls in each column
null_counts = teams.isnull().sum()
print("\nNull Counts:\n", null_counts)


Null Counts:
 year            0
tmID            0
franchID        0
confID          0
rank            0
playoff        20
firstRound     62
semis         104
finals        122
name            0
o_fgm           0
o_fga           0
o_ftm           0
o_fta           0
o_3pm           0
o_3pa           0
o_oreb          0
o_dreb          0
o_reb           0
o_asts          0
o_pf            0
o_stl           0
o_to            0
o_blk           0
o_pts           0
d_fgm           0
d_fga           0
d_ftm           0
d_fta           0
d_3pm           0
d_3pa           0
d_oreb          0
d_dreb          0
d_reb           0
d_asts          0
d_pf            0
d_stl           0
d_to            0
d_blk           0
d_pts           0
won             0
lost            0
GP              0
homeW           0
homeL           0
awayW           0
awayL           0
confW           0
confL           0
min             0
attend          0
arena           0
dtype: int64


As the columns `firstRound`, `semis` and `finals` are the ones that have null values and can take the values of 'W', 'L, or NaN we need to find a way to fix this. Since the most important part for our prediction is the regular season performance we could just remove those columns. However, we decided to keep them and fill the NaN values with -1, the 'W' values with 1 and the 'L' values with 0. This way we can use these columns as features for the model.

In [143]:
# Define a custom function to replace values
def replace_values(value):
    if pd.isna(value):
        return -1
    elif value == 'W':
        return 1
    elif value == 'L':
        return 0
    return value

# Apply the custom function to the columns
teams['firstRound'] = teams['firstRound'].apply(replace_values)
teams['semis'] = teams['semis'].apply(replace_values)
teams['finals'] = teams['finals'].apply(replace_values)

teams.sort_values(by=['franchID','year'], inplace=True)

Now we need to find a way to treat categorical features such as `name` and `arena`. For that, we will check the number of unique values for each of the categorical features and decide what strategy to use.

In [144]:
# Select categorical columns
categorical_columns = teams.select_dtypes(include=['object']).columns

# Get the number of unique values for each categorical column
unique_values = teams[categorical_columns].nunique()

print(unique_values)

tmID        20
franchID    18
confID       2
name        20
arena       22
dtype: int64


Considering the number of unique values is too high and the columns generated on one-hot encoding would be too many, we decided to use `Sckit-learn`'s `LabelEncoder` to transform the categorical features into numerical ones as there is no ordinal relationship between the values.

In [145]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in categorical_columns:
    teams[col] = le.fit_transform(teams[col])

categorical_df = teams[categorical_columns]
print(categorical_df)

     tmID  franchID  confID  name  arena
0       0         0       0     0     11
1       0         0       0     0     11
2       1         1       0     1      4
3       1         1       0     1      4
4       1         1       0     1      4
5       1         1       0     1      4
6       1         1       0     1      4
7       1         1       0     1      4
8       1         1       0     1     17
9       2         2       0     2     19
10      2         2       0     2     19
11      2         2       0     2     19
12      2         2       0     2     19
13      3         3       0     3     12
14      3         3       0     3     12
15      3         3       0     3     12
16      3         3       0     3     12
86     12         4       0    12      3
87     12         4       0    12      3
88     12         4       0    12      3
17      4         4       0     4     10
18      4         4       0     4     10
19      4         4       0     4     10
20      4       

Now for statistical features, we decided to create new features that represent the team's performance in the regular season. We will create the following features:

- win_rate: the ratio of games won to games played;
- points_diff: the difference between the points scored and the points conceded;
- poss: the number of possessions;
- off_rating: the number of points scored per 100 possessions;
- def_rating: the number of points conceded per 100 possessions;
- margin: the difference between the offensive and defensive ratings;
- net_rating: an estimate of the team's point differential per 100 possessions;
- pace: the number of possessions per 40 minutes;
- ts_pct: the true shooting percentage - A measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws;


In [146]:
teams['win_rate'] = teams['won'] / (teams['won'] + teams['lost'])
teams['point_diff'] = teams['o_pts'] - teams['d_pts']
teams['Tm_Poss'] = (teams['o_fga'] + 0.4 * teams['o_fta'] - 
                    1.07 * (teams['o_oreb'] / (teams['o_oreb'] + teams['d_dreb'])) * 
                    (teams['o_fga'] - teams['o_fgm']) + teams['o_to'])

teams['Opp_Poss'] = (teams['d_fga'] + 0.4 * teams['d_fta'] - 
                     1.07 * (teams['d_oreb'] / (teams['d_oreb'] + teams['o_dreb'])) * 
                     (teams['d_fga'] - teams['d_fgm']) + teams['d_to'])

teams['poss'] = 0.5 * (teams['Tm_Poss'] + teams['Opp_Poss'])

teams['off_rating'] = (teams['o_pts'] / teams['poss']) * 100

# Calculate defensive rating
teams['def_rating'] = (teams['d_pts'] / teams['poss']) * 100

# Calculate margin
teams['margin'] = teams['off_rating'] - teams['def_rating']

# Calculate net rating
teams['net_rating'] = (teams['point_diff'] / teams['poss']) * 100

# Calculate the pace
teams['pace'] = 40 * ((teams['Tm_Poss'] + teams['Opp_Poss']) / (2 * (teams['min'] / 5)))


# Calculate True Shooting Attempts (TSA)
teams['TSA'] = teams['o_fga'] + 0.44 * teams['o_fta']

# Calculate True Shooting Percentage (TS%)
teams['TS'] = teams['o_pts'] / (2 * teams['TSA'])

# Drop intermediate columns
teams = teams.drop(columns=['Tm_Poss', 'Opp_Poss', 'TSA'])

print(teams.columns)



Index(['year', 'tmID', 'franchID', 'confID', 'rank', 'playoff', 'firstRound',
       'semis', 'finals', 'name', 'o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_3pm',
       'o_3pa', 'o_oreb', 'o_dreb', 'o_reb', 'o_asts', 'o_pf', 'o_stl', 'o_to',
       'o_blk', 'o_pts', 'd_fgm', 'd_fga', 'd_ftm', 'd_fta', 'd_3pm', 'd_3pa',
       'd_oreb', 'd_dreb', 'd_reb', 'd_asts', 'd_pf', 'd_stl', 'd_to', 'd_blk',
       'd_pts', 'won', 'lost', 'GP', 'homeW', 'homeL', 'awayW', 'awayL',
       'confW', 'confL', 'min', 'attend', 'arena', 'win_rate', 'point_diff',
       'poss', 'off_rating', 'def_rating', 'margin', 'net_rating', 'pace',
       'TS'],
      dtype='object')


In [147]:
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import cross_val_score

# Assuming 'teams' DataFrame is already loaded and preprocessed

# Copy and preprocess data
proc_data = teams.copy()
proc_data = proc_data.drop('franchID', axis=1)
proc_data = proc_data.dropna()

# Define the testing year
testing_year = 6

# Split the data into training and testing sets
x_train = proc_data.loc[proc_data['year'] <= testing_year].drop('playoff', axis=1)
y_train = proc_data.loc[proc_data['year'] <= testing_year]['playoff'].astype('int')
x_test = proc_data.loc[proc_data['year'] > testing_year].drop('playoff', axis=1)
y_test = proc_data.loc[proc_data['year'] > testing_year]['playoff'].astype('int')

# Define models
models = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Extra Trees', ExtraTreesClassifier()),
    ('MLP', MLPClassifier(random_state=184, max_iter=1000)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC(probability=True, random_state=42)),  # Enable probability estimates for SVM
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Bernoulli Naive Bayes', BernoulliNB()),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

results = []

# Train and evaluate models
for model_name, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    y_proba = model.predict_proba(x_test)[:, 1]  # Get probability estimates for the positive class

    accuracy = accuracy_score(y_test, y_pred)
    logloss = log_loss(y_test, y_proba)
    roc_auc = roc_auc_score(y_test, y_proba)

    results.append((model_name, accuracy, logloss, roc_auc))

# Calculate baseline accuracy
baseline_accuracy = max(y_test.mean(), 1 - y_test.mean())
print(f"Baseline Accuracy: {baseline_accuracy:.2f}")

# Display results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Log Loss', 'ROC AUC'])
results_df = results_df.sort_values('Accuracy', ascending=False)
display(results_df)

# Use cross-validation to get a more robust estimate of model performance
for model_name, model in models:
    cv_scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
    print(f"{model_name} Cross-Validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

Baseline Accuracy: 0.62


Unnamed: 0,Model,Accuracy,Log Loss,ROC AUC
7,Bernoulli Naive Bayes,0.666667,0.83815,0.644444
5,SVM,0.615385,0.681394,0.469444
1,Random Forest,0.589744,0.749468,0.591667
6,Gaussian Naive Bayes,0.589744,10.207099,0.519444
4,KNN,0.564103,0.800564,0.4625
8,Gradient Boosting,0.564103,1.537005,0.529167
0,Decision Tree,0.538462,16.635532,0.4625
2,Extra Trees,0.538462,0.781579,0.594444
3,MLP,0.307692,24.953299,0.255556


Decision Tree Cross-Validation Accuracy: 0.54 ± 0.11
Random Forest Cross-Validation Accuracy: 0.63 ± 0.06
Extra Trees Cross-Validation Accuracy: 0.61 ± 0.09
MLP Cross-Validation Accuracy: 0.46 ± 0.05
KNN Cross-Validation Accuracy: 0.57 ± 0.05
SVM Cross-Validation Accuracy: 0.57 ± 0.02
Gaussian Naive Bayes Cross-Validation Accuracy: 0.64 ± 0.08
Bernoulli Naive Bayes Cross-Validation Accuracy: 0.63 ± 0.08
Gradient Boosting Cross-Validation Accuracy: 0.53 ± 0.14
