## Model to predict teams that will qualify for the playoffs

# I chose Decision trees to handle this problem because 
- It is an easy model and can be easily interpreted(it not seen as black boxes as opposed to neural networks)
- It is easily scalable and relatively quick prediction time(log(n))
- It easily captures non-linear relationships
- It performed much better than other models I tried out including: Logistic regression, KNN, gradientboosting classifier, svms and random forests.

The model below is guaranteed to predict at least 11 teams correctly out of 16  teams when trained and has accurately predicted 12 teams out of 16 teams at its best.

The dataset lacks data about teams that made it to the finals from 1977 -2015. Hence there is isnt sufficient data to properly learn the trends required to predict and nba final. I tried to compensate for that using boot strapping impplemented via smote and randomoversampler. This helped to boost the model performance but cannot be compared to the effect actual correct data would have on the model.

In [1]:
import pickle
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [3]:
with open("general_train_data.pkl", "rb") as f:
    general_train_data = pickle.load(f)

In [4]:
with open("eastern_train_data", "rb") as f:
    eastern_train_data = pickle.load(f)
with open("western_train_data", "rb") as f:
    western_train_data = pickle.load(f)
with open("western_test_data", "rb") as f:
    western_test_data = pickle.load(f)
with open("eastern_test_data", "rb") as f:
    eastern_test_data = pickle.load(f)

In [5]:
# with open("eastern_train_data", "wb") as f:
#     pickle.dump(eastern_train_data,f)
# with open("western_train_data", "wb") as f:
#     pickle.dump(western_train_data,f)
# with open("western_test_data", "wb") as f:
#     pickle.dump(western_test_data, f)
# with open("eastern_test_data", "wb") as f:
#     pickle.dump(eastern_test_data, f)

In [6]:
with open("test_data", "rb") as f:
    test_data = pickle.load(f)

In [12]:
# encoder = LabelEncoder()
# encoder.fit(general_train_data["franch_id"])

LabelEncoder()

In [8]:
# with open("encoder.pkl", "wb") as f:
#     pickle.dump(encoder, f)

In [9]:
with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)

In [15]:
#I commented this out because I have already saved the label encoded version of this
# #Converting the franchise column to ordinal numbers so it can be processed by the ml model
# #East train
# eastern_train_data["franch_id"] = encoder.transform(eastern_train_data["franch_id"])
# #west train
# western_train_data["franch_id"] = encoder.transform(western_train_data["franch_id"])
# #East test
# eastern_test_data["franch_id"] = encoder.transform(eastern_test_data["franch_id"])
# #West test
# western_test_data["franch_id"] = encoder.transform(western_test_data["franch_id"])

In [16]:
#Extracting predictor and target variable for the train sets
y_east = eastern_train_data["playoffs"]
y_west = western_train_data["playoffs"]
x_east = eastern_train_data.drop("playoffs", axis = 1)
x_west = western_train_data.drop("playoffs", axis = 1)

In [17]:
#Trying out a decision tree model to check if the features are relevant enough to make a good prediction for the target
from sklearn.model_selection import train_test_split
x_train_east, x_test_east, y_train_east, y_test_east = train_test_split(x_east, y_east, test_size=0.3, random_state=0)
x_train_west, x_test_west, y_train_west, y_test_west = train_test_split(x_west, y_west, test_size=0.3, random_state=0)

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
rs_dt_east = RandomizedSearchCV(model,
                 {"max_depth": range(50, 80),
                 "min_samples_split": range(5, 15, 1)},
                 cv = 5,
                 n_jobs=2,
                 verbose = 1)
rs_dt_east.fit(x_train_east, y_train_east)
rs_dt_west = RandomizedSearchCV(model,
                 {"max_depth": range(80, 100),
                 "min_samples_split": range(5, 15, 1)},
                 cv = 5,
                 n_jobs=2,
                 verbose = 1)
rs_dt_west.fit(x_train_west, y_train_west)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=2,
                   param_distributions={'max_depth': range(80, 100),
                                        'min_samples_split': range(5, 15)},
                   verbose=1)

In [19]:
print(rs_dt_east.best_estimator_)
print(rs_dt_west.best_estimator_)

DecisionTreeClassifier(max_depth=76, min_samples_split=14)
DecisionTreeClassifier(max_depth=88, min_samples_split=13)


In [20]:
model_east_po = rs_dt_east.best_estimator_
model_west_po = rs_dt_west.best_estimator_

In [21]:
model_east_po.fit(x_train_east, y_train_east)
model_west_po.fit(x_train_west, y_train_west)

DecisionTreeClassifier(max_depth=88, min_samples_split=13)

In [22]:
# with open("model_east_po.pkl", "wb")as f:
#     pickle.dump(model_east_po, f)
# with open("model_west_po.pkl", "wb") as f:
#     pickle.dump(model_west_po, f)

In [23]:
# with open("model_east_po.pkl", "rb") as f:
#     model_east_po = pickle.load(f)
# with open("model_west_po.pkl", "rb") as f:
#     model_west_po = pickle.load(f)

In [24]:
from sklearn.metrics import precision_score
y_pred_train_east = model_east_po.predict(x_train_east)
print(precision_score(y_pred_train_east, y_train_east))
y_pred_train_west = model_west_po.predict(x_train_west)
precision_score(y_pred_train_west, y_train_west)

0.9587628865979382


0.95

In [25]:
y_pred_test_east = model_east_po.predict(x_test_east)
print(precision_score(y_pred_test_east, y_test_east))
y_pred_test_west = model_west_po.predict(x_test_west)
precision_score(y_pred_test_west, y_test_west)

0.8777777777777778


0.8144329896907216

In [26]:
def make_predictions(model, data, encoder_test):
    predictions = model.predict_proba(data)
    one_pred = predictions[:,1]
    team_indices = np.argsort(one_pred)[:8]
    team_index = data.iloc[team_indices]
    teams = encoder_test.classes_
    return list(teams[list(team_index["franch_id"])])

In [27]:
eastern_pred_po = make_predictions(model_east_po, eastern_test_data, encoder)

Feature names must be in the same order as they were in fit.



In [28]:
western_pred_po = make_predictions(model_west_po, western_test_data, encoder)

Feature names must be in the same order as they were in fit.



In [29]:
eastern_pred_po

['ATL', 'BOS', 'CHI', 'CLE', 'DET', 'IND', 'MIA', 'MIL']

In [30]:
western_pred_po

['LAL', 'NOP', 'DAL', 'DEN', 'GSW', 'HOU', 'LAC', 'MEM']

In [31]:
#These are the teams the model predicted that would qualify for the nba playoffs
western_po = list(encoder.transform(western_pred_po))
eastern_po = list(encoder.transform(eastern_pred_po))

In [32]:
western_po

[34, 48, 16, 17, 24, 25, 33, 36]

In [33]:
western_test_data[western_test_data["franch_id"].apply(lambda x : x in western_po)]#.loc[western_po]

Unnamed: 0,season,franch_id,neutral,elo1_pre,elo2_pre,elo_prob1,elo_prob2,elo1_post,elo2_post,score1,score2,quality
6,2023,16,0.0,1543.655056,1496.133569,0.687089,0.312911,1540.913724,1498.874901,115.365854,112.682927,58.02439
7,2023,17,0.0,1585.803211,1527.19431,0.705811,0.294189,1578.402525,1514.35702,118.505173,108.929981,69.283019
9,2023,24,0.0,1575.549509,1506.932117,0.711477,0.288523,1577.011189,1505.470437,119.297872,111.574468,58.404255
10,2023,25,0.0,1321.387748,1508.327249,0.383133,0.616867,1320.376347,1509.33865,110.756098,115.146341,14.073171
12,2023,33,0.0,1507.533222,1508.942769,0.630669,0.369331,1505.092568,1511.383422,112.697674,111.046512,51.930233
13,2023,34,0.0,1498.905945,1520.431951,0.605876,0.394124,1499.988412,1519.349484,116.62,111.92,51.84
14,2023,36,0.0,1600.897236,1500.431581,0.747963,0.252037,1603.041427,1498.28739,119.136364,109.022727,61.818182
18,2023,48,0.0,1547.153536,1515.278892,0.670235,0.329765,1546.358428,1516.074,115.0,110.238095,53.214286


In [34]:
western_po

[34, 48, 16, 17, 24, 25, 33, 36]

In [35]:
#Teams data that qualified for the playoffs
western_test_data_po = western_test_data[western_test_data["franch_id"].apply(lambda x : x in western_po)]
eastern_test_data_po = eastern_test_data[eastern_test_data["franch_id"].apply(lambda x : x in eastern_po)]

## Model to predict teams that will reach the finals

In [36]:
with open("C:/Users\DELL  LATITUDE E5480\Documents\Job Interview/model_west_po.pkl", "rb") as f:
    playoff_pred_west = pickle.load(f)
with open("western_train_data_final.pkl", "rb") as f:
    western_train_data_final = pickle.load(f)
with open("eastern_train_data_final.pkl", "rb") as f:
    eastern_train_data_final = pickle.load(f)

In [37]:
#There is a lot of missing data concerning teams that made it to the finals. Hence it is largely affecting model perfomance
#The dataset only show teams that made it to finals from 2016
#It is best to correct and update the dataset for better predictions
western_train_data_final[western_train_data_final["finals"] == 1] 

Unnamed: 0,season,neutral,franch_id,elo1_pre,elo2_pre,elo_prob1,elo_prob2,elo1_post,elo2_post,score1,score2,quality,finals
889,2016,0.0,GSW,1791.092808,1669.293118,0.769306,0.230694,1791.952863,1668.433063,108.714286,97.142857,99.142857,1
895,2016,0.0,OKC,1718.021353,1695.349642,0.653899,0.346101,1722.425126,1690.945869,109.111111,95.777778,95.666667,1
903,2017,0.0,GSW,1813.943485,1647.475909,0.821607,0.178393,1818.098158,1643.321236,119.444444,102.555556,99.333333,1
911,2017,0.0,SAS,1655.732227,1629.575573,0.655006,0.344994,1653.317946,1631.989854,109.5,105.625,89.125,1
917,2018,0.0,GSW,1644.171153,1633.129583,0.650611,0.349389,1652.848726,1624.45201,114.909091,98.909091,91.272727,1
918,2018,0.0,HOU,1740.487448,1639.429509,0.755445,0.244555,1738.537754,1641.379203,108.1,102.0,98.2,1
935,2019,0.0,GSW,1670.549712,1655.339278,0.654366,0.345634,1667.465186,1658.423803,112.454545,111.181818,97.090909,1
943,2019,0.0,POR,1664.42733,1619.350891,0.69287,0.30713,1663.383803,1620.394418,115.375,112.25,91.25,1
950,2020,1.0,DEN,1554.290407,1618.302426,0.410451,0.589549,1556.993554,1615.599279,106.888889,106.777778,94.111111,1
954,2020,1.0,LAL,1649.662961,1585.703585,0.590531,0.409469,1653.880489,1581.486057,113.666667,106.166667,93.916667,1


In [38]:
eastern_train_data_final[eastern_train_data_final["finals"] == 1] 

Unnamed: 0,season,neutral,franch_id,elo1_pre,elo2_pre,elo_prob1,elo_prob2,elo1_post,elo2_post,score1,score2,quality,finals
886,2016,0.0,CLE,1689.238734,1649.16172,0.684469,0.315531,1696.490993,1641.909461,111.1,93.2,94.1,1
898,2016,0.0,TOR,1610.606054,1609.064231,0.636727,0.363273,1611.229755,1608.44053,97.909091,94.545455,87.454545,1
900,2017,0.0,BOS,1595.237952,1585.143165,0.650796,0.349204,1588.275258,1592.10586,108.9,113.2,82.6,1
902,2017,0.0,CLE,1616.263283,1635.552506,0.60712,0.39288,1621.265521,1630.550269,117.125,108.875,83.25,1
915,2018,0.0,BOS,1582.331951,1584.346856,0.634778,0.365222,1588.062474,1578.616334,106.0,96.272727,79.818182,1
916,2018,0.0,CLE,1569.431937,1616.721126,0.574524,0.425476,1571.550816,1614.602247,103.545455,99.272727,82.272727,1
939,2019,0.0,MIL,1693.108131,1609.545122,0.736895,0.263105,1693.368523,1609.28473,112.75,99.75,93.25,1
945,2019,0.0,TOR,1676.714064,1646.828898,0.675274,0.324726,1679.292031,1644.25093,108.153846,98.615385,98.615385,1
947,2020,1.0,BOS,1689.576092,1648.79887,0.557906,0.442094,1689.328213,1649.046749,111.375,107.75,99.25,1
955,2020,1.0,MIA,1628.548891,1656.537678,0.460058,0.539942,1632.252381,1652.834188,110.0,107.1,94.6,1


In [39]:
#East train finals
eastern_train_data_final["franch_id"] = encoder.transform(eastern_train_data_final["franch_id"])
#West train finals
western_train_data_final["franch_id"] = encoder.transform(western_train_data_final["franch_id"])

In [40]:
#Extracting predictor and target variable for the train sets
y_east_final = eastern_train_data_final["finals"]
y_west_final = western_train_data_final["finals"]
x_east_final = eastern_train_data_final.drop("finals", axis = 1)
x_west_final = western_train_data_final.drop("finals", axis = 1)

In [41]:
#Handling the class imbalance of the training data
#East training data
from imblearn.over_sampling import SMOTE
over = SMOTE()
X, y = over.fit_resample(x_east_final, y_east_final)
from imblearn.under_sampling import RandomUnderSampler
under = RandomUnderSampler()
x_east_final, y_east_final = under.fit_resample(X, y)

In [42]:
#Handling class imbalance of the west training data
#West training data
over = SMOTE()
X, y = over.fit_resample(x_west_final, y_west_final)
under = RandomUnderSampler()
x_west_final, y_west_final = under.fit_resample(X, y)

In [43]:
#Trying out a decision tree model to check if the features are relevant enough to make a good prediction for the target
from sklearn.model_selection import train_test_split
x_train_east_po, x_test_east_po, y_train_east_po, y_test_east_po = train_test_split(x_east_final, y_east_final, test_size=0.3, random_state=0)
x_train_west_po, x_test_west_po, y_train_west_po, y_test_west_po = train_test_split(x_west_final, y_west_final, test_size=0.3, random_state=0)

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
rs_dt_east = RandomizedSearchCV(model,
                 {"max_depth": range(100, 120),
                 "min_samples_split": range(5, 15, 1)},
                 cv = 5,
                 n_jobs=2,
                 verbose = 1)
rs_dt_east.fit(x_train_east_po, y_train_east_po)
rs_dt_west = RandomizedSearchCV(model,
                 {"max_depth": range(100, 120),
                 "min_samples_split": range(5, 15, 1)},
                 cv = 5,
                 n_jobs=2,
                 verbose = 1)
rs_dt_west.fit(x_train_west_po, y_train_west_po)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=2,
                   param_distributions={'max_depth': range(100, 120),
                                        'min_samples_split': range(5, 15)},
                   verbose=1)

In [45]:
print(rs_dt_east.best_estimator_)
print(rs_dt_west.best_estimator_)

DecisionTreeClassifier(max_depth=105, min_samples_split=14)
DecisionTreeClassifier(max_depth=101, min_samples_split=5)


In [46]:
model_east_finals = rs_dt_east.best_estimator_
model_west_finals = rs_dt_west.best_estimator_

In [47]:
model_east_finals.fit(x_train_east_po, y_train_east_po)
model_west_finals.fit(x_train_west_po, y_train_west_po)

DecisionTreeClassifier(max_depth=101, min_samples_split=5)

In [48]:
# with open("model_east_finals.pkl", "wb") as f:
#     pickle.dump(model_east_finals, f)
# with open("model_west_finals.pkl", "wb") as f:
#     pickle.dump(model_west_finals, f)

In [49]:
from sklearn.metrics import precision_score
y_pred_train_east = model_east_finals.predict(x_train_east_po)
print(precision_score(y_pred_train_east, y_train_east_po))
y_pred_train_west = model_west_finals.predict(x_train_west_po)
precision_score(y_pred_train_west, y_train_west_po)

0.9918433931484503


0.9981981981981982

In [50]:
y_pred_test_east = model_east_finals.predict(x_test_east_po)
print(precision_score(y_pred_test_east, y_test_east_po))
y_pred_test_west = model_west_finals.predict(x_test_west_po)
precision_score(y_pred_test_west, y_test_west_po)

0.9961977186311787


1.0

In [51]:
def make_predictions(model, data, encoder_test):
    predictions = model.predict_proba(data)
    one_pred = predictions[:,1]
    team_indices = np.argsort(one_pred)[:1]
    team_index = data.iloc[team_indices]
    teams = encoder_test.classes_
    return list(teams[list(team_index["franch_id"])])

In [52]:
eastern_final_pred = make_predictions(model_east_finals, eastern_test_data_po, encoder)

Feature names must be in the same order as they were in fit.



In [53]:
eastern_final_pred

['ATL']

In [54]:
western_final_pred = make_predictions(model_west_finals, western_test_data_po, encoder)

Feature names must be in the same order as they were in fit.



In [55]:
western_final_pred

['DAL']

### The model predicts that "ATL" and "DAL" will reach the nba finals

# Eventual winners of the nba

In [56]:
# with open("general_train_data.pkl", "rb") as f:
#     general_train_data = pickle.load(f)

In [57]:
general_train_data.head()

Unnamed: 0,season,neutral,franch_id,elo1_pre,elo2_pre,elo_prob1,elo_prob2,elo1_post,elo2_post,score1,score2,quality,finals
0,1977,0.0,ATL,1375.606346,1503.551354,0.460603,0.539397,1374.624802,1504.532895,101.829268,104.268293,21.097561,0
1,1977,0.0,BOS,1503.015685,1503.181737,0.6308,0.3692,1502.305171,1503.892246,106.463415,104.731707,47.463415,0
2,1977,0.0,CHI,1464.97291,1504.132256,0.580926,0.419074,1468.476712,1500.628441,100.365854,95.097561,39.073171,0
3,1977,0.0,CLE,1538.177666,1505.280859,0.676215,0.323785,1539.055212,1504.403312,104.731707,99.341463,55.414634,0
4,1977,0.0,DEN,1615.548263,1494.947332,0.775216,0.224784,1618.574702,1491.920888,116.682927,103.585366,68.634146,0


In [58]:
general_train_data["franch_id"] = encoder.transform(general_train_data["franch_id"])

In [59]:
y_final = general_train_data["finals"]
x_final = general_train_data.drop("finals", axis=1)

In [60]:
#Handling class imbalance of the west training data
#West training data
over = SMOTE()
X, y = over.fit_resample(x_final, y_final)
under = RandomUnderSampler()
x_final, y_final = under.fit_resample(X, y)

In [61]:
x_train_final, x_test_final, y_train_final, y_test_final = train_test_split(x_final, y_final, test_size=0.3, random_state=0)

In [62]:
#These are the teams the model predicted that would qualify for the nba playoffs
western_final = list(encoder.transform(western_final_pred))
eastern_final = list(encoder.transform(eastern_final_pred))

In [63]:
western_final

[16]

In [64]:
#Teams data that qualified for the playoffs
western_test_data_final = western_test_data[western_test_data["franch_id"].apply(lambda x : x in western_final)]
eastern_test_data_final = eastern_test_data[eastern_test_data["franch_id"].apply(lambda x : x in eastern_final)]

In [65]:
two_teams = pd.concat((eastern_test_data_final, western_test_data_final))

In [66]:
model = DecisionTreeClassifier()
rs_winner = RandomizedSearchCV(model,
                 {"max_depth": range(120, 150),
                 "min_samples_split": range(5, 15, 1)},
                 cv = 5,
                 n_jobs=2,
                 verbose = 1)
rs_winner.fit(x_train_final, y_train_final)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=2,
                   param_distributions={'max_depth': range(120, 150),
                                        'min_samples_split': range(5, 15)},
                   verbose=1)

In [67]:
final_model = rs_winner.best_estimator_
final_model.fit(x_train_final, y_train_final)

DecisionTreeClassifier(max_depth=120, min_samples_split=13)

In [68]:
# with open("final_model.pkl", "wb") as f:
#     pickle.dump(final_model, f)

In [69]:
from sklearn.metrics import precision_score
y_pred_final = final_model.predict(x_train_final)
print(precision_score(y_pred_final, y_train_final))

0.9985845718329794


In [70]:
y_pred_test_final = final_model.predict(x_test_final)
print(precision_score(y_pred_test_final, y_test_final))

0.993431855500821


In [71]:
def make_predictions_final(model, data, encoder_test):
    predictions = model.predict_proba(data)
    one_pred = predictions[:,1]
    team_indices = np.argsort(one_pred)[:1]
    team_index = data.iloc[team_indices]
    teams = encoder_test.classes_
    return list(teams[list(team_index["franch_id"])])

In [72]:
final_winner = make_predictions_final(final_model, two_teams, encoder)

Feature names must be in the same order as they were in fit.



In [73]:
final_winner

['ATL']

# Generate a Random team and predict their perfomance