# Chapter 3
# Predicting Sports Winners with Decision Trees

In [17]:
import pandas as pd

data_filename = "NBA 2014.csv"
dataset = pd.read_csv(data_filename)

In [18]:
# prints first five rows of csv
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,Sat Apr 19 2014,7:00 pm,Atlanta Hawks,101,Indiana Pacers,93,Box Score,,18165,
1,Sat Apr 19 2014,3:30 pm,Golden State Warriors,109,Los Angeles Clippers,105,Box Score,,19339,
2,Sat Apr 19 2014,9:43 pm,Memphis Grizzlies,86,Oklahoma City Thunder,100,Box Score,,18203,
3,Sat Apr 19 2014,12:30 am,Brooklyn Nets,94,Toronto Raptors,87,Box Score,,19800,
4,Sun Apr 20 2014,7:00 pm,Washington Wizards,102,Chicago Bulls,93,Box Score,,21694,


In [19]:
dataset = pd.read_csv(data_filename, parse_dates=["Date"])

dataset.columns = ["Date", "Start (ET)", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "Score Type", "OT?", "Attend.", "Notes"]

In [20]:
# prints first five rows of csv with converted date
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes
0,2014-04-19,7:00 pm,Atlanta Hawks,101,Indiana Pacers,93,Box Score,,18165,
1,2014-04-19,3:30 pm,Golden State Warriors,109,Los Angeles Clippers,105,Box Score,,19339,
2,2014-04-19,9:43 pm,Memphis Grizzlies,86,Oklahoma City Thunder,100,Box Score,,18203,
3,2014-04-19,12:30 am,Brooklyn Nets,94,Toronto Raptors,87,Box Score,,19800,
4,2014-04-20,7:00 pm,Washington Wizards,102,Chicago Bulls,93,Box Score,,21694,


In [102]:
# prints datatypes of columns
print(dataset.dtypes)

Date            datetime64[ns]
Start (ET)              object
Visitor Team            object
VisitorPts               int64
Home Team               object
HomePts                  int64
Score Type              object
OT?                     object
Attend.                  int64
Notes                  float64
dtype: object


In [103]:
# generate additional column
# bool true if home team wins
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]

In [104]:
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes,HomeWin
0,2014-04-19,7:00 pm,Atlanta Hawks,101,Indiana Pacers,93,Box Score,,18165,,False
1,2014-04-19,3:30 pm,Golden State Warriors,109,Los Angeles Clippers,105,Box Score,,19339,,False
2,2014-04-19,9:43 pm,Memphis Grizzlies,86,Oklahoma City Thunder,100,Box Score,,18203,,True
3,2014-04-19,12:30 am,Brooklyn Nets,94,Toronto Raptors,87,Box Score,,19800,,False
4,2014-04-20,7:00 pm,Washington Wizards,102,Chicago Bulls,93,Box Score,,21694,,False


In [105]:
# y_true -> np array with labels - which team is going to win
# train algorithm on previous season
# thing you want algorithm to guess based on features
# want algorithm to guess
y_true = dataset["HomeWin"].values

In [106]:
# mean of home wins -> false(loss) = 0, true(win) = 1
dataset["HomeWin"].mean()
# 56% of time the home team wins

0.5617977528089888

In [107]:
from collections import defaultdict
won_last = defaultdict(int)

In [108]:
dataset["HomeLastWin"] = 0
dataset["VisitorLastWin"] = 0

In [109]:
# Add column for previous wins
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    #dataset.set_value(index, "HomeLastWin", won_last[home_team])
    #dataset.set_value(index, "VisitorLastWin", won_last[visitor_team])
    dataset.loc[index, "HomeLastWin"] = won_last[home_team]
    dataset.loc[index, "VisitorLastWin"] = won_last[visitor_team]
    
    # conditional for last win
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])
    

In [110]:
dataset.head(6)

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin
0,2014-04-19,7:00 pm,Atlanta Hawks,101,Indiana Pacers,93,Box Score,,18165,,False,0,0
1,2014-04-19,3:30 pm,Golden State Warriors,109,Los Angeles Clippers,105,Box Score,,19339,,False,0,0
2,2014-04-19,9:43 pm,Memphis Grizzlies,86,Oklahoma City Thunder,100,Box Score,,18203,,True,0,0
3,2014-04-19,12:30 am,Brooklyn Nets,94,Toronto Raptors,87,Box Score,,19800,,False,0,0
4,2014-04-20,7:00 pm,Washington Wizards,102,Chicago Bulls,93,Box Score,,21694,,False,0,0
5,2014-04-20,9:30 pm,Portland Trail Blazers,122,Houston Rockets,120,Box Score,OT,18240,,False,0,0


In [111]:
dataset.loc[30:35]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin
30,2014-04-28,7:00 pm,Miami Heat,109,Charlotte Bobcats,98,Box Score,,19092,,False,0,1
31,2014-04-28,9:30 pm,San Antonio Spurs,93,Dallas Mavericks,89,Box Score,,20796,,False,1,0
32,2014-04-28,8:00 pm,Atlanta Hawks,107,Indiana Pacers,97,Box Score,,18165,,False,1,0
33,2014-04-29,8:00 pm,Washington Wizards,75,Chicago Bulls,69,Box Score,,21752,,False,0,1
34,2014-04-29,10:30 pm,Golden State Warriors,103,Los Angeles Clippers,113,Box Score,,19657,,True,0,1
35,2014-04-29,9:00 pm,Memphis Grizzlies,100,Oklahoma City Thunder,99,Box Score,OT,18203,,False,1,0


In [112]:
# Enter into decision tree learning algorithm
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
print(X_previouswins[20:25])

[[1 0]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


In [113]:
from sklearn.tree import DecisionTreeClassifier
# clf = classifyer -> object running the classification, technically estimator
# estimator -> gets data and returns some type of stat result (ex: average)
clf = DecisionTreeClassifier(random_state=14)

In [114]:
from sklearn.model_selection import cross_val_score
import numpy as np

In [115]:
scores = cross_val_score(clf, X_previouswins, y_true,
scoring='accuracy')
print(scores)
# cv = 5 in scoring will impove preformance of classifyer
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

[0.61111111 0.72222222 0.72222222 0.5        0.52941176]
Accuracy: 61.7%


In [116]:
import os
standings_filename = os.path.join("Standing 2013.csv")
print(standings_filename)
standings = pd.read_csv(standings_filename, skiprows=1)

Standing 2013.csv


In [117]:
standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Miami Heat,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
1,2,Oklahoma City Thunder,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
2,3,San Antonio Spurs,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6
3,4,Denver Nuggets,57-25,38-3,19-22,19-11,38-14,5-5,10-0,4-6,...,24-4,11-7,28-8,0-1,8-8,9-6,12-3,8-4,13-2,7-1
4,5,Los Angeles Clippers,56-26,32-9,24-17,21-9,35-17,7-3,8-2,6-4,...,17-9,3-5,38-12,1-0,8-6,16-0,9-7,8-5,7-7,7-1


In [118]:
# creates column that compares home team to visiting team rank
dataset["HomeTeamRanksHigher"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    dataset.loc[index, "HomeTeamRanksHigher"], int(home_rank < visitor_rank)

In [119]:
X_homehigher = dataset[[ "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values

In [120]:
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 61.7%


In [121]:
last_match_winner = defaultdict(int)
dataset["HomeTeamWonLast"] = 0

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    # Set in the row, who won the last encounter
    home_team_won_last = 1 if last_match_winner[teams] == row["Home Team"] else 0
    dataset.loc[index, "HomeTeamWonLast"], home_team_won_last
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner

In [122]:
dataset.loc[40:45]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRanksHigher,HomeTeamWonLast
40,2014-05-01,10:30 pm,Los Angeles Clippers,99,Golden State Warriors,100,Box Score,,19596,,True,0,1,0,0
41,2014-05-01,8:00 pm,Oklahoma City Thunder,104,Memphis Grizzlies,84,Box Score,,18119,,False,1,0,0,0
42,2014-05-02,7:00 pm,Toronto Raptors,83,Brooklyn Nets,97,Box Score,,17732,,True,0,1,0,0
43,2014-05-02,8:00 pm,San Antonio Spurs,111,Dallas Mavericks,113,Box Score,,20799,,True,0,1,0,0
44,2014-05-02,10:30 pm,Houston Rockets,98,Portland Trail Blazers,99,Box Score,,20204,,True,0,1,0,0
45,2014-05-03,5:30 pm,Atlanta Hawks,80,Indiana Pacers,92,Box Score,,18165,,True,1,0,0,0


In [123]:
X_lastwinner = dataset[[ "HomeTeamWonLast", "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_lastwinner, y_true, scoring='accuracy')

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 61.7%


In [124]:
# Each team name will be a feature (number) - uses two transformers

from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()
# estimator: generates map between names and numbers
# type: label encoder
encoding.fit(dataset["Home Team"].values)
# transformer: string in home team column into ints
home_teams = encoding.transform(dataset["Home Team"].values)
# transformer: string in visitor team column into ints
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
# put two columns next to eachother
X_teams = np.vstack([home_teams, visitor_teams]).T

from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))


Accuracy: 58.5%


In [145]:
from sklearn.pipeline import Pipeline

"""
Create pipline for the code above
"""
    
new_pipeline = Pipeline([('Transformer_1', LabelEncoder()),
                ('Transformer_2', OneHotEncoder()),
                ('Estimator', DecisionTreeClassifier(random_state=14))])
scores = cross_val_score(new_pipeline, X_teams, y_true, scoring='accuracy', cv=5)
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: nan%


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
TypeError: fit_transform() takes 2 positional arguments but 3 were given



In [140]:
"""
Pipeline(steps=[('Estimator', encoding.fit(dataset["Home Team"].values)),
                ('Home_Teams Transformer (Label_Encoder)', encoding.transform(dataset["Home Team"].values)),
                ('Visitor_Teams Transformer (Label_Encoder)', encoding.transform(dataset["Visitor Team"].values)),
                ('Encoder', onehot.fit_transform(X_teams).todense())])
            
new_pipeline = Pipeline([('Estimator', encoding.fit(dataset["Home Team"].values)),
                       ('Home_Teams Transformer (Label_Encoder)', encoding.transform(dataset["Home Team"].values)),
                      ('Visitor_Teams Transformer (Label_Encoder)', encoding.transform(dataset["Visitor Team"].values)),
                      ('Encoder', onehot.fit_transform(X_teams).todense())])
"""

'\nPipeline(steps=[(\'Estimator\', encoding.fit(dataset["Home Team"].values)),\n                (\'Home_Teams Transformer (Label_Encoder)\', encoding.transform(dataset["Home Team"].values)),\n                (\'Visitor_Teams Transformer (Label_Encoder)\', encoding.transform(dataset["Visitor Team"].values)),\n                (\'Encoder\', onehot.fit_transform(X_teams).todense())])\n            \nnew_pipeline = Pipeline([(\'Estimator\', encoding.fit(dataset["Home Team"].values)),\n                       (\'Home_Teams Transformer (Label_Encoder)\', encoding.transform(dataset["Home Team"].values)),\n                      (\'Visitor_Teams Transformer (Label_Encoder)\', encoding.transform(dataset["Visitor Team"].values)),\n                      (\'Encoder\', onehot.fit_transform(X_teams).todense())])\n'

Accuracy: nan%


In [40]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 59.6%


In [41]:
# last winner in addition to onehotencoder
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 64.1%


In [42]:
# Give RandomForestClassifier more trees (100 default), 250 trees
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14, n_estimators=250)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 62.9%


In [43]:
# Play with different parameters to see which preformance is best
from sklearn.model_selection import GridSearchCV
parameter_space = {
    "max_features": [2, 10, 'auto'],
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
print(clf)
grid = GridSearchCV(clf, parameter_space)
print(grid)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

RandomForestClassifier(random_state=14)
GridSearchCV(estimator=RandomForestClassifier(random_state=14),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [2, 10, 'auto'],
                         'min_samples_leaf': [2, 4, 6],
                         'n_estimators': [100, 200]})
Accuracy: 66.3%


In [45]:
# Gives the best parameters from parameter space that have the best accuracy 
print(grid.best_estimator_)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

RandomForestClassifier(criterion='entropy', max_features=10, min_samples_leaf=2,
                       random_state=14)
Accuracy: 66.3%
