# Chapter 3
# Predicting Sports Winners with Decision Trees

In [1]:
import pandas as pd

data_filename = "basketball.csv"
dataset = pd.read_csv(data_filename)

In [2]:
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Notes
0,Tue Oct 27 2015,8:00 pm,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,
1,Tue Oct 27 2015,8:00 pm,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,
2,Tue Oct 27 2015,10:30 pm,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,
3,Wed Oct 28 2015,7:30 pm,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,
4,Wed Oct 28 2015,7:30 pm,Chicago Bulls,115,Brooklyn Nets,100,Box Score,,


In [3]:
dataset = pd.read_csv(data_filename, parse_dates=["Date"])

dataset.columns = ["Date", "Start (ET)", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT?", "Score Type", "Notes"]

In [4]:
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Notes
0,2015-10-27,8:00 pm,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,
1,2015-10-27,8:00 pm,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,
2,2015-10-27,10:30 pm,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,
3,2015-10-28,7:30 pm,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,
4,2015-10-28,7:30 pm,Chicago Bulls,115,Brooklyn Nets,100,Box Score,,


In [5]:
print(dataset.dtypes)

Date            datetime64[ns]
Start (ET)              object
Visitor Team            object
VisitorPts               int64
Home Team               object
HomePts                  int64
OT?                     object
Score Type              object
Notes                   object
dtype: object


In [6]:
dataset["HomeWin"] = dataset["VisitorPts"] < dataset["HomePts"]

In [7]:
dataset.head()

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Notes,HomeWin
0,2015-10-27,8:00 pm,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,,False
1,2015-10-27,8:00 pm,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,,True
2,2015-10-27,10:30 pm,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,,True
3,2015-10-28,7:30 pm,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,,True
4,2015-10-28,7:30 pm,Chicago Bulls,115,Brooklyn Nets,100,Box Score,,,False


In [8]:
y_true = dataset["HomeWin"].values

In [9]:
dataset["HomeWin"].mean()

0.5942249240121581

In [10]:
from collections import defaultdict
won_last = defaultdict(int)

In [11]:
dataset["HomeLastWin"] = 0
dataset["VisitorLastWin"] = 0

In [12]:
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    dataset.set_value(index, "HomeLastWin", won_last[home_team])
    dataset.set_value(index, "VisitorLastWin", won_last[visitor_team])
    
    won_last[home_team] = int(row["HomeWin"])
    won_last[visitor_team] = 1 - int(row["HomeWin"])
    

In [13]:
dataset.head(6)

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Notes,HomeWin,HomeLastWin,VisitorLastWin
0,2015-10-27,8:00 pm,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,,False,0,0
1,2015-10-27,8:00 pm,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,,True,0,0
2,2015-10-27,10:30 pm,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,,True,0,0
3,2015-10-28,7:30 pm,Philadelphia 76ers,95,Boston Celtics,112,Box Score,,,True,0,0
4,2015-10-28,7:30 pm,Chicago Bulls,115,Brooklyn Nets,100,Box Score,,,False,0,1
5,2015-10-28,7:30 pm,Utah Jazz,87,Detroit Pistons,92,Box Score,,,True,1,0


In [14]:
dataset.ix[1000:1005]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Notes,HomeWin,HomeLastWin,VisitorLastWin
1000,2016-03-15,7:00 pm,Denver Nuggets,110,Orlando Magic,116,Box Score,,,True,0,0
1001,2016-03-15,8:30 pm,Los Angeles Clippers,87,San Antonio Spurs,108,Box Score,,,True,1,0
1002,2016-03-16,7:00 pm,Oklahoma City Thunder,130,Boston Celtics,109,Box Score,,,False,0,1
1003,2016-03-16,7:00 pm,Orlando Magic,99,Charlotte Hornets,107,Box Score,,,True,0,1
1004,2016-03-16,7:00 pm,Dallas Mavericks,98,Cleveland Cavaliers,99,Box Score,,,True,0,1
1005,2016-03-16,7:30 pm,Atlanta Hawks,118,Detroit Pistons,114,Box Score,,,False,0,1


In [15]:
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values

In [16]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)

In [17]:
from sklearn.cross_validation import cross_val_score
import numpy as np

In [18]:
scores = cross_val_score(clf, X_previouswins, y_true,
scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 59.4%


In [19]:
import os
standings_filename = os.path.join("standings.csv")

standings = pd.read_csv(standings_filename, skiprows=1)

In [20]:
standings.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Golden State Warriors,67-15,39-2,28-13,25-5,42-10,9-1,7-3,9-1,...,25-6,5-3,45-9,1-0,13-2,11-3,12-3,8-3,16-2,6-2
1,2,Atlanta Hawks,60-22,35-6,25-16,38-14,22-8,12-6,14-4,12-4,...,17-11,6-4,30-10,0-1,9-5,14-2,17-0,7-4,9-7,4-3
2,3,Houston Rockets,56-26,30-11,26-15,23-7,33-19,9-1,8-2,6-4,...,20-9,8-4,31-14,2-0,11-4,9-5,11-6,7-3,10-6,6-2
3,4,Los Angeles Clippers,56-26,30-11,26-15,19-11,37-15,7-3,6-4,6-4,...,21-7,3-5,33-9,2-0,9-5,11-6,11-4,5-6,11-5,7-0
4,5,Memphis Grizzlies,55-27,31-10,24-17,20-10,35-17,8-2,5-5,7-3,...,16-13,9-3,26-13,2-0,13-2,8-6,12-4,7-4,9-8,4-3


In [21]:
dataset["HomeTeamRanksHigher"] = 0
for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    dataset.set_value(index, "HomeTeamRanksHigher", int(home_rank < visitor_rank))

In [22]:
X_homehigher = dataset[[ "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values

In [23]:
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 60.9%


In [24]:
last_match_winner = defaultdict(int)
dataset["HomeTeamWonLast"] = 0

for index, row in dataset.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    # Set in the row, who won the last encounter
    home_team_won_last = 1 if last_match_winner[teams] == row["Home Team"] else 0
    dataset.set_value(index, "HomeTeamWonLast", home_team_won_last)
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner

In [25]:
dataset.ix[400:450]

Unnamed: 0,Date,Start (ET),Visitor Team,VisitorPts,Home Team,HomePts,OT?,Score Type,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRanksHigher,HomeTeamWonLast
400,2015-12-19,5:00 pm,Los Angeles Lakers,78,Oklahoma City Thunder,118,Box Score,,,True,0,0,1,0
401,2015-12-19,7:00 pm,Charlotte Hornets,101,Washington Wizards,109,Box Score,,,True,0,1,1,0
402,2015-12-20,1:00 pm,Minnesota Timberwolves,100,Brooklyn Nets,85,Box Score,,,False,0,1,1,0
403,2015-12-20,3:30 pm,Philadelphia 76ers,86,Cleveland Cavaliers,108,Box Score,,,True,1,0,1,1
404,2015-12-20,8:00 pm,New Orleans Pelicans,130,Denver Nuggets,125,Box Score,,,False,0,0,0,1
405,2015-12-20,1:00 pm,Portland Trail Blazers,109,Miami Heat,116,Box Score,,,True,0,0,0,0
406,2015-12-20,6:00 pm,Atlanta Hawks,103,Orlando Magic,100,Box Score,,,False,1,1,0,0
407,2015-12-20,5:00 pm,Milwaukee Bucks,101,Phoenix Suns,95,Box Score,,,False,1,0,0,0
408,2015-12-20,6:00 pm,Sacramento Kings,104,Toronto Raptors,94,Box Score,,,False,1,0,1,0
409,2015-12-21,8:00 pm,Portland Trail Blazers,97,Atlanta Hawks,106,Box Score,,,True,1,0,1,0


In [26]:
X_lastwinner = dataset[[ "HomeTeamWonLast", "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values
clf = DecisionTreeClassifier(random_state=14, criterion="entropy")

scores = cross_val_score(clf, X_lastwinner, y_true, scoring='accuracy')

print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 62.2%


In [27]:
from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()
encoding.fit(dataset["Home Team"].values)
home_teams = encoding.transform(dataset["Home Team"].values)
visitor_teams = encoding.transform(dataset["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 62.8%


In [28]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 65.3%


In [29]:
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 63.3%


In [30]:
X_all = np.hstack([X_lastwinner, X_teams])
clf = RandomForestClassifier(random_state=14, n_estimators=250)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 64.5%


In [31]:
from sklearn.grid_search import GridSearchCV
parameter_space = {
    "max_features": [2, 10, 'auto'],
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

Accuracy: 67.4%


In [32]:
print(grid.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=14, verbose=0, warm_start=False)
