In [1]:
import os
import numpy as np
import pandas as pd
data_filename = "/home/corey/Dropbox/SportsData/NBA2021Season.csv"

In [3]:
results = pd.read_csv(data_filename)
results.head()

Unnamed: 0,Date,Score Type,Visitor/Neutral,VisitorPTS,Home/Neutral,HomePTS
0,2020-12-22 00:00:00,Box Score,Golden State Warriors,99,Brooklyn Nets,125
1,2020-12-22 00:00:00,Box Score,Los Angeles Clippers,116,Los Angeles Lakers,109
2,2020-12-23 00:00:00,Box Score,Charlotte Hornets,114,Cleveland Cavaliers,121
3,2020-12-23 00:00:00,Box Score,New York Knicks,107,Indiana Pacers,121
4,2020-12-23 00:00:00,Box Score,Miami Heat,107,Orlando Magic,113


In [8]:
# Don't read the first row, as it is blank, and parse the date column as a date
results = pd.read_csv(data_filename, parse_dates=["Date"]) #skiprows=[0,])
# Fix the name of the columns
results.columns = ["Date", "Score Type", "Visitor Team", "VisitorPts", "Home Team", "HomePts"]

results.head()

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts
0,2020-12-22,Box Score,Golden State Warriors,99,Brooklyn Nets,125
1,2020-12-22,Box Score,Los Angeles Clippers,116,Los Angeles Lakers,109
2,2020-12-23,Box Score,Charlotte Hornets,114,Cleveland Cavaliers,121
3,2020-12-23,Box Score,New York Knicks,107,Indiana Pacers,121
4,2020-12-23,Box Score,Miami Heat,107,Orlando Magic,113


In [9]:
results["HomeWin"] = results["VisitorPts"] < results["HomePts"]
# Our "class values"
y_true = results["HomeWin"].values
results.head()

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,HomeWin
0,2020-12-22,Box Score,Golden State Warriors,99,Brooklyn Nets,125,True
1,2020-12-22,Box Score,Los Angeles Clippers,116,Los Angeles Lakers,109,False
2,2020-12-23,Box Score,Charlotte Hornets,114,Cleveland Cavaliers,121,True
3,2020-12-23,Box Score,New York Knicks,107,Indiana Pacers,121,True
4,2020-12-23,Box Score,Miami Heat,107,Orlando Magic,113,True


In [10]:
print("Home Win percentage: {0:.1f}%".format(100 * results["HomeWin"].sum() / results["HomeWin"].count()))

Home Win percentage: 53.8%


In [11]:
results["HomeLastWin"] = False
results["VisitorLastWin"] = False
# This creates two new columns, all set to False
results.head()

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,HomeWin,HomeLastWin,VisitorLastWin
0,2020-12-22,Box Score,Golden State Warriors,99,Brooklyn Nets,125,True,False,False
1,2020-12-22,Box Score,Los Angeles Clippers,116,Los Angeles Lakers,109,False,False,False
2,2020-12-23,Box Score,Charlotte Hornets,114,Cleveland Cavaliers,121,True,False,False
3,2020-12-23,Box Score,New York Knicks,107,Indiana Pacers,121,True,False,False
4,2020-12-23,Box Score,Miami Heat,107,Orlando Magic,113,True,False,False


In [17]:
# Now compute the actual values for these
# Did the home and visitor teams win their last game?
from collections import defaultdict
won_last = defaultdict(int)

for index, row in results.iterrows():  # Note that this is not efficient
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    results.iloc[index] = row    
    # Set current win
    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]
results.iloc[20:25]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,HomeWin,HomeLastWin,VisitorLastWin,0,...,1026,1027,1028,1029,1030,1031,1032,1033,1034,1035
20,2020-12-26,Box Score,Oklahoma City Thunder,109,Charlotte Hornets,107,False,False,0,,...,,,,,,,,,,
21,2020-12-26,Box Score,Cleveland Cavaliers,128,Detroit Pistons,119,False,False,True,,...,,,,,,,,,,
22,2020-12-26,Box Score,Orlando Magic,130,Washington Wizards,120,False,False,True,,...,,,,,,,,,,
23,2020-12-26,Box Score,Philadelphia 76ers,109,New York Knicks,89,False,False,True,,...,,,,,,,,,,
24,2020-12-26,Box Score,Indiana Pacers,125,Chicago Bulls,106,False,False,True,,...,,,,,,,,,,


In [18]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)

In [22]:
from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score

# Create a dataset with just the neccessary information
X_previouswins = results[["HomeLastWin", "VisitorLastWin"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print("Using just the last result from the home and visitor teams")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using just the last result from the home and visitor teams
Accuracy: 52.5%


In [23]:
# What about win streaks?
results["HomeWinStreak"] = 0
results["VisitorWinStreak"] = 0
# Did the home and visitor teams win their last game?
from collections import defaultdict
win_streak = defaultdict(int)

for index, row in results.iterrows():  # Note that this is not efficient
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeWinStreak"] = win_streak[home_team]
    row["VisitorWinStreak"] = win_streak[visitor_team]
    results.loc[index] = row    
    # Set current win
    if row["HomeWin"]:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1

In [24]:
clf = DecisionTreeClassifier(random_state=14)
X_winstreak =  results[["HomeLastWin", "VisitorLastWin", "HomeWinStreak", "VisitorWinStreak"]].values
scores = cross_val_score(clf, X_winstreak, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 52.4%


In [28]:
# Let's try see which team is better on the ladder. Using the previous year's ladder
ladder_filename = "/home/corey/Dropbox/SportsData/nba2019Standings.csv"
ladder = pd.read_csv(ladder_filename, skiprows=[0,0])
ladder

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,â‰¤3,â‰¥10,Oct,Nov,Dec,Jan,Feb,Mar,Jul,Aug
0,1,Milwaukee Bucks,56-17,30-5,26-12,37-7,19-10,10-4,13-1,14-2,...,3-2,38-8,2-2,15-1,13-2,11-2,10-1,2-4,1-0,2-5
1,2,Toronto Raptors,53-19,26-10,27-9,34-11,19-8,9-5,13-3,12-3,...,6-4,23-10,4-1,10-3,9-7,12-3,7-3,4-1,,7-1
2,3,Los Angeles Lakers,52-19,25-10,27-9,16-9,36-10,5-5,4-3,7-1,...,7-3,25-11,3-1,14-1,9-5,10-4,9-2,4-1,1-0,2-5
3,4,Los Angeles Clippers,49-23,27-9,22-14,17-7,32-16,5-3,4-3,8-1,...,8-3,32-8,4-2,10-4,10-5,9-4,7-4,4-1,0-1,5-2
4,5,Boston Celtics,48-24,26-10,22-14,30-13,18-11,9-6,9-4,12-3,...,6-5,29-7,3-1,10-4,10-3,9-7,9-3,2-3,0-1,5-2
5,6,Denver Nuggets,46-27,26-11,20-16,17-11,29-16,6-4,4-4,7-3,...,9-5,16-13,3-2,10-2,10-6,11-5,6-4,3-3,,3-5
6,7,Indiana Pacers,45-28,25-11,20-17,28-19,17-9,9-7,8-7,11-5,...,5-7,20-12,1-3,11-4,10-5,9-5,5-7,3-2,,6-2
7,8,Houston Rockets,44-28,24-12,20-16,16-9,28-19,6-3,4-3,6-3,...,4-5,22-14,3-1,10-5,10-5,7-7,9-2,1-4,1-0,3-4
8,9,Miami Heat,44-29,29-7,15-22,30-13,14-16,9-6,11-3,10-4,...,6-4,23-11,4-1,9-4,11-4,8-6,6-7,3-2,,3-5
9,10,Oklahoma City Thunder,44-28,23-14,21-14,17-9,27-19,5-3,6-4,6-2,...,8-6,18-10,1-4,6-7,11-4,12-5,7-3,3-1,,4-4


In [34]:
# We can create a new feature -- HomeTeamRanksHigher\
#ladder.reset_index(inplace = True)
results["HomeTeamRanksHigher"] = 0
for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    elif visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    home_rank = ladder[ladder["Team"] == home_team]["Rk"].values[0]
    visitor_rank = ladder[ladder["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
    results.loc[index] = row
results[:5]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [35]:
X_homehigher =  results[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 52.5%


In [38]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
#from sklearn.grid_search import GridSearchCV

parameter_space = {
                   "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
                   }
clf = DecisionTreeClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_homehigher, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))

Accuracy: 52.7%


In [39]:
# Who won the last match? We ignore home/visitor for this bit
last_match_winner = defaultdict(int)
results["HomeTeamWonLast"] = 0

for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    # Set in the row, who won the last encounter
    row["HomeTeamWonLast"] = 1 if last_match_winner[teams] == row["Home Team"] else 0
    results.loc[index] = row
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner
results.loc[:5]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,HomeWin,HomeLastWin,VisitorLastWin,0,...,1030,1031,1032,1033,1034,1035,HomeWinStreak,VisitorWinStreak,HomeTeamRanksHigher,HomeTeamWonLast
0,2020-12-22,Box Score,Golden State Warriors,99,Brooklyn Nets,125,True,0,0,,...,,,,,,,0,0,0,0
1,2020-12-22,Box Score,Los Angeles Clippers,116,Los Angeles Lakers,109,False,0,0,,...,,,,,,,0,0,0,0
2,2020-12-23,Box Score,Charlotte Hornets,114,Cleveland Cavaliers,121,True,0,0,,...,,,,,,,0,0,1,0
3,2020-12-23,Box Score,New York Knicks,107,Indiana Pacers,121,True,0,0,,...,,,,,,,0,0,0,0
4,2020-12-23,Box Score,Miami Heat,107,Orlando Magic,113,True,0,0,,...,,,,,,,0,0,1,0
5,2020-12-23,Box Score,Washington Wizards,107,Philadelphia 76ers,113,True,0,0,,...,,,,,,,0,0,0,0


In [40]:
X_home_higher =  results[["HomeTeamRanksHigher", "HomeTeamWonLast"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_home_higher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 52.4%


In [41]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
encoding = LabelEncoder()
encoding.fit(results["Home Team"].values)
home_teams = encoding.transform(results["Home Team"].values)
visitor_teams = encoding.transform(results["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 60.0%




In [42]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Using full team labels is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))



Using full team labels is ranked higher
Accuracy: 61.0%




In [43]:
X_all = np.hstack([X_home_higher, X_teams])
print(X_all.shape)

(1036, 62)


In [44]:
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))



Using whether the home team is ranked higher
Accuracy: 61.3%




In [45]:
#n_estimators=10, criterion='gini', max_depth=None, 
#min_samples_split=2, min_samples_leaf=1,
#max_features='auto',
#max_leaf_nodes=None, bootstrap=True,
#oob_score=False, n_jobs=1,
#random_state=None, verbose=0, min_density=None, compute_importances=None
parameter_space = {
                   "max_features": [2, 10, 'auto'],
                   "n_estimators": [100,],
                   "criterion": ["gini", "entropy"],
                   "min_samples_leaf": [2, 4, 6],
                   }
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)















Accuracy: 63.8%
RandomForestClassifier(criterion='entropy', min_samples_leaf=4, random_state=14)
