In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

In [2]:
# Load the NBA table (the version with redundant features already removed)
# from the CSV sitting next to this notebook.
# NOTE(review): the preview header includes an 'Unnamed: 0' column, which
# looks like an index that was written into the CSV — confirm whether it
# should be read with index_col=0 or dropped before modelling.
df_NBA_slim = pd.read_csv(filepath_or_buffer='NBA_numeric_slim.csv')

# Preview the first five rows.
df_NBA_slim.head(5)

Unnamed: 0,season,age,w,sos,o_rtg,d_rtg,pace,f_tr,x3p_ar,ts_percent,...,away_orb_percent,away_opp_e_fg_percent,away_opp_tov_percent,away_opp_drb_percent,away_opp_ft_fga,away_attend,away_attend_g,home_score,away_score,spread
0,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,27.2,0.481,14.2,70.1,0.274,656081.0,16002.0,120,117,3
1,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,29.5,0.497,14.7,70.9,0.269,591701.0,14432.0,90,93,-3
2,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,28.4,0.454,14.4,72.6,0.262,828384.0,20204.0,105,114,-9
3,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,28.4,0.482,15.1,71.7,0.247,723949.0,17657.0,110,107,3
4,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,31.2,0.461,13.5,73.0,0.207,905116.0,22076.0,68,95,-27


In [5]:
# Flag games the home side won: True when the spread is strictly positive.
# Because NBA games cannot end in a tie, every remaining row (spread <= 0)
# is treated as a home loss.
df_NBA_slim['home win'] = df_NBA_slim['spread'].gt(0)

# Class balance of the new flag; dropna=False would also list missing values.
df_NBA_slim['home win'].value_counts(dropna=False)

home win
True     6762
False    4724
Name: count, dtype: int64

In [22]:
# Cut the point spread into 10 equal-width intervals and keep the bin edges.

# NOTE(review): these ten labels are defined but are not passed to pd.cut
# below, so 'spread_cat' holds interval categories rather than these names.
spread_names = ['-5', '-4', '-3', '-2', '-1', '+1', '+2', '+3', '+4', '+5']

# Raise the column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', 1000)

# retbins=True makes pd.cut return (categorical column, array of bin edges).
df_NBA_slim['spread_cat'], bins = pd.cut(
    x=df_NBA_slim['spread'],
    bins=10,
    retbins=True,
)

# Show the interval assigned to every game.
df_NBA_slim['spread_cat']

0           (2.5, 14.2]
1           (-9.2, 2.5]
2           (-9.2, 2.5]
3           (2.5, 14.2]
4        (-32.6, -20.9]
              ...      
11481       (-9.2, 2.5]
11482       (2.5, 14.2]
11483       (-9.2, 2.5]
11484     (-20.9, -9.2]
11485     (-20.9, -9.2]
Name: spread_cat, Length: 11486, dtype: category
Categories (10, interval[float64, right]): [(-56.117, -44.3] < (-44.3, -32.6] < (-32.6, -20.9] < (-20.9, -9.2] ... (14.2, 25.9] < (25.9, 37.6] < (37.6, 49.3] < (49.3, 61.0]]

In [23]:
# Display the bin edges currently bound to `bins` (here: the edges returned
# by the equal-width pd.cut call in the previous cell).
bins


array([-56.117, -44.3  , -32.6  , -20.9  ,  -9.2  ,   2.5  ,  14.2  ,
        25.9  ,  37.6  ,  49.3  ,  61.   ])

In [24]:
# Quantile-based binning: split the spread into 10 deciles and keep the edges.
# NOTE(review): this rebinds `bins`, replacing the edges produced by the
# earlier pd.cut cell, so any later display of `bins` shows the decile edges.
df_NBA_slim['dec_spread_cat'], bins = pd.qcut(
    x=df_NBA_slim['spread'],
    q=10,
    retbins=True,
)

# Display the frame, which now includes the 'dec_spread_cat' column.
df_NBA_slim

Unnamed: 0,season,age,w,sos,o_rtg,d_rtg,pace,f_tr,x3p_ar,ts_percent,tov_percent,orb_percent,opp_e_fg_percent,opp_tov_percent,opp_drb_percent,opp_ft_fga,attend,attend_g,away_age,away_w,away_sos,away_o_rtg,away_d_rtg,away_pace,away_f_tr,away_x3p_ar,away_ts_percent,away_tov_percent,away_orb_percent,away_opp_e_fg_percent,away_opp_tov_percent,away_opp_drb_percent,away_opp_ft_fga,away_attend,away_attend_g,home_score,away_score,spread,home win,spread_cat,dec_spread_cat
0,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0,27.1,45.0,-0.52,107.5,106.6,93.3,0.357,0.192,0.551,14.7,27.2,0.481,14.2,70.1,0.274,656081.0,16002.0,120,117,3,True,"(2.5, 14.2]","(-1.0, 4.0]"
1,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0,24.9,18.0,-0.16,101.1,107.5,92.3,0.306,0.128,0.497,13.3,29.5,0.497,14.7,70.9,0.269,591701.0,14432.0,90,93,-3,False,"(-9.2, 2.5]","(-5.0, -1.0]"
2,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0,25.0,47.0,-0.41,101.4,100.3,92.4,0.311,0.217,0.517,15.5,28.4,0.454,14.4,72.6,0.262,828384.0,20204.0,105,114,-9,False,"(-9.2, 2.5]","(-14.0, -9.0]"
3,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0,26.7,49.0,0.20,106.0,103.9,93.5,0.350,0.142,0.535,13.8,28.4,0.482,15.1,71.7,0.247,723949.0,17657.0,110,107,3,True,"(2.5, 14.2]","(-1.0, 4.0]"
4,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,14.9,30.7,0.513,14.0,72.1,0.289,586390.0,14302.0,27.9,54.0,-0.55,105.6,101.2,87.2,0.335,0.164,0.519,13.3,31.2,0.461,13.5,73.0,0.207,905116.0,22076.0,68,95,-27,False,"(-32.6, -20.9]","(-56.001, -14.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11481,2020,25.1,25.0,-0.57,110.9,115.5,102.7,0.270,0.358,0.562,12.2,22.2,0.558,13.9,75.3,0.231,532702.0,16647.0,29.2,56.0,-0.67,112.4,102.9,105.1,0.271,0.428,0.583,12.9,20.7,0.489,12.0,81.6,0.178,549036.0,17711.0,134,137,-3,False,"(-9.2, 2.5]","(-5.0, -1.0]"
11482,2020,25.1,25.0,-0.57,110.9,115.5,102.7,0.270,0.358,0.562,12.2,22.2,0.558,13.9,75.3,0.231,532702.0,16647.0,24.5,21.0,-0.26,106.5,113.0,98.6,0.263,0.318,0.531,12.6,25.8,0.541,12.4,78.3,0.224,620789.0,18812.0,122,115,7,True,"(2.5, 14.2]","(4.0, 7.0]"
11483,2020,25.1,25.0,-0.57,110.9,115.5,102.7,0.270,0.358,0.562,12.2,22.2,0.558,13.9,75.3,0.231,532702.0,16647.0,26.1,33.0,0.09,108.5,109.5,98.6,0.256,0.364,0.544,11.5,22.3,0.535,13.2,79.1,0.176,529870.0,17093.0,113,120,-7,False,"(-9.2, 2.5]","(-9.0, -5.0]"
11484,2020,25.1,25.0,-0.57,110.9,115.5,102.7,0.270,0.358,0.562,12.2,22.2,0.558,13.9,75.3,0.231,532702.0,16647.0,27.5,35.0,0.54,113.7,114.8,100.7,0.242,0.374,0.570,11.2,22.4,0.530,11.2,75.3,0.208,628303.0,19634.0,103,122,-19,False,"(-20.9, -9.2]","(-56.001, -14.0]"


In [25]:
# Display the bin edges currently bound to `bins` (here: the decile edges
# returned by the pd.qcut call in the previous cell).
bins


array([-56., -14.,  -9.,  -5.,  -1.,   4.,   7.,   9.,  13.,  19.,  61.])