In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score #lo deberia de usar?
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC #Linear Support Vector Machines (SVM), should be faster.
from sklearn.svm import SVC #Support Vector Machines (SVM)
from sklearn.tree import DecisionTreeClassifier #Decision Tree (DT)
from sklearn.neighbors import KNeighborsClassifier #K Nearest Neighbors (KNN)
import warnings
warnings.filterwarnings('ignore', category = UserWarning)


In [2]:

def selectBestModel(models,names,*datasets):
    """Pick the model with the highest test-set score and report its scores.

    Parameters
    ----------
    models : sequence
        Already-fitted estimators exposing ``score(X, y)`` and ``fit(X, y)``.
    names : sequence of str
        One label per entry of ``models``, in the same order.
    *datasets
        Exactly four positional values: ``xtrain, xtest, ytrain, ytest``.

    Returns
    -------
    tuple
        ``(best_model, best_name, train_score, test_score)``. Both scores are
        computed after ``best_model`` has been re-fitted on the training data.
        When several models tie, the first one in ``models`` wins.

    Raises
    ------
    ValueError
        If ``datasets`` does not hold four items, ``models`` is empty, or
        ``names`` and ``models`` differ in length.
    """
    if len(datasets) != 4:
        raise ValueError(
            "expected xtrain, xtest, ytrain, ytest but got {} dataset(s)".format(len(datasets)))
    xtrain, xtest, ytrain, ytest = datasets

    models = list(models)
    names = list(names)
    if not models:
        raise ValueError("models is empty: there is nothing to select from")
    if len(names) != len(models):
        # Fail before any scoring instead of surfacing an IndexError (or a
        # silently wrong label) once the winner has already been chosen.
        raise ValueError(
            "got {} names for {} models".format(len(names), len(models)))

    # NOTE(review): the winner is chosen on the test split, so the reported
    # test score is optimistic; a validation-based criterion would avoid that.
    modelsScores = [model.score(xtest, ytest) for model in models]
    # max() returns the first maximal index, same as list.index(max(...)).
    indexbestScore = max(range(len(modelsScores)), key=modelsScores.__getitem__)
    bestModel = models[indexbestScore]

    # NOTE(review): the models must already be fitted to be scored above, so
    # this re-fit repeats the whole search when bestModel is a GridSearchCV.
    # Kept because callers may rely on the model being fitted on xtrain.
    bestModel.fit(xtrain, ytrain)
    return (bestModel, names[indexbestScore],
            bestModel.score(xtrain, ytrain), bestModel.score(xtest, ytest))

#['accuracy','precision','recall','roc_auc']
def modelling(typeModel,cv,n,*datasets):
    """Run an accuracy-scored grid search for one classifier family.

    Parameters
    ----------
    typeModel : {'SVM', 'SVMlinear', 'KNN', 'DT'}
        Which estimator and parameter grid to search.
    cv : int or cross-validation splitter
        Forwarded unchanged to ``GridSearchCV(cv=...)``.
    n : number
        Upper bound of the searched range: ``C`` in ``np.arange(1, n+1, 0.1)``
        for 'SVM' and 'SVMlinear', ``n_neighbors`` in ``np.arange(1, n+1)``
        for 'KNN'. Not used for 'DT'.
    *datasets
        Exactly four positional values: ``xtrain, xtest, ytrain, ytest``.
        Only the training pair is used here.

    Returns
    -------
    GridSearchCV
        The search object after ``fit(xtrain, ytrain)``.

    Raises
    ------
    ValueError
        If ``typeModel`` is not one of the supported names, or if
        ``datasets`` does not unpack into four items.
    """
    xtrain, xtest, ytrain, ytest = datasets

    if typeModel == 'SVM':
        estimator = SVC()
        # NOTE(review): 'degree' is crossed with every kernel although only
        # 'poly' reads it, so many candidates are duplicates. Left unchanged
        # to keep the reported best_params_ keys the same.
        paramGrid = {'kernel':['linear','rbf','poly'], 'C':np.arange(1,n+1,0.1),
                     'gamma':['scale','auto'], 'degree':np.arange(1,3+1)}

    elif typeModel == 'SVMlinear':
        estimator = LinearSVC()
        # With dual=False, LinearSVC rejects loss='hinge' for both penalties.
        # Those candidates could only fail and score NaN, so they are left
        # out; the fittable candidates are exactly the ones searched before.
        paramGrid = {'penalty':['l1','l2'], 'loss':['squared_hinge'], 'dual':[False],
                     'C':np.arange(1,n+1,0.1), 'max_iter':[1000]}

    elif typeModel == 'KNN':
        estimator = KNeighborsClassifier()
        # NOTE(review): 'minkowski' with the default p looks equivalent to
        # 'euclidean'; confirm before pruning it from the grid.
        paramGrid = {'weights':['uniform','distance'],
                     'metric':['euclidean','manhattan','chebyshev','minkowski'],
                     'n_neighbors':np.arange(1,n+1)}

    elif typeModel == 'DT':
        estimator = DecisionTreeClassifier()
        paramGrid = {'criterion':['gini','entropy']}

    else:
        # Previously an unknown name fell through to model.fit() and failed
        # with an UnboundLocalError that hid the real cause.
        raise ValueError(
            "Unknown typeModel {!r}; expected one of 'SVM', 'SVMlinear', 'KNN', 'DT'".format(typeModel))

    model = GridSearchCV(estimator = estimator, param_grid = paramGrid,
                         scoring = 'accuracy', cv = cv, n_jobs = -1)
    # Fit the search on the training split only.
    model.fit(xtrain,ytrain)
    return model

In [3]:
# Load the spreads/scores table; the path is resolved against the kernel's
# current working directory.
spreads_and_scores = pd.read_csv(filepath_or_buffer='spreads_and_scores_clean.csv')

In [4]:
# Favored level buckets for an absolute point spread:
#   0 -> slightly favored   (spread below 3)
#   1 -> moderately favored (spread from 3 through 7)
#   2 -> heavily favored    (everything else)
def determine_spread_favorite_lvl(spread):
    """Map a non-negative spread to its favored level (0, 1 or 2).

    The caller is expected to pass the magnitude of the spread; a negative
    value always lands in level 0 because only the ``< 3`` test applies.
    """
    if spread < 3:
        return 0
    # Past the guard above, only the upper bound still needs checking.
    return 1 if spread <= 7 else 2
# Derive the favored level from the magnitude of the spread. Working on the
# single column avoids materialising every row as a Series, which the previous
# row-wise apply(axis=1) did just to read one value.
spreads_and_scores['favored_level'] = (
    spreads_and_scores['spread_favorite'].abs().map(determine_spread_favorite_lvl)
)
spreads_and_scores

Unnamed: 0,schedule_season,schedule_week,schedule_playoff,team_home,team_away,spread_favorite,over_under_line,stadium,weather_temperature,weather_wind_mph,weather_humidity,team_favorite_away,team_favorite_home,total_score,did_home_team_win,favored_level
0,0,1,0,0,0,-3.0,35.0,0,64.0,8.0,66.0,1,0,24.0,1,1
1,0,1,0,1,1,-1.0,34.0,1,72.0,9.0,81.0,1,0,29.0,0,0
2,0,1,0,2,2,-6.0,35.0,2,65.0,5.0,77.0,1,0,18.0,1,1
3,0,1,0,3,3,-3.0,42.0,3,82.0,10.0,58.0,0,1,41.0,0,1
4,0,1,0,4,4,-9.0,38.0,4,66.0,11.0,70.0,0,1,61.0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10438,41,19,1,26,19,-4.0,48.5,51,35.0,0.0,43.0,0,1,35.0,0,1
10439,41,19,1,3,26,-2.5,54.0,3,35.0,6.0,54.0,0,1,78.0,1,0
10440,41,19,1,14,14,-3.0,48.0,48,50.0,11.0,69.0,0,1,57.0,0,1
10441,41,20,1,3,19,-7.0,54.5,3,41.0,4.0,41.0,0,1,51.0,0,1


In [5]:
# Model inputs are the 'team_favorite_home' and 'favored_level' columns;
# the target is the 'did_home_team_win' column.
features = spreads_and_scores.loc[:, ['team_favorite_home', 'favored_level']]
labels = spreads_and_scores.loc[:, 'did_home_team_win']

In [6]:
# Build three hold-out splits of the same features/labels. Each call uses
# random_state=5, so the partitions are reproducible across runs.

# Split 1: 80% train / 20% test (Pareto).
(train_features1, test_features1,
 train_labels1, test_labels1) = train_test_split(
    features, labels, test_size=0.2, random_state=5)

# Split 2: 70% train / 30% test.
(train_features2, test_features2,
 train_labels2, test_labels2) = train_test_split(
    features, labels, test_size=0.3, random_state=5)

# Split 3: 75% train / 25% test.
(train_features3, test_features3,
 train_labels3, test_labels3) = train_test_split(
    features, labels, test_size=0.25, random_state=5)


In [7]:
#%%time
# KNN grid search (n_neighbors up to 15) on split 1 with 3-, 5- and 10-fold CV.
knn1_3cv, knn1_5cv, knn1_10cv = [
    modelling('KNN', folds, 15, train_features1, test_features1, train_labels1, test_labels1)
    for folds in (3, 5, 10)
]
#knn1_loocv = modelling('KNN',LeaveOneOut(),15,train_features1, test_features1, train_labels1, test_labels1)

In [8]:
#%%time
#modelsNamesKNN1 = ['KNN D1 '+ i for i in ('3CV','5CV','10CV','LOOCV')]
#modelsKNN1 = [knn1_3cv, knn1_5cv, knn1_10cv, knn1_loocv]
# Rank the three KNN searches for split 1 (selectBestModel compares test
# accuracy) and print the winner; the message text is Spanish.
modelsKNN1 = [knn1_3cv, knn1_5cv, knn1_10cv]
modelsNamesKNN1 = ['KNN D1 {}'.format(tag) for tag in ('3CV', '5CV', '10CV')]
(bestModelKNN1, namebestModelKNN1,
 bestModelKNN1scoretrain, bestModelKNN1scoretest) = selectBestModel(
    modelsKNN1, modelsNamesKNN1,
    train_features1, test_features1, train_labels1, test_labels1)
print("El mejor es {} con un accuracy en el training de {} y con un accuracy en el testing de {}".format(
    namebestModelKNN1, bestModelKNN1scoretrain, bestModelKNN1scoretest))

El mejor es KNN D1 5CV con un accuracy en el training de 0.6497486234139335 y con un accuracy en el testing de 0.6615605552896122


In [9]:
#%%time
# KNN grid search (n_neighbors up to 15) on split 2 with 3-, 5- and 10-fold CV.
knn2_3cv, knn2_5cv, knn2_10cv = [
    modelling('KNN', folds, 15, train_features2, test_features2, train_labels2, test_labels2)
    for folds in (3, 5, 10)
]
#knn2_loocv = modelling('KNN',LeaveOneOut(),15,train_features2, test_features2, train_labels2, test_labels2)

In [10]:
#%%time
#modelsNamesKNN2 = ['KNN D2 '+ i for i in ('3CV','5CV','10CV','LOOCV')]
#modelsKNN2 = [knn2_3cv, knn2_5cv, knn2_10cv, knn2_loocv]
# Rank the three KNN searches for split 2 (selectBestModel compares test
# accuracy) and print the winner; the message text is Spanish.
modelsKNN2 = [knn2_3cv, knn2_5cv, knn2_10cv]
modelsNamesKNN2 = ['KNN D2 {}'.format(tag) for tag in ('3CV', '5CV', '10CV')]
(bestModelKNN2, namebestModelKNN2,
 bestModelKNN2scoretrain, bestModelKNN2scoretest) = selectBestModel(
    modelsKNN2, modelsNamesKNN2,
    train_features2, test_features2, train_labels2, test_labels2)
print("El mejor es {} con un accuracy en el training de {} y con un accuracy en el testing de {}".format(
    namebestModelKNN2, bestModelKNN2scoretrain, bestModelKNN2scoretest))

El mejor es KNN D2 5CV con un accuracy en el training de 0.5961696306429548 y con un accuracy en el testing de 0.5965528247685924


In [11]:
#%%time
# KNN grid search (n_neighbors up to 15) on split 3 with 3-, 5- and 10-fold CV.
knn3_3cv, knn3_5cv, knn3_10cv = [
    modelling('KNN', folds, 15, train_features3, test_features3, train_labels3, test_labels3)
    for folds in (3, 5, 10)
]
#knn3_loocv = modelling('KNN',LeaveOneOut(),15,train_features3, test_features3, train_labels3, test_labels3)

In [12]:
#%%time
#modelsNamesKNN3 = ['KNN D3 '+ i for i in ('3CV','5CV','10CV','LOOCV')]
#modelsKNN3 = [knn3_3cv, knn3_5cv, knn3_10cv, knn3_loocv]
# Rank the three KNN searches for split 3 (selectBestModel compares test
# accuracy) and print the winner; the message text is Spanish.
modelsKNN3 = [knn3_3cv, knn3_5cv, knn3_10cv]
modelsNamesKNN3 = ['KNN D3 {}'.format(tag) for tag in ('3CV', '5CV', '10CV')]
(bestModelKNN3, namebestModelKNN3,
 bestModelKNN3scoretrain, bestModelKNN3scoretest) = selectBestModel(
    modelsKNN3, modelsNamesKNN3,
    train_features3, test_features3, train_labels3, test_labels3)
print("El mejor es {} con un accuracy en el training de {} y con un accuracy en el testing de {}".format(
    namebestModelKNN3, bestModelKNN3scoretrain, bestModelKNN3scoretest))

El mejor es KNN D3 10CV con un accuracy en el training de 0.6502808988764045 y con un accuracy en el testing de 0.669858291842206


In [13]:
#%%time
# Decision-tree grid search on split 1 with 3-, 5- and 10-fold CV.
# The third argument (0) is a placeholder: the 'DT' branch does not read n.
dt1_3cv, dt1_5cv, dt1_10cv = [
    modelling('DT', folds, 0, train_features1, test_features1, train_labels1, test_labels1)
    for folds in (3, 5, 10)
]
#dt1_loocv = modelling('DT',LeaveOneOut(),0,train_features1, test_features1, train_labels1, test_labels1)

In [14]:
#%%time
#modelsNamesDT1 = ['DT D1 '+ i for i in ('3CV','5CV','10CV','LOOCV')]
#modelsDT1 = [dt1_3cv, dt1_5cv, dt1_10cv, dt1_loocv]
# Rank the three decision-tree searches for split 1 (selectBestModel compares
# test accuracy) and print the winner; the message text is Spanish.
modelsDT1 = [dt1_3cv, dt1_5cv, dt1_10cv]
modelsNamesDT1 = ['DT D1 {}'.format(tag) for tag in ('3CV', '5CV', '10CV')]
(bestModelDT1, namebestModelDT1,
 bestModelDT1scoretrain, bestModelDT1scoretest) = selectBestModel(
    modelsDT1, modelsNamesDT1,
    train_features1, test_features1, train_labels1, test_labels1)
print("El mejor es {} con un accuracy en el training de {} y con un accuracy en el testing de {}".format(
    namebestModelDT1, bestModelDT1scoretrain, bestModelDT1scoretest))

El mejor es DT D1 3CV con un accuracy en el training de 0.6548958582714867 y con un accuracy en el testing de 0.6692197223551939


In [15]:
#%%time
# Decision-tree grid search on split 2 with 3-, 5- and 10-fold CV.
# The third argument (0) is a placeholder: the 'DT' branch does not read n.
dt2_3cv, dt2_5cv, dt2_10cv = [
    modelling('DT', folds, 0, train_features2, test_features2, train_labels2, test_labels2)
    for folds in (3, 5, 10)
]
#dt2_loocv = modelling('DT',LeaveOneOut(),0,train_features2, test_features2, train_labels2, test_labels2)

In [16]:
#%%time
#modelsNamesDT2 = ['DT D2 '+ i for i in ('3CV','5CV','10CV','LOOCV')]
#modelsDT2 = [dt2_3cv, dt2_5cv, dt2_10cv, dt2_loocv]
# Rank the three decision-tree searches for split 2 (selectBestModel compares
# test accuracy) and print the winner; the message text is Spanish.
modelsDT2 = [dt2_3cv, dt2_5cv, dt2_10cv]
modelsNamesDT2 = ['DT D2 {}'.format(tag) for tag in ('3CV', '5CV', '10CV')]
(bestModelDT2, namebestModelDT2,
 bestModelDT2scoretrain, bestModelDT2scoretest) = selectBestModel(
    modelsDT2, modelsNamesDT2,
    train_features2, test_features2, train_labels2, test_labels2)
print("El mejor es {} con un accuracy en el training de {} y con un accuracy en el testing de {}".format(
    namebestModelDT2, bestModelDT2scoretrain, bestModelDT2scoretest))

El mejor es DT D2 3CV con un accuracy en el training de 0.6518467852257182 y con un accuracy en el testing de 0.6715608043408873


In [17]:
#%%time
# Decision-tree grid search on split 3 with 3-, 5- and 10-fold CV.
# The third argument (0) is a placeholder: the 'DT' branch does not read n.
dt3_3cv, dt3_5cv, dt3_10cv = [
    modelling('DT', folds, 0, train_features3, test_features3, train_labels3, test_labels3)
    for folds in (3, 5, 10)
]
#dt3_loocv = modelling('DT',LeaveOneOut(),0,train_features3, test_features3, train_labels3, test_labels3)

In [18]:
#%%time
#modelsNamesDT3 = ['DT D3 '+ i for i in ('3CV','5CV','10CV','LOOCV')]
#modelsDT3 = [dt3_3cv, dt3_5cv, dt3_10cv, dt3_loocv]
# Rank the three decision-tree searches for split 3 (selectBestModel compares
# test accuracy) and print the winner; the message text is Spanish.
modelsDT3 = [dt3_3cv, dt3_5cv, dt3_10cv]
modelsNamesDT3 = ['DT D3 {}'.format(tag) for tag in ('3CV', '5CV', '10CV')]
(bestModelDT3, namebestModelDT3,
 bestModelDT3scoretrain, bestModelDT3scoretest) = selectBestModel(
    modelsDT3, modelsNamesDT3,
    train_features3, test_features3, train_labels3, test_labels3)
print("El mejor es {} con un accuracy en el training de {} y con un accuracy en el testing de {}".format(
    namebestModelDT3, bestModelDT3scoretrain, bestModelDT3scoretest))

El mejor es DT D3 3CV con un accuracy en el training de 0.6533452502553626 y con un accuracy en el testing de 0.6710072769054002


In [19]:
#%%time
# LinearSVC grid search ('SVMlinear', C range bounded by n=15) on split 1
# with 3-, 5- and 10-fold CV.
svm1_3cv, svm1_5cv, svm1_10cv = [
    modelling('SVMlinear', folds, 15, train_features1, test_features1, train_labels1, test_labels1)
    for folds in (3, 5, 10)
]
#svm1_loocv = modelling('SVM',LeaveOneOut(),15,train_features1, test_features1, train_labels1, test_labels1)

In [20]:
#%%time
#modelsNamesSVM1 = ['SVM D1 '+ i for i in ('3CV','5CV','10CV','LOOCV')]
#modelsSVM1 = [svm1_3cv, svm1_5cv, svm1_10cv, svm1_loocv]
# Rank the three searches for split 1 (selectBestModel compares test accuracy)
# and print the winner; the message text is Spanish. The labels read "SVM",
# but these are the 'SVMlinear' (LinearSVC) searches.
modelsSVM1 = [svm1_3cv, svm1_5cv, svm1_10cv]
modelsNamesSVM1 = ['SVM D1 {}'.format(tag) for tag in ('3CV', '5CV', '10CV')]
(bestModelSVM1, namebestModelSVM1,
 bestModelSVM1scoretrain, bestModelSVM1scoretest) = selectBestModel(
    modelsSVM1, modelsNamesSVM1,
    train_features1, test_features1, train_labels1, test_labels1)
print("El mejor es {} con un accuracy en el training de {} y con un accuracy en el testing de {}".format(
    namebestModelSVM1, bestModelSVM1scoretrain, bestModelSVM1scoretest))

El mejor es SVM D1 3CV con un accuracy en el training de 0.6548958582714867 y con un accuracy en el testing de 0.6692197223551939


In [21]:
#%%time
# LinearSVC grid search ('SVMlinear', C range bounded by n=15) on split 2
# with 3-, 5- and 10-fold CV.
svm2_3cv, svm2_5cv, svm2_10cv = [
    modelling('SVMlinear', folds, 15, train_features2, test_features2, train_labels2, test_labels2)
    for folds in (3, 5, 10)
]
#svm2_loocv = modelling('SVM',LeaveOneOut(),15,train_features2, test_features2, train_labels2, test_labels2)

In [22]:
#%%time
#modelsNamesSVM2 = ['SVM D2 '+ i for i in ('3CV','5CV','10CV','LOOCV')]
#modelsSVM2 = [svm2_3cv, svm2_5cv, svm2_10cv, svm2_loocv]
# Rank the three searches for split 2 (selectBestModel compares test accuracy)
# and print the winner; the message text is Spanish. The labels read "SVM",
# but these are the 'SVMlinear' (LinearSVC) searches.
modelsSVM2 = [svm2_3cv, svm2_5cv, svm2_10cv]
modelsNamesSVM2 = ['SVM D2 {}'.format(tag) for tag in ('3CV', '5CV', '10CV')]
(bestModelSVM2, namebestModelSVM2,
 bestModelSVM2scoretrain, bestModelSVM2scoretest) = selectBestModel(
    modelsSVM2, modelsNamesSVM2,
    train_features2, test_features2, train_labels2, test_labels2)
print("El mejor es {} con un accuracy en el training de {} y con un accuracy en el testing de {}".format(
    namebestModelSVM2, bestModelSVM2scoretrain, bestModelSVM2scoretest))

El mejor es SVM D2 3CV con un accuracy en el training de 0.6518467852257182 y con un accuracy en el testing de 0.6715608043408873


In [23]:
#%%time
# LinearSVC grid search ('SVMlinear', C range bounded by n=15) on split 3
# with 3-, 5- and 10-fold CV.
svm3_3cv, svm3_5cv, svm3_10cv = [
    modelling('SVMlinear', folds, 15, train_features3, test_features3, train_labels3, test_labels3)
    for folds in (3, 5, 10)
]
#svm3_loocv = modelling('SVM',LeaveOneOut(),15,train_features3, test_features3, train_labels3, test_labels3)

In [24]:
#%%time
#modelsNamesSVM3 = ['SVM D3 '+ i for i in ('3CV','5CV','10CV','LOOCV')]
#modelsSVM3 = [svm3_3cv, svm3_5cv, svm3_10cv, svm3_loocv]
# Rank the three searches for split 3 (selectBestModel compares test accuracy)
# and print the winner; the message text is Spanish. The labels read "SVM",
# but these are the 'SVMlinear' (LinearSVC) searches.
modelsSVM3 = [svm3_3cv, svm3_5cv, svm3_10cv]
modelsNamesSVM3 = ['SVM D3 {}'.format(tag) for tag in ('3CV', '5CV', '10CV')]
(bestModelSVM3, namebestModelSVM3,
 bestModelSVM3scoretrain, bestModelSVM3scoretest) = selectBestModel(
    modelsSVM3, modelsNamesSVM3,
    train_features3, test_features3, train_labels3, test_labels3)
print("El mejor es {} con un accuracy en el training de {} y con un accuracy en el testing de {}".format(
    namebestModelSVM3, bestModelSVM3scoretrain, bestModelSVM3scoretest))

El mejor es SVM D3 3CV con un accuracy en el training de 0.6533452502553626 y con un accuracy en el testing de 0.6710072769054002
