In [29]:
import pandas as pd
import numpy as np

In [30]:
def normalize(data):
    max_data = np.max(data, axis=0)
    min_data = np.min(data, axis=0)
    stats = ['away_wins', 'away_losses', 'away_ot',
             'away_pts', 'away_ptPctg', 'away_goalsPerGame',
             'away_goalsAgainstPerGame', 'away_evGGARatio',
             'away_powerPlayPercentage', 'away_powerPlayGoals',
             'away_powerPlayGoalsAgainst', 'away_powerPlayOpportunities',
             'away_penaltyKillPercentage', 'away_shotsPerGame', 'away_shotsAllowed',
             'away_winScoreFirst', 'away_winOppScoreFirst', 'away_winLeadFirstPer',
             'away_winLeadSecondPer', 'away_winOutshootOpp', 'away_winOutshotByOpp',
             'away_faceOffsTaken', 'away_faceOffsWon', 'away_faceOffsLost',
             'away_faceOffWinPercentage', 'away_shootingPctg', 'away_savePctg',
             'home_wins', 'home_losses', 'home_ot', 'home_pts', 'home_ptPctg',
             'home_goalsPerGame', 'home_goalsAgainstPerGame', 'home_evGGARatio',
             'home_powerPlayPercentage', 'home_powerPlayGoals',
             'home_powerPlayGoalsAgainst', 'home_powerPlayOpportunities',
             'home_penaltyKillPercentage', 'home_shotsPerGame', 'home_shotsAllowed',
             'home_winScoreFirst', 'home_winOppScoreFirst', 'home_winLeadFirstPer',
             'home_winLeadSecondPer', 'home_winOutshootOpp', 'home_winOutshotByOpp',
             'home_faceOffsTaken', 'home_faceOffsWon', 'home_faceOffsLost',
             'home_faceOffWinPercentage', 'home_shootingPctg', 'home_savePctg']
    for stat in stats:
        data[stat] = (data[stat] - min_data[stat])/(max_data[stat] - min_data[stat])
    return data

In [31]:
def prepare(data):
    X = data.iloc[:,3:].values
    # we insert an all-ones column at index 0
    X = np.insert(X, 0, 1, axis=1)
    # get the first column of the data
    y = data.iloc[:,0:1].values
    return X,y

In [32]:
def split_train_test(X,y,pct=80):
    n = X.shape[0]
    s = round(n * pct / 100)
    
    indices = np.random.permutation(n)
    train_idx, test_idx = indices[:s], indices[s:]
    
    X_train, X_test = X[train_idx,:], X[test_idx,:]
    y_train, y_test = y[train_idx,:], y[test_idx,:]
    
    return X_train, y_train, X_test, y_test

In [33]:
data_2000_2001 = pd.read_csv('game_data/game_data_2000_2001.csv', header=0)
data_2001_2002 = pd.read_csv('game_data/game_data_2001_2002.csv', header=0)
data_2002_2003 = pd.read_csv('game_data/game_data_2002_2003.csv', header=0)
data_2003_2004 = pd.read_csv('game_data/game_data_2003_2004.csv', header=0)
data_2005_2006 = pd.read_csv('game_data/game_data_2005_2006.csv', header=0)
data_2006_2007 = pd.read_csv('game_data/game_data_2006_2007.csv', header=0)
data_2007_2008 = pd.read_csv('game_data/game_data_2007_2008.csv', header=0)
data_2008_2009 = pd.read_csv('game_data/game_data_2008_2009.csv', header=0)
data_2009_2010 = pd.read_csv('game_data/game_data_2009_2010.csv', header=0)
data_2010_2011 = pd.read_csv('game_data/game_data_2010_2011.csv', header=0)
data_2011_2012 = pd.read_csv('game_data/game_data_2011_2012.csv', header=0)
data_2012_2013 = pd.read_csv('game_data/game_data_2012_2013.csv', header=0)
data_2013_2014 = pd.read_csv('game_data/game_data_2013_2014.csv', header=0)
data_2014_2015 = pd.read_csv('game_data/game_data_2014_2015.csv', header=0)
data_2015_2016 = pd.read_csv('game_data/game_data_2015_2016.csv', header=0)
data_2016_2017 = pd.read_csv('game_data/game_data_2016_2017.csv', header=0)
data_2017_2018 = pd.read_csv('game_data/game_data_2017_2018.csv', header=0)

#each one of these data sets needs to be normalized 
data_2000_2001 = normalize(data_2000_2001)
data_2001_2002 = normalize(data_2001_2002)
data_2002_2003 = normalize(data_2002_2003)
data_2003_2004 = normalize(data_2003_2004)
data_2005_2006 = normalize(data_2005_2006)
data_2006_2007 = normalize(data_2006_2007)
data_2007_2008 = normalize(data_2007_2008)
data_2008_2009 = normalize(data_2008_2009)
data_2009_2010 = normalize(data_2009_2010)
data_2010_2011 = normalize(data_2010_2011)
data_2011_2012 = normalize(data_2011_2012)
data_2012_2013 = normalize(data_2012_2013)
data_2013_2014 = normalize(data_2013_2014)
data_2014_2015 = normalize(data_2014_2015)
data_2016_2017 = normalize(data_2016_2017)
data_2017_2018 = normalize(data_2017_2018)


frames = [data_2000_2001, data_2001_2002, data_2002_2003, data_2003_2004, data_2005_2006, 
          data_2006_2007, data_2007_2008, data_2008_2009, data_2009_2010, data_2010_2011, 
          data_2011_2012, data_2012_2013, data_2013_2014, data_2014_2015, data_2015_2016, 
          data_2016_2017, data_2017_2018]
data = pd.concat(frames)

# # data = data.drop(['winner', 'awayID', 'homeID', 'away_wins', 'away_losses', 'away_ot',
#        'away_pts', 'away_ptPctg', 'away_goalsPerGame',
#        'away_goalsAgainstPerGame', 'away_evGGARatio',
#        'away_powerPlayPercentage', 'away_powerPlayGoals',
#        'away_powerPlayGoalsAgainst', 'away_powerPlayOpportunities',
#        'away_penaltyKillPercentage', 'away_shotsPerGame', 'away_shotsAllowed',
#        'away_winScoreFirst', 'away_winOppScoreFirst', 'away_winLeadFirstPer',
#        'away_winLeadSecondPer', 'away_winOutshootOpp', 'away_winOutshotByOpp',
#        'away_faceOffsTaken', 'away_faceOffsWon', 'away_faceOffsLost',
#        'away_faceOffWinPercentage', 'away_shootingPctg', 'away_savePctg',
#        'away_PDO', 'home_wins', 'home_losses', 'home_ot', 'home_pts',
#        'home_ptPctg', 'home_goalsPerGame', 'home_goalsAgainstPerGame',
#        'home_evGGARatio', 'home_powerPlayPercentage', 'home_powerPlayGoals',
#        'home_powerPlayGoalsAgainst', 'home_powerPlayOpportunities',
#        'home_penaltyKillPercentage', 'home_shotsPerGame', 'home_shotsAllowed',
#        'home_winScoreFirst', 'home_winOppScoreFirst', 'home_winLeadFirstPer',
#        'home_winLeadSecondPer', 'home_winOutshootOpp', 'home_winOutshotByOpp',
#        'home_faceOffsTaken', 'home_faceOffsWon', 'home_faceOffsLost',
#        'home_faceOffWinPercentage', 'home_shootingPctg', 'home_savePctg',
#        'home_PDO'], axis=1)
data = data.drop(['away_wins', 'away_losses', 'away_ot',
       'away_pts', 'away_ptPctg', 'away_powerPlayGoals',
       'away_powerPlayGoalsAgainst', 'away_powerPlayOpportunities','away_shotsPerGame', 'away_shotsAllowed',
       'away_winScoreFirst', 'away_winOppScoreFirst', 'away_winLeadFirstPer',
       'away_winLeadSecondPer', 'away_winOutshootOpp', 'away_winOutshotByOpp',
       'away_faceOffsTaken', 'away_faceOffsWon', 'away_faceOffsLost',
       'away_faceOffWinPercentage',
       'home_wins', 'home_losses', 'home_ot',
       'home_pts', 'home_ptPctg', 'home_powerPlayGoals',
       'home_powerPlayGoalsAgainst', 'home_powerPlayOpportunities','home_shotsPerGame', 'home_shotsAllowed',
       'home_winScoreFirst', 'home_winOppScoreFirst', 'home_winLeadFirstPer',
       'home_winLeadSecondPer', 'home_winOutshootOpp', 'home_winOutshotByOpp',
       'home_faceOffsTaken', 'home_faceOffsWon', 'home_faceOffsLost',
       'home_faceOffWinPercentage'], axis=1)
cols = data.columns

X,y = prepare(data)

X,Y,X_test,Y_test = split_train_test(X,y,pct=80)
Y = np.concatenate(Y, axis=0 )

In [34]:
data.head()

Unnamed: 0,winner,awayID,homeID,away_goalsPerGame,away_goalsAgainstPerGame,away_evGGARatio,away_powerPlayPercentage,away_penaltyKillPercentage,away_shootingPctg,away_savePctg,away_PDO,home_goalsPerGame,home_goalsAgainstPerGame,home_evGGARatio,home_powerPlayPercentage,home_penaltyKillPercentage,home_shootingPctg,home_savePctg,home_PDO
0,0,21,25,0.803099,0.075019,0.789802,0.932331,0.475248,0.885714,0.7,1.02,0.574564,0.027842,0.620057,0.75188,0.821782,0.942857,0.733333,1.023
1,0,9,6,0.834087,0.197989,0.734781,0.676692,0.841584,0.828571,0.733333,1.019,0.46417,0.613302,0.795786,0.511278,0.475248,0.4,0.0,0.982
2,1,16,7,0.330536,0.584687,0.664542,0.24812,0.594059,0.428571,0.066667,0.985,0.393802,0.0,0.502732,0.488722,1.0,0.485714,1.0,1.015
3,1,23,4,0.55907,0.508894,0.7673,0.56391,0.287129,0.457143,0.066667,0.986,0.566817,0.216551,1.0,0.458647,0.445545,0.457143,0.566667,1.001
4,0,17,20,0.668819,0.169374,0.849896,0.93985,0.772277,0.371429,0.766667,1.004,0.227889,0.490333,0.444459,0.398496,0.188119,0.171429,0.266667,0.982


In [79]:
cols

Index(['winner', 'awayID', 'homeID', 'away_goalsPerGame',
       'away_goalsAgainstPerGame', 'away_evGGARatio',
       'away_powerPlayPercentage', 'away_penaltyKillPercentage',
       'away_shootingPctg', 'away_savePctg', 'away_PDO', 'home_goalsPerGame',
       'home_goalsAgainstPerGame', 'home_evGGARatio',
       'home_powerPlayPercentage', 'home_penaltyKillPercentage',
       'home_shootingPctg', 'home_savePctg', 'home_PDO'],
      dtype='object')

In [35]:
from sklearn import svm, datasets

Linear SVC

In [36]:
linear_svc = svm.SVC(kernel='linear')

In [37]:
linear_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [38]:
linear_Y_pred = linear_svc.predict(X_test)

In [39]:
linear_Y_pred

array([1, 1, 1, ..., 1, 0, 0])

In [40]:
def accuracy(pred, labels):
    count = 0
    for i in range(0, len(pred)):
        if(pred[i] == labels[i]):
            count += 1
    return count/len(pred)

In [41]:
lin_acc = accuracy(linear_Y_pred, Y_test)

In [42]:
print(lin_acc)

0.5951627486437613


In [43]:
from sklearn.metrics import classification_report, confusion_matrix   

In [44]:
print(confusion_matrix(Y_test, linear_Y_pred))  
print(classification_report(Y_test, linear_Y_pred)) 

[[ 909 1101]
 [ 690 1724]]
             precision    recall  f1-score   support

          0       0.57      0.45      0.50      2010
          1       0.61      0.71      0.66      2414

avg / total       0.59      0.60      0.59      4424



Polynomial SVC - degree 2

In [45]:
poly_svc = svm.SVC(kernel='poly', degree=2)

In [46]:
poly_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [47]:
poly_Y_pred = poly_svc.predict(X_test)  

In [48]:
poly_acc = accuracy(poly_Y_pred, Y_test)

In [49]:
print(poly_acc)

0.5940325497287523


In [50]:
print(confusion_matrix(Y_test, poly_Y_pred))  
print(classification_report(Y_test, poly_Y_pred)) 

[[ 689 1321]
 [ 475 1939]]
             precision    recall  f1-score   support

          0       0.59      0.34      0.43      2010
          1       0.59      0.80      0.68      2414

avg / total       0.59      0.59      0.57      4424



Polynomial SVC - degree 5

In [51]:
poly5_svc = svm.SVC(kernel='poly', degree=3)

In [52]:
poly5_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [53]:
poly5_Y_pred = poly5_svc.predict(X_test)  

In [54]:
poly5_acc = accuracy(poly5_Y_pred, Y_test)

In [55]:
print(poly5_acc)

0.5892857142857143


In [56]:
print(confusion_matrix(Y_test, poly5_Y_pred))  
print(classification_report(Y_test, poly5_Y_pred)) 

[[ 728 1282]
 [ 535 1879]]
             precision    recall  f1-score   support

          0       0.58      0.36      0.44      2010
          1       0.59      0.78      0.67      2414

avg / total       0.59      0.59      0.57      4424



Gaussian SVC 

In [57]:
gaussian_svc = svm.SVC(kernel='rbf')

In [58]:
gaussian_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [59]:
gaussian_Y_pred = gaussian_svc.predict(X_test)  

In [60]:
gaussian_acc = accuracy(gaussian_Y_pred, Y_test)

In [61]:
print(gaussian_acc)

0.5910940325497287


In [62]:
print(confusion_matrix(Y_test, poly_Y_pred))  
print(classification_report(Y_test, poly_Y_pred)) 

[[ 689 1321]
 [ 475 1939]]
             precision    recall  f1-score   support

          0       0.59      0.34      0.43      2010
          1       0.59      0.80      0.68      2414

avg / total       0.59      0.59      0.57      4424



Sigmoid SVC

In [63]:
sigmoid_svc = svm.SVC(kernel='sigmoid')

In [64]:
sigmoid_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [65]:
sigmoid_Y_pred = sigmoid_svc.predict(X_test)  

In [66]:
sigmoid_acc = accuracy(sigmoid_Y_pred, Y_test)

In [67]:
print(sigmoid_acc)

0.5397830018083183


In [68]:
print(confusion_matrix(Y_test, sigmoid_Y_pred))  
print(classification_report(Y_test, sigmoid_Y_pred)) 

[[ 986 1024]
 [1012 1402]]
             precision    recall  f1-score   support

          0       0.49      0.49      0.49      2010
          1       0.58      0.58      0.58      2414

avg / total       0.54      0.54      0.54      4424



In [69]:
linear_Y_pred

array([1, 1, 1, ..., 1, 0, 0])

In [70]:
poly_Y_pred

array([1, 1, 1, ..., 1, 1, 0])

In [71]:
poly5_Y_pred

array([1, 1, 1, ..., 1, 1, 0])

In [72]:
gaussian_Y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [73]:
sigmoid_Y_pred

array([0, 1, 1, ..., 0, 1, 0])

In [74]:
Y_tmp = np.concatenate( Y_test, axis=0 )
Y_tmp

array([1, 1, 1, ..., 0, 0, 0])

In [75]:
def ensamble(pred1, pred2, pred3, pred4, pred5):
    prediction = []
    for i in range(0, len(pred1)):
        p = (pred1[i] + pred2[i] + pred3[i] + pred4[i] + pred5[i])
        p = p/5
        if(p < 0.5):
            prediction.append(0)
        else:
            prediction.append(1)
    return prediction

In [76]:
ensamble_pred = ensamble(linear_Y_pred, poly_Y_pred, poly5_Y_pred, gaussian_Y_pred, sigmoid_Y_pred)

In [77]:
ensamble_acc = accuracy(ensamble_pred, Y_test)

In [78]:
print(ensamble_acc)

0.5906419529837251
