In [8]:
import pandas as pd
import numpy as np

In [9]:
def normalize(data):
    max_data = np.max(data, axis=0)
    min_data = np.min(data, axis=0)
    stats = ['away_wins', 'away_losses', 'away_ot',
             'away_pts', 'away_ptPctg', 'away_goalsPerGame',
             'away_goalsAgainstPerGame', 'away_evGGARatio',
             'away_powerPlayPercentage', 'away_powerPlayGoals',
             'away_powerPlayGoalsAgainst', 'away_powerPlayOpportunities',
             'away_penaltyKillPercentage', 'away_shotsPerGame', 'away_shotsAllowed',
             'away_winScoreFirst', 'away_winOppScoreFirst', 'away_winLeadFirstPer',
             'away_winLeadSecondPer', 'away_winOutshootOpp', 'away_winOutshotByOpp',
             'away_faceOffsTaken', 'away_faceOffsWon', 'away_faceOffsLost',
             'away_faceOffWinPercentage', 'away_shootingPctg', 'away_savePctg',
             'home_wins', 'home_losses', 'home_ot', 'home_pts', 'home_ptPctg',
             'home_goalsPerGame', 'home_goalsAgainstPerGame', 'home_evGGARatio',
             'home_powerPlayPercentage', 'home_powerPlayGoals',
             'home_powerPlayGoalsAgainst', 'home_powerPlayOpportunities',
             'home_penaltyKillPercentage', 'home_shotsPerGame', 'home_shotsAllowed',
             'home_winScoreFirst', 'home_winOppScoreFirst', 'home_winLeadFirstPer',
             'home_winLeadSecondPer', 'home_winOutshootOpp', 'home_winOutshotByOpp',
             'home_faceOffsTaken', 'home_faceOffsWon', 'home_faceOffsLost',
             'home_faceOffWinPercentage', 'home_shootingPctg', 'home_savePctg']
    for stat in stats:
        data[stat] = (data[stat] - min_data[stat])/(max_data[stat] - min_data[stat])
    return data

In [10]:
def prepare(data):
    X = data.iloc[:,3:].values
    # we insert an all-ones column at index 0
    X = np.insert(X, 0, 1, axis=1)
    # get the first column of the data
    y = data.iloc[:,0:1].values
    return X,y

In [11]:
def split_train_test(X,y,pct=80):
    n = X.shape[0]
    s = round(n * pct / 100)
    
    indices = np.random.permutation(n)
    train_idx, test_idx = indices[:s], indices[s:]
    
    X_train, X_test = X[train_idx,:], X[test_idx,:]
    y_train, y_test = y[train_idx,:], y[test_idx,:]
    
    return X_train, y_train, X_test, y_test

In [17]:
data_2000_2001 = pd.read_csv('game_data/game_data_2000_2001.csv', header=0)
data_2001_2002 = pd.read_csv('game_data/game_data_2001_2002.csv', header=0)
data_2002_2003 = pd.read_csv('game_data/game_data_2002_2003.csv', header=0)
data_2003_2004 = pd.read_csv('game_data/game_data_2003_2004.csv', header=0)
data_2005_2006 = pd.read_csv('game_data/game_data_2005_2006.csv', header=0)
data_2006_2007 = pd.read_csv('game_data/game_data_2006_2007.csv', header=0)
data_2007_2008 = pd.read_csv('game_data/game_data_2007_2008.csv', header=0)
data_2008_2009 = pd.read_csv('game_data/game_data_2008_2009.csv', header=0)
data_2009_2010 = pd.read_csv('game_data/game_data_2009_2010.csv', header=0)
data_2010_2011 = pd.read_csv('game_data/game_data_2010_2011.csv', header=0)
data_2011_2012 = pd.read_csv('game_data/game_data_2011_2012.csv', header=0)
data_2012_2013 = pd.read_csv('game_data/game_data_2012_2013.csv', header=0)
data_2013_2014 = pd.read_csv('game_data/game_data_2013_2014.csv', header=0)
data_2014_2015 = pd.read_csv('game_data/game_data_2014_2015.csv', header=0)
data_2015_2016 = pd.read_csv('game_data/game_data_2015_2016.csv', header=0)
data_2016_2017 = pd.read_csv('game_data/game_data_2016_2017.csv', header=0)
data_2017_2018 = pd.read_csv('game_data/game_data_2017_2018.csv', header=0)

#each one of these data sets needs to be normalized 
data_2000_2001 = normalize(data_2000_2001)
data_2001_2002 = normalize(data_2001_2002)
data_2002_2003 = normalize(data_2002_2003)
data_2003_2004 = normalize(data_2003_2004)
data_2005_2006 = normalize(data_2005_2006)
data_2006_2007 = normalize(data_2006_2007)
data_2007_2008 = normalize(data_2007_2008)
data_2008_2009 = normalize(data_2008_2009)
data_2009_2010 = normalize(data_2009_2010)
data_2010_2011 = normalize(data_2010_2011)
data_2011_2012 = normalize(data_2011_2012)
data_2012_2013 = normalize(data_2012_2013)
data_2013_2014 = normalize(data_2013_2014)
data_2014_2015 = normalize(data_2014_2015)
data_2016_2017 = normalize(data_2016_2017)
data_2017_2018 = normalize(data_2017_2018)


frames = [data_2000_2001, data_2001_2002, data_2002_2003, data_2003_2004, data_2005_2006, 
          data_2006_2007, data_2007_2008, data_2008_2009, data_2009_2010, data_2010_2011, 
          data_2011_2012, data_2012_2013, data_2013_2014, data_2014_2015, data_2015_2016, 
          data_2016_2017, data_2017_2018]
data = pd.concat(frames)

data = data.drop(['awayID', 'homeID', 'away_ot',
       'away_pts', 'away_goalsPerGame',
       'away_goalsAgainstPerGame', 'away_evGGARatio',
       'away_powerPlayGoals', 'away_powerPlayGoalsAgainst', 'away_powerPlayOpportunities',
       'away_winScoreFirst', 'away_winOppScoreFirst', 'away_winLeadFirstPer',
       'away_winLeadSecondPer', 'away_winOutshootOpp', 'away_winOutshotByOpp',
       'away_faceOffsTaken', 'away_faceOffsWon', 'away_faceOffsLost',            
       'home_ot', 'home_pts', 
       'home_goalsPerGame', 'home_goalsAgainstPerGame', 'home_evGGARatio',
       'home_powerPlayGoals',
       'home_powerPlayGoalsAgainst', 'home_powerPlayOpportunities',
       'home_winScoreFirst', 'home_winOppScoreFirst', 'home_winLeadFirstPer',
       'home_winLeadSecondPer', 'home_winOutshootOpp', 'home_winOutshotByOpp',
       'home_faceOffsTaken', 'home_faceOffsWon', 'home_faceOffsLost'], axis=1)

X,y = prepare(data)

X,Y,X_test,Y_test = split_train_test(X,y,pct=80)
Y = np.concatenate(Y, axis=0 )

In [18]:
data.head()

Unnamed: 0,winner,away_wins,away_losses,away_ptPctg,away_powerPlayPercentage,away_penaltyKillPercentage,away_shotsPerGame,away_shotsAllowed,away_faceOffWinPercentage,away_shootingPctg,...,home_wins,home_losses,home_ptPctg,home_powerPlayPercentage,home_penaltyKillPercentage,home_shotsPerGame,home_shotsAllowed,home_faceOffWinPercentage,home_shootingPctg,home_savePctg
0,0,1.0,0.0,1.0,0.932331,0.475248,0.599051,0.174518,0.571429,0.885714,...,0.870968,0.228571,0.816377,0.75188,0.821782,0.135223,0.152349,0.56044,0.942857,0.733333
1,0,0.870968,0.142857,0.863524,0.676692,0.841584,0.726409,0.418283,0.263736,0.828571,...,0.483871,0.4,0.545906,0.511278,0.475248,0.54245,0.357339,0.340659,0.4,0.0
2,1,0.258065,0.685714,0.287841,0.24812,0.594059,0.229561,0.355954,0.417582,0.428571,...,0.806452,0.4,0.69727,0.488722,1.0,0.298733,0.362882,0.428571,0.485714,1.0
3,1,0.483871,0.342857,0.575682,0.56391,0.287129,0.660371,0.261766,0.087912,0.457143,...,0.709677,0.257143,0.727047,0.458647,0.445545,0.688671,0.292238,0.461538,0.457143,0.566667
4,0,0.903226,0.114286,0.8933,0.93985,0.772277,1.0,0.398896,0.912088,0.371429,...,0.193548,0.571429,0.317618,0.398496,0.188119,0.339617,0.403053,0.604396,0.171429,0.266667


In [19]:
from sklearn import svm, datasets

Linear SVC

In [20]:
linear_svc = svm.SVC(kernel='linear')

In [21]:
linear_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [22]:
linear_Y_pred = linear_svc.predict(X_test)

In [23]:
linear_Y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [24]:
def accuracy(pred, labels):
    count = 0
    for i in range(0, len(pred)):
        if(pred[i] == labels[i]):
            count += 1
    return count/len(pred)

In [25]:
lin_acc = accuracy(linear_Y_pred, Y_test)

In [26]:
print(lin_acc)

0.590867992766727


In [27]:
from sklearn.metrics import classification_report, confusion_matrix   

In [28]:
print(confusion_matrix(Y_test, linear_Y_pred))  
print(classification_report(Y_test, linear_Y_pred)) 

[[ 875 1158]
 [ 652 1739]]
             precision    recall  f1-score   support

          0       0.57      0.43      0.49      2033
          1       0.60      0.73      0.66      2391

avg / total       0.59      0.59      0.58      4424



Polynomial SVC - degree 2

In [29]:
poly_svc = svm.SVC(kernel='poly', degree=2)

In [30]:
poly_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [31]:
poly_Y_pred = poly_svc.predict(X_test)  

In [32]:
poly_acc = accuracy(poly_Y_pred, Y_test)

In [33]:
print(poly_acc)

0.5922242314647378


In [34]:
print(confusion_matrix(Y_test, poly_Y_pred))  
print(classification_report(Y_test, poly_Y_pred)) 

[[ 828 1205]
 [ 599 1792]]
             precision    recall  f1-score   support

          0       0.58      0.41      0.48      2033
          1       0.60      0.75      0.67      2391

avg / total       0.59      0.59      0.58      4424



Polynomial SVC - degree 5

In [47]:
poly5_svc = svm.SVC(kernel='poly', degree=3)

In [48]:
poly5_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [49]:
poly5_Y_pred = poly5_svc.predict(X_test)  

In [50]:
poly5_acc = accuracy(poly5_Y_pred, Y_test)

In [51]:
print(poly5_acc)

0.5782097649186256


In [52]:
print(confusion_matrix(Y_test, poly5_Y_pred))  
print(classification_report(Y_test, poly5_Y_pred)) 

[[ 962 1071]
 [ 795 1596]]
             precision    recall  f1-score   support

          0       0.55      0.47      0.51      2033
          1       0.60      0.67      0.63      2391

avg / total       0.58      0.58      0.57      4424



Gaussian SVC 

In [35]:
gaussian_svc = svm.SVC(kernel='rbf')

In [36]:
gaussian_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
gaussian_Y_pred = gaussian_svc.predict(X_test)  

In [38]:
gaussian_acc = accuracy(gaussian_Y_pred, Y_test)

In [39]:
print(gaussian_acc)

0.6017179023508138


In [40]:
print(confusion_matrix(Y_test, poly_Y_pred))  
print(classification_report(Y_test, poly_Y_pred)) 

[[ 828 1205]
 [ 599 1792]]
             precision    recall  f1-score   support

          0       0.58      0.41      0.48      2033
          1       0.60      0.75      0.67      2391

avg / total       0.59      0.59      0.58      4424



Sigmoid SVC

In [41]:
sigmoid_svc = svm.SVC(kernel='sigmoid')

In [42]:
sigmoid_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [43]:
sigmoid_Y_pred = sigmoid_svc.predict(X_test)  

In [44]:
sigmoid_acc = accuracy(sigmoid_Y_pred, Y_test)

In [45]:
print(sigmoid_acc)

0.5384267631103075


In [46]:
print(confusion_matrix(Y_test, sigmoid_Y_pred))  
print(classification_report(Y_test, sigmoid_Y_pred)) 

[[1018 1015]
 [1027 1364]]
             precision    recall  f1-score   support

          0       0.50      0.50      0.50      2033
          1       0.57      0.57      0.57      2391

avg / total       0.54      0.54      0.54      4424



In [53]:
linear_Y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [54]:
poly_Y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [55]:
poly5_Y_pred

array([0, 1, 0, ..., 1, 1, 1])

In [56]:
gaussian_Y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [57]:
sigmoid_Y_pred

array([1, 0, 1, ..., 1, 1, 0])

In [64]:
Y_tmp = np.concatenate( Y_test, axis=0 )
Y_tmp

array([0, 0, 1, ..., 0, 1, 0])

In [58]:
def ensamble(pred1, pred2, pred3, pred4, pred5):
    prediction = []
    for i in range(0, len(pred1)):
        p = (pred1[i] + pred2[i] + pred3[i] + pred4[i] + pred5[i])
        p = p/5
        if(p < 0.5):
            prediction.append(0)
        else:
            prediction.append(1)
    return prediction

In [59]:
ensamble_pred = ensamble(linear_Y_pred, poly_Y_pred, poly5_Y_pred, gaussian_Y_pred, sigmoid_Y_pred)

In [60]:
ensamble_acc = accuracy(ensamble_pred, Y_test)

In [61]:
print(ensamble_acc)

0.5992314647377939
