<img src="nhl_banner.png" width="300">

# Introduction

In our ever changing and evolving field of technology one of the foremost topics is data analytics. The growth in the amount of digital data that is being collected across many different fields is massive and . That is, taking data in whatever raw form it exists and using technology to transform it into information that has value and context. In most cases data analysis is performed in order to provide class descriptions of data, highlight behaviors, trends, associations in the data or predictive information that prove useful or even vital to key decision-makers. 


<img src="map.png" width="800" >

In [None]:
import numpy as np
import pandas as pd
import requests
import json
import matplotlib.pyplot as plt

Data Collection
--
Using the NHL API following the documentation found at https://gitlab.com/dword4/nhlapi/tree/master

For each player in the specified year range (years must be consecutive) collect all avalible stats 

In [None]:
def get_csv_skaters(y1, y2):
    team_rosters = requests.get('https://statsapi.web.nhl.com/api/v1/teams?expand=team.roster&season=' + y1 + y2)
    team_rosters = team_rosters.json()
    players= []
    for i in range(0, len(team_rosters['teams'])):
        for j in range(0, len(team_rosters['teams'][i]['roster']['roster'])):
            player = [team_rosters['teams'][i]['roster']['roster'][j]['person']['id'], 
                      team_rosters['teams'][i]['roster']['roster'][j]['person']['fullName'],
                     team_rosters['teams'][i]['name']]
            if (team_rosters['teams'][i]['roster']['roster'][j]['position']['code'] != 'G'):
                players.append(player)
    players_stats = []
    labels = requests.get('https://statsapi.web.nhl.com/api/v1/people/' 
                           + str(players[i][0]) 
                           + '/stats?stats=statsSingleSeason&season=' + y1 + y2).json()
    labels = labels['stats'][0]['splits'][0]['stat']
    header = ['id', 'fullName', 'teamName']
    for label in labels:
        header.append(label)
    for i in range(0, len(players)): 
        stats = requests.get('https://statsapi.web.nhl.com/api/v1/people/' 
                           + str(players[i][0]) 
                           + '/stats?stats=statsSingleSeason&season=' + y1 + y2).json()
        if(stats['stats'][0]['splits'] == []):
            players_stats.append([0] * len(labels))
            continue
        stats = stats['stats'][0]['splits'][0]['stat']
        stats_array = []
        for label in labels:
            if label in stats:
                stats_array.append(stats[label])
            else:
                stats_array.append(0)
        players_stats.append(stats_array)
        
    skaters = []
    skaters.append(header)
    for i in range(0, len(players)):
        skaters.append(players[i] + players_stats[i])
    return skaters

Returns the skaters data for a year range and saves the result as a csv file.
Years must be in the range [1917, 2019], note that the 2004-2005 season is skipped as this was a lockout year.

In [None]:
def get_skaters_data(start, end):
    for i in range(start, end):
        if(i == 2004):
            continue
        print("Getting skaters data for " + str(i) + "-" + str(i+1) + " season.")
        skaters = get_csv_skaters(str(i), str(i+1))
        np.savetxt('data/skaters_' + str(i) + '_' + str(i+1) + '.csv', skaters, fmt='%s', delimiter=',')

In [None]:
get_skaters_data(1917,1918)

For each team in the specified year range (years must be consecutive) collect all avalible stats 

In [None]:
def get_csv_team(y1, y2):
    teams = requests.get('https://statsapi.web.nhl.com/api/v1/teams?season=' + str(y1) + str(y2))
    teams = teams.json()
    team_id_name = []
    for i in range(0, len(teams['teams'])):
        team_arr = [teams['teams'][i]['id'], teams['teams'][i]['name']]
        team_id_name.append(team_arr)

    labels = requests.get('https://statsapi.web.nhl.com/api/v1/teams/' 
                           + str(team_id_name[0][0])
                           + '/stats?stats=statsSingleSeason&season=' + str(y1) + str(y2)).json()
    labels = labels['stats'][0]['splits'][0]['stat']
    
    header = ['id', 'teamName']
    for label in labels:
        header.append(label)
    header.append('PDO')
    team_stats = []
    for i in range(0, len(team_id_name)):
        stats = requests.get('https://statsapi.web.nhl.com/api/v1/teams/' 
                             + str(team_id_name[i][0]) 
                             + '/stats?stats=statsSingleSeason&season=' + str(y1) + str(y2)).json()
        if(stats['stats'][0]['splits'] == []):
            team_stats.append([0] * len(labels))
            continue
        stats = stats['stats'][0]['splits'][0]['stat']
        stats_array = []
        for label in labels:
            if label in stats:
                stats_array.append(stats[label])
            else:
                stats_array.append(0)
        team_stats.append(stats_array)
        stats_array.append((stats['shootingPctg']/100) + stats['savePctg'])
    
    teams_stats_final = []
    teams_stats_final.append(header)
    for i in range(0, len(team_id_name)):
        teams_stats_final.append(team_id_name[i] + team_stats[i]) 
    return teams_stats_final

Gets the team data for a year range and saves the result as a csv file.
Years must be in the range [1917, 2019], note that the 2004-2005 season is skipped as this was a lockout year.

In [None]:
def get_team_data(start, end):
    for i in range(start, end):
        if(i == 2004):
            continue
        print("Getting team data for " + str(i) + "-" + str(i+1) + " season.")
        data = get_csv_team(str(i), str(i+1))
        np.savetxt('team_data/teams_' + str(i) + '_' + str(i+1) + '.csv', data, fmt='%s', delimiter=',')

In [None]:
get_team_data(2000, 2018)

Returns the index of the team stats 

In [None]:
def get_team_stats(id, team_data):
    for i in range(0, len(team_data)):
        if team_data[i][0] == id:
            return i

For each game in the specified year range (years must be consecutive) return the winner, away team ID, home team ID, and the away and home team stats for that season

In [None]:
def get_csv_game(y1, y2):
    team_data = get_csv_team(y1, y2)
    header = team_data[0][3:]
    away_header = []
    home_header = []
    for head in header:
        away_header.append('away_'+head)
        home_header.append('home_'+head)
    games_data = [['winner', 'awayID', 'homeID'] + away_header + home_header]
    games = requests.get('https://statsapi.web.nhl.com/api/v1/schedule?startDate=' 
                         + str(y1) + '-10-01&endDate=' + str(y2) + '-06-30')
    games = games.json()
    for date in games['dates']:
        for game in date['games']:
            away_ID = game['teams']['away']['team']['id']
            home_ID = game['teams']['home']['team']['id']
            if away_ID > 80 or home_ID > 80:
                continue
            
            away_score = game['teams']['away']['score']
            home_score = game['teams']['home']['score']
            winner = 0
            away_stats = team_data[get_team_stats(away_ID, team_data)][3:]
            home_stats = team_data[get_team_stats(home_ID, team_data)][3:]
            if home_score > away_score:
                winner = 1
            games_data.append([winner,
                          away_ID, 
                          home_ID] +
                          away_stats +
                          home_stats)
    return games_data

Gets the game data for a year range and saves the result as a csv file.
Years must be in the range [1917, 2019], note that the 2004-2005 season is skipped as this was a lockout year.

In [None]:
def get_game_data(start, end):
    for i in range(start, end):
        if(i == 2004):
            continue
        print("Getting game data for " + str(i) + "-" + str(i+1) + " season.")
        data = get_csv_game(i, i+1)
        np.savetxt('game_data/game_data_' + str(i) + '_' + str(i+1) + '.csv', data, fmt='%s', delimiter=',')

In [None]:
get_game_data(2000,2018)

Setup master csv and normalized master csv file of all the games data for later use

In [None]:
data_2000_2001 = pd.read_csv('game_data/game_data_2000_2001.csv', header=0)
data_2001_2002 = pd.read_csv('game_data/game_data_2001_2002.csv', header=0)
data_2002_2003 = pd.read_csv('game_data/game_data_2002_2003.csv', header=0)
data_2003_2004 = pd.read_csv('game_data/game_data_2003_2004.csv', header=0)
data_2005_2006 = pd.read_csv('game_data/game_data_2005_2006.csv', header=0)
data_2006_2007 = pd.read_csv('game_data/game_data_2006_2007.csv', header=0)
data_2007_2008 = pd.read_csv('game_data/game_data_2007_2008.csv', header=0)
data_2008_2009 = pd.read_csv('game_data/game_data_2008_2009.csv', header=0)
data_2009_2010 = pd.read_csv('game_data/game_data_2009_2010.csv', header=0)
data_2010_2011 = pd.read_csv('game_data/game_data_2010_2011.csv', header=0)
data_2011_2012 = pd.read_csv('game_data/game_data_2011_2012.csv', header=0)
data_2012_2013 = pd.read_csv('game_data/game_data_2012_2013.csv', header=0)
data_2013_2014 = pd.read_csv('game_data/game_data_2013_2014.csv', header=0)
data_2014_2015 = pd.read_csv('game_data/game_data_2014_2015.csv', header=0)
data_2015_2016 = pd.read_csv('game_data/game_data_2015_2016.csv', header=0)
data_2016_2017 = pd.read_csv('game_data/game_data_2016_2017.csv', header=0)
data_2017_2018 = pd.read_csv('game_data/game_data_2017_2018.csv', header=0)

frames = [data_2000_2001, data_2001_2002, data_2002_2003, data_2003_2004, data_2005_2006, 
          data_2006_2007, data_2007_2008, data_2008_2009, data_2009_2010, data_2010_2011, 
          data_2011_2012, data_2012_2013, data_2013_2014, data_2014_2015, data_2015_2016, 
          data_2016_2017, data_2017_2018]

data = pd.concat(frames)

data = data.drop(['away_wins', 'away_losses', 'away_ot',
       'away_pts', 'away_ptPctg', 'away_powerPlayGoals',
       'away_powerPlayGoalsAgainst', 'away_powerPlayOpportunities','away_shotsPerGame', 'away_shotsAllowed',
       'away_winScoreFirst', 'away_winOppScoreFirst', 'away_winLeadFirstPer',
       'away_winLeadSecondPer', 'away_winOutshootOpp', 'away_winOutshotByOpp',
       'away_faceOffsTaken', 'away_faceOffsWon', 'away_faceOffsLost',
       'away_faceOffWinPercentage',
       'home_wins', 'home_losses', 'home_ot',
       'home_pts', 'home_ptPctg', 'home_powerPlayGoals',
       'home_powerPlayGoalsAgainst', 'home_powerPlayOpportunities','home_shotsPerGame', 'home_shotsAllowed',
       'home_winScoreFirst', 'home_winOppScoreFirst', 'home_winLeadFirstPer',
       'home_winLeadSecondPer', 'home_winOutshootOpp', 'home_winOutshotByOpp',
       'home_faceOffsTaken', 'home_faceOffsWon', 'home_faceOffsLost',
       'home_faceOffWinPercentage'], axis=1)
header = ['winner', 'awayID', 'homeID', 'away_goalsPerGame',
       'away_goalsAgainstPerGame', 'away_evGGARatio',
       'away_powerPlayPercentage', 'away_penaltyKillPercentage',
       'away_shootingPctg', 'away_savePctg', 'away_PDO', 'home_goalsPerGame',
       'home_goalsAgainstPerGame', 'home_evGGARatio',
       'home_powerPlayPercentage', 'home_penaltyKillPercentage',
       'home_shootingPctg', 'home_savePctg', 'home_PDO']
header = ','.join(header)

np.savetxt('master.csv', data, fmt='%s', delimiter=',', header = header)

Data Preprocessing
--

In [None]:
def normalize(data):
    max_data = np.max(data, axis=0)
    min_data = np.min(data, axis=0)
    stats = ['away_wins', 'away_losses', 'away_ot',
             'away_pts', 'away_ptPctg', 'away_goalsPerGame',
             'away_goalsAgainstPerGame', 'away_evGGARatio',
             'away_powerPlayPercentage', 'away_powerPlayGoals',
             'away_powerPlayGoalsAgainst', 'away_powerPlayOpportunities',
             'away_penaltyKillPercentage', 'away_shotsPerGame', 'away_shotsAllowed',
             'away_winScoreFirst', 'away_winOppScoreFirst', 'away_winLeadFirstPer',
             'away_winLeadSecondPer', 'away_winOutshootOpp', 'away_winOutshotByOpp',
             'away_faceOffsTaken', 'away_faceOffsWon', 'away_faceOffsLost',
             'away_faceOffWinPercentage', 'away_shootingPctg', 'away_savePctg', 'away_PDO',
             'home_wins', 'home_losses', 'home_ot', 'home_pts', 'home_ptPctg',
             'home_goalsPerGame', 'home_goalsAgainstPerGame', 'home_evGGARatio',
             'home_powerPlayPercentage', 'home_powerPlayGoals',
             'home_powerPlayGoalsAgainst', 'home_powerPlayOpportunities',
             'home_penaltyKillPercentage', 'home_shotsPerGame', 'home_shotsAllowed',
             'home_winScoreFirst', 'home_winOppScoreFirst', 'home_winLeadFirstPer',
             'home_winLeadSecondPer', 'home_winOutshootOpp', 'home_winOutshotByOpp',
             'home_faceOffsTaken', 'home_faceOffsWon', 'home_faceOffsLost',
             'home_faceOffWinPercentage', 'home_shootingPctg', 'home_savePctg', 'home_PDO']
    for stat in stats:
        data[stat] = (data[stat] - min_data[stat])/(max_data[stat] - min_data[stat])
    return data

In [None]:
data_2000_2001 = pd.read_csv('game_data/game_data_2000_2001.csv', header=0)
data_2001_2002 = pd.read_csv('game_data/game_data_2001_2002.csv', header=0)
data_2002_2003 = pd.read_csv('game_data/game_data_2002_2003.csv', header=0)
data_2003_2004 = pd.read_csv('game_data/game_data_2003_2004.csv', header=0)
data_2005_2006 = pd.read_csv('game_data/game_data_2005_2006.csv', header=0)
data_2006_2007 = pd.read_csv('game_data/game_data_2006_2007.csv', header=0)
data_2007_2008 = pd.read_csv('game_data/game_data_2007_2008.csv', header=0)
data_2008_2009 = pd.read_csv('game_data/game_data_2008_2009.csv', header=0)
data_2009_2010 = pd.read_csv('game_data/game_data_2009_2010.csv', header=0)
data_2010_2011 = pd.read_csv('game_data/game_data_2010_2011.csv', header=0)
data_2011_2012 = pd.read_csv('game_data/game_data_2011_2012.csv', header=0)
data_2012_2013 = pd.read_csv('game_data/game_data_2012_2013.csv', header=0)
data_2013_2014 = pd.read_csv('game_data/game_data_2013_2014.csv', header=0)
data_2014_2015 = pd.read_csv('game_data/game_data_2014_2015.csv', header=0)
data_2015_2016 = pd.read_csv('game_data/game_data_2015_2016.csv', header=0)
data_2016_2017 = pd.read_csv('game_data/game_data_2016_2017.csv', header=0)
data_2017_2018 = pd.read_csv('game_data/game_data_2017_2018.csv', header=0)

data_2000_2001 = normalize(data_2000_2001)
data_2001_2002 = normalize(data_2001_2002)
data_2002_2003 = normalize(data_2002_2003)
data_2003_2004 = normalize(data_2003_2004)
data_2005_2006 = normalize(data_2005_2006)
data_2006_2007 = normalize(data_2006_2007)
data_2007_2008 = normalize(data_2007_2008)
data_2008_2009 = normalize(data_2008_2009)
data_2009_2010 = normalize(data_2009_2010)
data_2010_2011 = normalize(data_2010_2011)
data_2011_2012 = normalize(data_2011_2012)
data_2012_2013 = normalize(data_2012_2013)
data_2013_2014 = normalize(data_2013_2014)
data_2014_2015 = normalize(data_2014_2015)
data_2016_2017 = normalize(data_2016_2017)
data_2017_2018 = normalize(data_2017_2018)

frames = [data_2000_2001, data_2001_2002, data_2002_2003, data_2003_2004, data_2005_2006, 
          data_2006_2007, data_2007_2008, data_2008_2009, data_2009_2010, data_2010_2011, 
          data_2011_2012, data_2012_2013, data_2013_2014, data_2014_2015, data_2015_2016, 
          data_2016_2017, data_2017_2018]

data = pd.concat(frames)

data = data.drop(['away_wins', 'away_losses', 'away_ot',
       'away_pts', 'away_ptPctg', 'away_powerPlayGoals',
       'away_powerPlayGoalsAgainst', 'away_powerPlayOpportunities','away_shotsPerGame', 'away_shotsAllowed',
       'away_winScoreFirst', 'away_winOppScoreFirst', 'away_winLeadFirstPer',
       'away_winLeadSecondPer', 'away_winOutshootOpp', 'away_winOutshotByOpp',
       'away_faceOffsTaken', 'away_faceOffsWon', 'away_faceOffsLost',
       'away_faceOffWinPercentage',
       'home_wins', 'home_losses', 'home_ot',
       'home_pts', 'home_ptPctg', 'home_powerPlayGoals',
       'home_powerPlayGoalsAgainst', 'home_powerPlayOpportunities','home_shotsPerGame', 'home_shotsAllowed',
       'home_winScoreFirst', 'home_winOppScoreFirst', 'home_winLeadFirstPer',
       'home_winLeadSecondPer', 'home_winOutshootOpp', 'home_winOutshotByOpp',
       'home_faceOffsTaken', 'home_faceOffsWon', 'home_faceOffsLost',
       'home_faceOffWinPercentage'], axis=1)
header = ['winner', 'awayID', 'homeID', 'away_goalsPerGame',
       'away_goalsAgainstPerGame', 'away_evGGARatio',
       'away_powerPlayPercentage', 'away_penaltyKillPercentage',
       'away_shootingPctg', 'away_savePctg', 'away_PDO', 'home_goalsPerGame',
       'home_goalsAgainstPerGame', 'home_evGGARatio',
       'home_powerPlayPercentage', 'home_penaltyKillPercentage',
       'home_shootingPctg', 'home_savePctg', 'home_PDO']
header = ','.join(header)

np.savetxt('master_normalized.csv', data, fmt='%s', delimiter=',', header = header)

Anaylsis
--

In [1]:
import pandas as pd
import numpy as np
from sklearn import svm, datasets
from sklearn.metrics import classification_report, confusion_matrix   

In [2]:
def prepare(data):
    X = data.iloc[:,3:].values
    # we insert an all-ones column at index 0
    X = np.insert(X, 0, 1, axis=1)
    # get the first column of the data
    y = data.iloc[:,0:1].values
    return X,y

def split_train_test(X,y,pct=80):
    n = X.shape[0]
    s = round(n * pct / 100)
    
    indices = np.random.permutation(n)
    train_idx, test_idx = indices[:s], indices[s:]
    
    X_train, X_test = X[train_idx,:], X[test_idx,:]
    y_train, y_test = y[train_idx,:], y[test_idx,:]
    
    return X_train, y_train, X_test, y_test

def accuracy(pred, labels):
    count = 0
    for i in range(0, len(pred)):
        if(pred[i] == labels[i]):
            count += 1
    return count/len(pred)

## SVM using various kernals


In [37]:
data = pd.read_csv('master_normalized.csv', header=0)

In [38]:
X,y = prepare(data)

X,Y,X_test,Y_test = split_train_test(X,y,pct=80)
Y = np.concatenate(Y, axis=0 )

In [39]:
data.head()

Unnamed: 0,winner,awayID,homeID,away_goalsPerGame,away_goalsAgainstPerGame,away_evGGARatio,away_powerPlayPercentage,away_penaltyKillPercentage,away_shootingPctg,away_savePctg,away_PDO,home_goalsPerGame,home_goalsAgainstPerGame,home_evGGARatio,home_powerPlayPercentage,home_penaltyKillPercentage,home_shootingPctg,home_savePctg,home_PDO
0,0,21,25,0.803099,0.075019,0.789802,0.932331,0.475248,0.885714,0.7,0.944444,0.574564,0.027842,0.620057,0.75188,0.821782,0.942857,0.733333,1.0
1,0,9,6,0.834087,0.197989,0.734781,0.676692,0.841584,0.828571,0.733333,0.925926,0.46417,0.613302,0.795786,0.511278,0.475248,0.4,0.0,0.240741
2,1,16,7,0.330536,0.584687,0.664542,0.24812,0.594059,0.428571,0.066667,0.296296,0.393802,0.0,0.502732,0.488722,1.0,0.485714,1.0,0.851852
3,1,23,4,0.55907,0.508894,0.7673,0.56391,0.287129,0.457143,0.066667,0.314815,0.566817,0.216551,1.0,0.458647,0.445545,0.457143,0.566667,0.592593
4,0,17,20,0.668819,0.169374,0.849896,0.93985,0.772277,0.371429,0.766667,0.648148,0.227889,0.490333,0.444459,0.398496,0.188119,0.171429,0.266667,0.240741


In [None]:
from sklearn.metrics.pairwise import chi2_kernel
chi2_svc = svm.SVC(kernel='chi2_kernel')

K = chi2_kernel(X, gamma=.5)

In [45]:
chi2_svc.SVC(kernel='precomputed').fit(K, y)

ValueError: 'chi2_kernel' is not in list

In [44]:
chi2_Y_pred = chi2_svc.predict(X_test)

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
chi2_Y_pred

In [None]:
chi2_acc = accuracy(chi2_Y_pred, Y_test)

In [None]:
print(chi2_acc)

In [None]:
print(confusion_matrix(Y_test, chi2_Y_pred))  
print(classification_report(Y_test, chi2_Y_pred)) 

## Linear SVC

In [6]:
linear_svc = svm.SVC(kernel='linear')

In [7]:
linear_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
linear_Y_pred = linear_svc.predict(X_test)

In [9]:
linear_Y_pred

array([0, 1, 1, ..., 1, 0, 1])

In [10]:
lin_acc = accuracy(linear_Y_pred, Y_test)

In [11]:
print(lin_acc)

0.601491862567812


In [12]:
print(confusion_matrix(Y_test, linear_Y_pred))  
print(classification_report(Y_test, linear_Y_pred)) 

[[ 912 1128]
 [ 635 1749]]
             precision    recall  f1-score   support

          0       0.59      0.45      0.51      2040
          1       0.61      0.73      0.66      2384

avg / total       0.60      0.60      0.59      4424



## Polynomial SVC - degree 2

In [19]:
poly_svc = svm.SVC(kernel='poly', degree=2)

In [20]:
poly_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
poly_Y_pred = poly_svc.predict(X_test)  

In [22]:
poly_acc = accuracy(poly_Y_pred, Y_test)

In [23]:
print(poly_acc)

0.594258589511754


In [24]:
print(confusion_matrix(Y_test, poly_Y_pred))  
print(classification_report(Y_test, poly_Y_pred)) 

[[ 637 1403]
 [ 392 1992]]
             precision    recall  f1-score   support

          0       0.62      0.31      0.42      2040
          1       0.59      0.84      0.69      2384

avg / total       0.60      0.59      0.56      4424



## Gaussian SVC 

In [25]:
gaussian_svc = svm.SVC(kernel='rbf')

In [26]:
gaussian_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [27]:
gaussian_Y_pred = gaussian_svc.predict(X_test)  

In [28]:
gaussian_acc = accuracy(gaussian_Y_pred, Y_test)

In [29]:
print(gaussian_acc)

0.6008137432188065


In [30]:
print(confusion_matrix(Y_test, poly_Y_pred))  
print(classification_report(Y_test, poly_Y_pred)) 

[[ 637 1403]
 [ 392 1992]]
             precision    recall  f1-score   support

          0       0.62      0.31      0.42      2040
          1       0.59      0.84      0.69      2384

avg / total       0.60      0.59      0.56      4424



## Sigmoid SVC

In [31]:
sigmoid_svc = svm.SVC(kernel='sigmoid')

In [32]:
sigmoid_svc.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [33]:
sigmoid_Y_pred = sigmoid_svc.predict(X_test)  

In [34]:
sigmoid_acc = accuracy(sigmoid_Y_pred, Y_test)

In [35]:
print(sigmoid_acc)

0.5314195298372514


In [36]:
print(confusion_matrix(Y_test, sigmoid_Y_pred))  
print(classification_report(Y_test, sigmoid_Y_pred)) 

[[ 990 1050]
 [1023 1361]]
             precision    recall  f1-score   support

          0       0.49      0.49      0.49      2040
          1       0.56      0.57      0.57      2384

avg / total       0.53      0.53      0.53      4424



## Ensamble Prediction 

In [None]:
def ensamble(pred1, pred2, pred3, pred4):
    prediction = []
    for i in range(0, len(pred1)):
        p = (pred1[i] + pred2[i] + pred3[i] + pred4[i])
        p = p/4
        if(p < 0.5):
            prediction.append(0)
        else:
            prediction.append(1)
    return prediction

In [None]:
ensamble_pred = ensamble(linear_Y_pred, poly_Y_pred, gaussian_Y_pred, sigmoid_Y_pred)

In [None]:
ensamble_acc = accuracy(ensamble_pred, Y_test)

print(ensamble_acc)

## Logistic regression in TensorFlow

In [None]:
import tensorflow as tf

In [None]:
Y = Y.reshape((Y.shape[0],1))
Y_test = Y_test.reshape((Y_test.shape[0],1))

print("Train dataset shape", X.shape, Y.shape)
print("Test dataset shape", X_test.shape, Y_test.shape)

m   = X.shape[0] 
n_x = X.shape[1]

In [None]:
def accuracy(A, Y):
    P = A>.5      #prediction
    num_agreements = np.sum(P==Y)
    return num_agreements / Y.shape[0]

In [None]:
# Input data.
# Load the training and test data into constants
tf_X = tf.constant(X.astype(np.float32))
tf_Y = tf.constant(Y.astype(np.float32))
tf_X_test = tf.constant(X_test.astype(np.float32))
tf_Y_test = tf.constant(Y_test.astype(np.float32))

# Variables.
# These are the parameters that we are going to be training.
tf_w = tf.Variable(tf.zeros((n_x, 1)))
tf_b = tf.Variable(tf.zeros((1,1)))

# Training computation.
# We multiply the inputs with the weight matrix, and add biases. We compute
# the sigmoid and cross-entropy (it's one operation in TensorFlow, because
# it's very common, and it can be optimized). We take the average of this
# cross-entropy across all training examples: that's our cost.
tf_Z = tf.matmul(tf_X, tf_w) + tf_b
tf_J = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_Y, logits=tf_Z) )

# Optimizer.
# We are going to find the minimum of this loss using gradient descent.
# We pass alpha=0.1 as input parameter.
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(tf_J)

# Predictions for the train and test data.
# These are not part of training, but merely here so that we can report
# accuracy figures as we train.
tf_A = tf.nn.sigmoid(tf_Z)
tf_A_test = tf.nn.sigmoid(tf.matmul(tf_X_test, tf_w) + tf_b)

In [None]:
session = tf.InteractiveSession()

# This is a one-time operation which ensures the parameters get initialized as
# we described in the graph: random weights for the matrix, zeros for the biases. 
tf.global_variables_initializer().run()
print("Initialized")

for iter in range(1000):
    # Run the computations. We tell .run() that we want to run the optimizer,
    # and get the cost value and the training predictions returned as numpy arrays.
    _, J, A = session.run([optimizer, tf_J, tf_A])
    
    print(iter, J)

In [None]:
# Calling .eval() is basically like calling run(), but
# just to get that one numpy array. 
# Note that it recomputes all its computation graph dependencies.
A = tf_A.eval()
A_test = tf_A_test.eval()

print("Accuracy on the train set is ", accuracy(A,Y))
print("Accuracy on the test set is ", accuracy(A_test,Y_test))

In [None]:
tensor_pred = A_test

## Neural network in TensorFlow

In [None]:
# Input data.
n_x = X.shape[1]

num_hidden_nodes = 15

learning_rate = 0.01

C = 1

# Load the training and test data into constants
tf_X = tf.constant(X.astype(np.float32))
tf_Y = tf.constant(Y.astype(np.float32))
tf_X_test = tf.constant(X_test.astype(np.float32))
tf_Y_test = tf.constant(Y_test.astype(np.float32))

# Variables.
# These are the parameters that we are going to be training.
tf_w1 = tf.Variable(tf.truncated_normal((n_x, num_hidden_nodes)))
tf_b1 = tf.Variable(tf.zeros((1, num_hidden_nodes)))
tf_w2 = tf.Variable(tf.truncated_normal([num_hidden_nodes, C]))
tf_b2 = tf.Variable(tf.zeros((1, C)))



tf_Z1 = tf.matmul(tf_X, tf_w1) + tf_b1
tf_A1 = tf.nn.relu(tf_Z1)    #tf.nn.relu(tf_Z1)
tf_Z2 = tf.matmul(tf_A1, tf_w2) + tf_b2
tf_A2 = tf.nn.relu(tf_Z2)

# Training computation.
# We multiply the inputs with the weight matrix, and add biases. We compute
# the sigmoid and cross-entropy (it's one operation in TensorFlow, because
# it's very common, and it can be optimized). We take the average of this
# cross-entropy across all training examples: that's our cost.
tf_J = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_Y, logits=tf_Z2) )

# Optimizer.
# optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(tf_J)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(tf_J)


# Predictions for the test data.
tf_Z1_test = tf.matmul(tf_X_test, tf_w1) + tf_b1
tf_A1_test = tf.nn.relu(tf_Z1_test)
tf_Z2_test = tf.matmul(tf_A1_test, tf_w2) + tf_b2
tf_A2_test = tf.nn.relu(tf_Z2_test)

In [None]:
session = tf.InteractiveSession()

# This is a one-time operation which ensures the parameters get initialized as
# we described in the graph: random weights for the matrix, zeros for the biases. 
tf.global_variables_initializer().run()
print("Initialized")


# Replace None with your code.

for iter in range(800):
    # Run the computations. We tell .run() that we want to run the optimizer,
    # and get the cost value and the training predictions returned as numpy arrays.
    # Print out the iteration number and cost every 50 iterations.
    _, J, A = session.run([optimizer, tf_J, tf_A2])
    
    if iter%50 ==0:
        print(iter, J)

In [None]:
# Print out the accuracy for the training set and test set.
A = tf_A2.eval()
A_test = tf_A2_test.eval()

print("Accuracy on the train set is ", accuracy(A,Y))
print("Accuracy on the test set is ", accuracy(A_test,Y_test))
# Put your code here.

# Call .eval() on tf_A2 and tf_A2_test.

In [None]:
def convert_pred(pred):
    for i in range(0, len(pred)):
        if(pred[i]<0.5):
            pred[i] = 0
        else:
            pred[i] = 1
    return pred

In [None]:
nn_pred = A_test

## Stochastic Gradient Descent

In [None]:
# Input data.
# Let's use placeholders for the training data. 
# This is so that we can suply batches of tranining examples each iteration.
tf_X = tf.placeholder(tf.float32)
tf_Y = tf.placeholder(tf.float32)

tf_X_test = tf.constant(X_test.astype(np.float32))
tf_Y_test = tf.constant(Y_test.astype(np.float32))

# Variables.
# These are the parameters that we are going to be training.
tf_w = tf.Variable( tf.zeros((n_x, 1)) )
tf_b = tf.Variable(tf.zeros((1,1)))

# Training computation.
# We multiply the inputs with the weight matrix, and add biases. We compute
# the sigmoid and cross-entropy (it's one operation in TensorFlow, because
# it's very common, and it can be optimized). We take the average of this
# cross-entropy across all training examples: that's our cost.
tf_Z = tf.matmul(tf_X, tf_w) + tf_b
tf_J = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_Y, logits=tf_Z) )

# Optimizer.
# We are going to find the minimum of this loss using gradient descent.
# We pass alpha=0.1 as input parameter.
#optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(tf_J)
learning_rate = 0.01
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(tf_J)


# Predictions for the train and test data.
# These are not part of training, but merely here so that we can report
# accuracy figures as we train.
tf_A = tf.nn.sigmoid(tf_Z)
tf_A_test = tf.nn.sigmoid(tf.matmul(tf_X_test, tf_w) + tf_b)

In [None]:
num_steps = 3001
batch_size = 100

session = tf.InteractiveSession()
tf.global_variables_initializer().run()
print("Initialized")

for step in range(num_steps):
    # Pick an offset within the training data.
    offset = (step * batch_size) % (X.shape[0] - batch_size)
    
    # Generate a minibatch.
    X_batch = X[offset:(offset + batch_size), :]
    Y_batch = Y[offset:(offset + batch_size), :]
    
    _, J, A = session.run([optimizer, tf_J, tf_A], feed_dict={tf_X : X_batch, tf_Y : Y_batch})
    
    if (step % 500 == 0):
        print("Minibatch loss at step ", (step, J))
        print("Minibatch accuracy: ", accuracy(A, Y_batch))
        A_test = tf_A_test.eval()
        print("Test accuracy: ", accuracy(A_test,Y_test))

In [None]:
sgd_pred = A_test

## New Ensamble with all models

In [None]:
def ensamble(pred1, pred2, pred3, pred4, pred5, pred6, pred7):
    prediction = []
    for i in range(0, len(pred1)):
        p = (pred1[i] + pred2[i] + pred3[i] + pred4[i] + pred5[i] + pred6[i] + pred7[i])
        p = p/7
        if(p < 0.5):
            prediction.append(0)
        else:
            prediction.append(1)
    return prediction

In [None]:
print(linear_Y_pred, poly_Y_pred, gaussian_Y_pred, sigmoid_Y_pred, convert_pred(tensor_pred), convert_pred(nn_pred), convert_pred(sgd_pred))

In [None]:
def accuracy(pred, labels):
    count = 0
    for i in range(0, len(pred)):
        if(pred[i] == labels[i]):
            count += 1
    return count/len(pred)

In [None]:
ensamble_pred = ensamble(linear_Y_pred, poly_Y_pred, gaussian_Y_pred, sigmoid_Y_pred, 
                         convert_pred(tensor_pred), convert_pred(nn_pred), convert_pred(sgd_pred))

ensamble_acc = accuracy(ensamble_pred, Y_test)

print(ensamble_acc)