Logistic regression in TensorFlow
--

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
def normalize(data):
    max_data = np.max(data, axis=0)
    min_data = np.min(data, axis=0)
    stats = ['away_wins', 'away_losses', 'away_ot',
             'away_pts', 'away_ptPctg', 'away_goalsPerGame',
             'away_goalsAgainstPerGame', 'away_evGGARatio',
             'away_powerPlayPercentage', 'away_powerPlayGoals',
             'away_powerPlayGoalsAgainst', 'away_powerPlayOpportunities',
             'away_penaltyKillPercentage', 'away_shotsPerGame', 'away_shotsAllowed',
             'away_winScoreFirst', 'away_winOppScoreFirst', 'away_winLeadFirstPer',
             'away_winLeadSecondPer', 'away_winOutshootOpp', 'away_winOutshotByOpp',
             'away_faceOffsTaken', 'away_faceOffsWon', 'away_faceOffsLost',
             'away_faceOffWinPercentage', 'away_shootingPctg', 'away_savePctg',
             'home_wins', 'home_losses', 'home_ot', 'home_pts', 'home_ptPctg',
             'home_goalsPerGame', 'home_goalsAgainstPerGame', 'home_evGGARatio',
             'home_powerPlayPercentage', 'home_powerPlayGoals',
             'home_powerPlayGoalsAgainst', 'home_powerPlayOpportunities',
             'home_penaltyKillPercentage', 'home_shotsPerGame', 'home_shotsAllowed',
             'home_winScoreFirst', 'home_winOppScoreFirst', 'home_winLeadFirstPer',
             'home_winLeadSecondPer', 'home_winOutshootOpp', 'home_winOutshotByOpp',
             'home_faceOffsTaken', 'home_faceOffsWon', 'home_faceOffsLost',
             'home_faceOffWinPercentage', 'home_shootingPctg', 'home_savePctg']
    for stat in stats:
        data[stat] = (data[stat] - min_data[stat])/(max_data[stat] - min_data[stat])
    return data

In [3]:
def prepare(data):
    X = data.iloc[:,3:].values
    # we insert an all-ones column at index 0
    X = np.insert(X, 0, 1, axis=1)
    # get the first column of the data
    y = data.iloc[:,0:1].values
    return X,y

In [4]:
def split_train_test(X,y,pct=80):
    n = X.shape[0]
    s = round(n * pct / 100)
    
    indices = np.random.permutation(n)
    train_idx, test_idx = indices[:s], indices[s:]
    
    X_train, X_test = X[train_idx,:], X[test_idx,:]
    y_train, y_test = y[train_idx,:], y[test_idx,:]
    
    return X_train, y_train, X_test, y_test

In [5]:
data_2000_2001 = pd.read_csv('game_data/game_data_2000_2001.csv', header=0)
data_2001_2002 = pd.read_csv('game_data/game_data_2001_2002.csv', header=0)
data_2002_2003 = pd.read_csv('game_data/game_data_2002_2003.csv', header=0)
data_2003_2004 = pd.read_csv('game_data/game_data_2003_2004.csv', header=0)
data_2005_2006 = pd.read_csv('game_data/game_data_2005_2006.csv', header=0)
data_2006_2007 = pd.read_csv('game_data/game_data_2006_2007.csv', header=0)
data_2007_2008 = pd.read_csv('game_data/game_data_2007_2008.csv', header=0)
data_2008_2009 = pd.read_csv('game_data/game_data_2008_2009.csv', header=0)
data_2009_2010 = pd.read_csv('game_data/game_data_2009_2010.csv', header=0)
data_2010_2011 = pd.read_csv('game_data/game_data_2010_2011.csv', header=0)
data_2011_2012 = pd.read_csv('game_data/game_data_2011_2012.csv', header=0)
data_2012_2013 = pd.read_csv('game_data/game_data_2012_2013.csv', header=0)
data_2013_2014 = pd.read_csv('game_data/game_data_2013_2014.csv', header=0)
data_2014_2015 = pd.read_csv('game_data/game_data_2014_2015.csv', header=0)
data_2015_2016 = pd.read_csv('game_data/game_data_2015_2016.csv', header=0)
data_2016_2017 = pd.read_csv('game_data/game_data_2016_2017.csv', header=0)
data_2017_2018 = pd.read_csv('game_data/game_data_2017_2018.csv', header=0)

#each one of these data sets needs to be normalized 
data_2000_2001 = normalize(data_2000_2001)
data_2001_2002 = normalize(data_2001_2002)
data_2002_2003 = normalize(data_2002_2003)
data_2003_2004 = normalize(data_2003_2004)
data_2005_2006 = normalize(data_2005_2006)
data_2006_2007 = normalize(data_2006_2007)
data_2007_2008 = normalize(data_2007_2008)
data_2008_2009 = normalize(data_2008_2009)
data_2009_2010 = normalize(data_2009_2010)
data_2010_2011 = normalize(data_2010_2011)
data_2011_2012 = normalize(data_2011_2012)
data_2012_2013 = normalize(data_2012_2013)
data_2013_2014 = normalize(data_2013_2014)
data_2014_2015 = normalize(data_2014_2015)
data_2016_2017 = normalize(data_2016_2017)
data_2017_2018 = normalize(data_2017_2018)


frames = [data_2000_2001, data_2001_2002, data_2002_2003, data_2003_2004, data_2005_2006, 
          data_2006_2007, data_2007_2008, data_2008_2009, data_2009_2010, data_2010_2011, 
          data_2011_2012, data_2012_2013, data_2013_2014, data_2014_2015, data_2015_2016, 
          data_2016_2017, data_2017_2018]
data = pd.concat(frames)

X,y = prepare(data)

X,Y,X_test,Y_test = split_train_test(X,y,pct=80)

In [6]:
# We will reshape the Y arrays so that they are not rank 1 arrays but rank 2 arrays. 
# They should be rank 2 arrays.

Y = Y.reshape((Y.shape[0],1))
Y_test = Y_test.reshape((Y_test.shape[0],1))

print("Train dataset shape", X.shape, Y.shape)
print("Test dataset shape", X_test.shape, Y_test.shape)

print("Y =", Y)

m   = X.shape[0] 
n_x = X.shape[1]

Train dataset shape (17698, 55) (17698, 1)
Test dataset shape (4424, 55) (4424, 1)
Y = [[1]
 [0]
 [1]
 ..., 
 [1]
 [0]
 [1]]


In [7]:
def accuracy(A, Y):
    P = A>.5      #prediction
    num_agreements = np.sum(P==Y)
    return num_agreements / Y.shape[0]

In [8]:
# Input data.
# Load the training and test data into constants
tf_X = tf.constant(X.astype(np.float32))
tf_Y = tf.constant(Y.astype(np.float32))
tf_X_test = tf.constant(X_test.astype(np.float32))
tf_Y_test = tf.constant(Y_test.astype(np.float32))

# Variables.
# These are the parameters that we are going to be training.
tf_w = tf.Variable(tf.zeros((n_x, 1)))
tf_b = tf.Variable(tf.zeros((1,1)))

# Training computation.
# We multiply the inputs with the weight matrix, and add biases. We compute
# the sigmoid and cross-entropy (it's one operation in TensorFlow, because
# it's very common, and it can be optimized). We take the average of this
# cross-entropy across all training examples: that's our cost.
tf_Z = tf.matmul(tf_X, tf_w) + tf_b
tf_J = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_Y, logits=tf_Z) )

# Optimizer.
# We are going to find the minimum of this loss using gradient descent.
# We pass alpha=0.1 as input parameter.
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(tf_J)

# Predictions for the train and test data.
# These are not part of training, but merely here so that we can report
# accuracy figures as we train.
tf_A = tf.nn.sigmoid(tf_Z)
tf_A_test = tf.nn.sigmoid(tf.matmul(tf_X_test, tf_w) + tf_b)

In [9]:
session = tf.InteractiveSession()

# This is a one-time operation which ensures the parameters get initialized as
# we described in the graph: random weights for the matrix, zeros for the biases. 
tf.global_variables_initializer().run()
print("Initialized")

for iter in range(1000):
    # Run the computations. We tell .run() that we want to run the optimizer,
    # and get the cost value and the training predictions returned as numpy arrays.
    _, J, A = session.run([optimizer, tf_J, tf_A])
    
    print(iter, J)

Initialized
0 0.693131
1 396.354
2 6038.19
3 1191.65
4 5135.42
5 1984.2
6 4232.74
7 2776.69
8 3330.06
9 3569.17
10 2427.39
11 4361.64
12 1524.72
13 5154.05
14 622.19
15 5944.59
16 244.399
17 6208.73
18 1041.77
19 5305.93
20 1834.35
21 4403.24
22 2626.84
23 3500.57
24 3419.31
25 2597.88
26 4211.79
27 1695.22
28 5004.21
29 792.653
30 5795.32
31 95.2466
32 6375.49
33 895.198
34 5472.64
35 1687.82
36 4569.96
37 2480.31
38 3667.28
39 3272.78
40 2764.61
41 4065.26
42 1861.94
43 4857.68
44 959.371
45 5648.97
46 59.0815
47 6422.54
48 722.178
49 5668.88
50 1515.39
51 4766.18
52 2307.9
53 3863.49
54 3100.38
55 2960.82
56 3892.86
57 2058.14
58 4685.32
59 1155.5
60 5477.46
61 253.577
62 6261.34
63 561.042
64 5850.75
65 1355.58
66 4947.98
67 2148.14
68 4045.29
69 2940.62
70 3142.61
71 3733.1
72 2239.94
73 4525.57
74 1337.28
75 5317.86
76 435.008
77 6105.74
78 405.545
79 6025.22
80 1202.25
81 5122.38
82 1994.87
83 4219.68
84 2787.36
85 3317.01
86 3579.84
87 2414.33
88 4372.31
89 1511.67
90 5164.64
9

767 938.484
768 5654.57
769 54.2569
770 4599.98
771 1247.68
772 5385.59
773 354.131
774 6165.83
775 467.701
776 5942.91
777 1270.2
778 5034.0
779 2067.48
780 4128.81
781 2861.68
782 3225.72
783 3653.78
784 2324.3
785 4443.66
786 1425.42
787 5230.13
788 530.799
789 6011.57
790 313.772
791 6117.2
792 1117.25
793 5207.43
794 1915.2
795 4301.77
796 2709.79
797 3398.41
798 3502.18
799 2496.76
800 4292.37
801 1597.5
802 5079.38
803 702.214
804 5861.61
805 164.08
806 6286.94
807 968.252
808 5376.51
809 1766.71
810 4470.46
811 2561.61
812 3566.89
813 3354.21
814 2665.1
815 4144.61
816 1765.62
817 4931.95
818 869.907
819 5714.72
820 37.5308
821 1523.38
822 5144.22
823 628.426
824 5926.12
825 228.537
826 6213.76
827 1032.46
828 5303.49
829 1830.79
830 4397.51
831 2625.62
832 3493.99
833 3418.14
834 2592.28
835 4208.41
836 1692.96
837 4995.56
838 797.499
839 5778.08
840 81.1619
841 6132.53
842 1103.53
843 5223.02
844 1901.24
845 4317.46
846 2695.7
847 3414.24
848 3487.86
849 2512.89
850 4277.67
8

In [10]:
# Calling .eval() is basically like calling run(), but
# just to get that one numpy array. 
# Note that it recomputes all its computation graph dependencies.
A = tf_A.eval()
A_test = tf_A_test.eval()

print("Accuracy on the train set is ", accuracy(A,Y))
print("Accuracy on the test set is ", accuracy(A_test,Y_test))

Accuracy on the train set is  0.57181602441
Accuracy on the test set is  0.577983725136
