# Neural Net Implementation

### Import Statements

In [1]:
import pandas as pd
import pickle
import tensorflow as tf
import numpy as np
from copy import deepcopy

from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler

## Get Data for Neural Net

### Load Previous data and initialize training lists

In [2]:
#Loads the 'fixed' results and match schedule information
with open('fixed_match_dates_and_scores.pkl', 'rb') as f:
    results, match_dates = pickle.load(f)

resultsSGD = deepcopy(results)
    
#Read in halftime match stats
home_dfNN = pd.read_csv('NN/NN_home.csv', index_col=0)
away_dfNN = pd.read_csv('NN/NN_away.csv', index_col=0)

home_dfSGD = pd.read_csv('sgd_home.csv', index_col=0)
away_dfSGD = pd.read_csv('sgd_away.csv', index_col=0)

count, total = 0, 0
X_train, X_test, y_train, y_test, X_trainSGD, X_testSGD, half_time = [], [], [], [], [], [], []

#Set train/test split to 80/20
test_train_limit = match_dates[0]+ int((match_dates[-1]-match_dates[0])*(4/5))

#Loads the SGD Classifier and training/testing data created by SGDClassifier.ipynb
with open('saved_SGD_&_train_test_data2.pkl', 'rb') as fid:
    clf, _, _, _, _, _ = pickle.load(fid)

### Name Fixing Function for Preprocessing for Halftime Data

In [3]:
#Gives a score based on how much temp_name matches each name in namelist
#The score is simply +1 for each consecutive matching letter
#The longest length of consecutive matching letters is the score for each name in namelist
def name_fix(temp_name, namelist):

    #Special cases:
    if(temp_name == 'EspanyolBarcelona'): return 'RCDEspanyol'
    if(temp_name == 'QueensParkRangers'): return 'QPR'
    if(temp_name == 'DeportivoLaCoruna'): return 'Dep.'
    if(temp_name == 'LeicesterCity'): return 'Leicester'
    if(temp_name == 'AtleticoMadrid'): return 'Atl.'
    if(temp_name == 'BorussiaMonchengladbach'): return 'B.'
   
    
    score = []
    for elem in namelist:
        count = 0
        i = 0
        curr_word = ''
        letters = list(temp_name)
        
        #While the score is less than the length of the word in question
        while i < len(letters):
            #Increase the segment you are checking for matches by 1 letter
            curr_word += letters[i]
            
            #If the new segment is in the name from namelist and it is longer than the
            #max segment so far, store this as the maximum
            if((curr_word in elem)): 
                if(count < len(curr_word)): 
                    count = len(curr_word)
                    
            #If the segment is not contained in the name, reset
            elif(len(curr_word) != 1): 
                curr_word = ''
                i-=1
            
            i+=1    

        score.append(count)
        
    #The name with the highest score is returned    
    return namelist[np.argmax(score)]

### Use the Name Fixing function above to clean up the names

In [4]:
#Get list of team names according to the fulltime data
match_id_list_home = list(home_dfNN.index)
match_id_list_away = list(away_dfNN.index)
match_id_list_home = list(set([elem.split('_')[0] for elem in match_id_list_home]))
match_id_list_away = list(set([elem.split('_')[0] for elem in match_id_list_away]))


#The current list is fixed in relation to the fulltime data
#We need to 'fix' it in relation to the NN data
#At the end of this we will have data that does not need to be fixed any more so in the future
#but for the sake of submission I've left in this step (which doesn't do any harm)
print(results[5331])

#For each date where a game was played
for date in match_dates:
    #For each match played that day
    for match in results[date]:
            #Replace the name in the schedule with the equivalent name in the fulltime data
            match[0] = name_fix(match[0], match_id_list_home)
            match[1] = name_fix(match[1], match_id_list_away)

#Save our fixed data
with open('final_fixed_match_dates_and_scores.pkl', 'wb') as fid:
    pickle.dump((results, match_dates), fid)    

#Fixed names to verify visually
print(results[5331])

[['Bastia', 'Marseille', (1, 2), (3, 3)], ['EvianThononGaillard', 'Caen', (0, 3), (0, 3)], ['Guingamp', 'SaintEtienne', (0, 1), (0, 2)], ['Lille', 'Metz', (0, 0), (0, 0)], ['Montpellier', 'Bordeaux', (0, 1), (0, 1)], ['Nantes', 'Lens', (0, 0), (1, 0)], ['Nice', 'Toulouse', (1, 2), (3, 2)]]
[['Bastia', 'Marseille', (1, 2), (3, 3)], ['EvianTG', 'Caen', (0, 3), (0, 3)], ['Guingamp', 'StEtienne', (0, 1), (0, 2)], ['Lille', 'Metz', (0, 0), (0, 0)], ['Montpellier', 'Bordeaux', (0, 1), (0, 1)], ['Nantes', 'Lens', (0, 0), (1, 0)], ['Nice', 'Toulouse', (1, 2), (3, 2)]]


### Reading in from the Dataframes, fill Training and Testing variables with the appropriate data

In [5]:
total = 0

#For each date where a game was played
for date in match_dates:
    #For each match played that day
    for matchNN, matchSGD in zip(results[date], resultsSGD[date]):
        
        #Initialize for each match
        temp_vec = 0
        flag = [False, False]
        home_nameNN = matchNN[0]
        away_nameNN = matchNN[1]
        
        home_nameSGD = matchSGD[0]
        away_nameSGD = matchSGD[1]


        #Only look for data from prior to the match date for the Home team
        #This way we do not use data from the match in our prediction of the match
        temp_date = date - 1

        #Iterate through the previous match days until you find a prior set of fulltime data
        #In this dataframe the fulltime data is a rolling average of the previous fulltime data in the same season
        while(temp_date > match_dates[0]):
            #if there is data for the home team on temp_date day then load it
            try:
                temp_vec = [[float(elem) for elem in (home_dfSGD.loc[home_nameSGD+'_'+str(temp_date), :]).as_matrix()]]
                
                #When we find the data, indicate that we have found it and exit this loop
                flag[0] = True
                break
            #if there is no data, check the day before
            except:
                temp_date -=1
                

        #Reset and Reuse
        temp_date = date -1

        #Identical to above but for the Away Team
        while(temp_date > match_dates[0] and flag[0] == True): 
            try:    
                temp_vec.append([float(elem) for elem in away_dfSGD.loc[away_nameSGD+'_'+str(temp_date), :].as_matrix()])
                flag[1] = True
                break
            except:
                temp_date -=1
                
        #If we have found prior data for both teams
        if(flag[0] and flag[1]):
            #Since match data is chronologically ordered, the earlier matches will be placed in 
            #the train category and the later matches in the test category
            try:
                #Since we found historical fulltime data for the match
                #let us look for halftime data for the match (almost all, but not all matches had halftime data recorded)
                vec = [[float(0) if elem == 'N' or elem == 'n' else float(elem) for elem in (home_dfNN.loc[home_nameNN+'_'+str(date), :]).as_matrix()]]
                vec.append([float(0) if elem == 'N' or elem == 'n' else float(elem) for elem in away_dfNN.loc[away_nameNN+'_'+str(date), :].as_matrix()])
                halftime_score = [float(elem) for elem in matchNN[2]]
                vec[0].append(halftime_score[0])
                vec[1].append(halftime_score[1])
            
                #If lower than the train/test limit add it to the training data
                #otherwise add it to the testing data
                if(date < test_train_limit):
                    X_train.append(vec)
                    X_trainSGD.append(temp_vec)
                    y_train.append(list(matchNN[3]))
                else:
                    X_testSGD.append(temp_vec)
                    y_test.append(list(matchNN[3]))
                    X_test.append(vec)
                    
                    #Beginning creation of the Benchmark
                    half_time.append(list(matchNN[2]))
            except:
                print(home_nameNN+'_'+str(date), away_nameNN+'_'+str(date))
                total +=1
                
#A small number of matches did not have half time data. These matches are listed below.
print("Missing Halftime data for", total, "matches")
print("Data Shape:", len(X_train), len(X_test), len(y_train), len(y_test))

Parma_5415 ACMilan_5415
ACMilan_5423 Verona_5423
ACMilan_5437 ACMilan_5437
ASRoma_5444 ACMilan_5444
ACMilan_5451 Udinese_5451
Chievo_5459 ACMilan_5459
ACMilan_5465 Lazio_5465
ACMilan_5486 Genoa_5486
Empoli_5492 ACMilan_5492
Sassuolo_5507 ACMilan_5507
ACMilan_5514 Palermo_5514
ACMilan_5535 Fiorentina_5535
ACMilan_5549 Cesena_5549
Sampdoria_5556 ACMilan_5556
ACMilan_5569 Parma_5569
Verona_5576 ACMilan_5576
ACMilan_5584 ACMilan_5584
Udinese_5593 ACMilan_5593
ACMilan_5590 ASRoma_5590
ACMilan_5598 Chievo_5598
Lazio_5605 ACMilan_5605
ACMilan_5611 Juventus_5611
ACMilan_5500 Torino_5500
Atalanta_5521 ACMilan_5521
Cagliari_5529 ACMilan_5529
Napoli_5542 ACMilan_5542
Genoa_5618 ACMilan_5618
ACMilan_5412 Sampdoria_5412
Juventus_5481 ACMilan_5481
ACMilan_5626 Empoli_5626
ACMilan_5710 Atalanta_5710
ACMilan_5731 ACMilan_5731
Chievo_5738 ACMilan_5738
ACMilan_5745 Fiorentina_5745
Sampdoria_5752 ACMilan_5752
ACMilan_5766 Juventus_5766
Palermo_5772 ACMilan_5772
ACMilan_5779 ASRoma_5779
Torino_5787 ACMila

### Format all Data for use in the NN and SGD, also create Benchmark model

In [6]:
#Convert the data into numpy arrays
X_train, X_test, X_trainSGD, X_testSGD = np.array(X_train), np.array(X_test), np.array(X_trainSGD), np.array(X_testSGD)
y_train, y_test = np.array(y_train), np.array(y_test)
half_time = np.array(half_time)

print(X_train.shape)
print("Pre-Shape Xtrain:", X_train[0])

#Classifier Replace all NaN Values with 0
#This makes sense because some values (eg: red cards or yellow cards) when scraped were represented with 
#empty space rather than a 0
X_train = np.nan_to_num(X_train.reshape(X_train.shape[0], X_train.shape[1]*X_train.shape[2]))
X_test = np.nan_to_num(X_test.reshape(X_test.shape[0], X_test.shape[1]*X_test.shape[2]))
X_trainSGD = np.nan_to_num(X_trainSGD.reshape(X_trainSGD.shape[0], X_trainSGD.shape[1]*X_trainSGD.shape[2]))
X_testSGD = np.nan_to_num(X_testSGD.reshape(X_testSGD.shape[0], X_testSGD.shape[1]*X_testSGD.shape[2]))

print(X_train.shape)
print("Re-Shape Xtrain:", X_train[0])


#MultilabelBinarizer to convert scores into a binary vector
print("Pre-Binarizing:", y_train[0])
mlb = MultiLabelBinarizer(classes = range(0,15))
y_train = mlb.fit_transform(y_train)
y_test = mlb.fit_transform(y_test)
print("Post-Binarizing:", y_train[0])


(3480, 2, 11)
Pre-Shape Xtrain: [[ 40.   1.   3.   0.   6.   0.   0.  13.   6.   6.   0.]
 [ 60.   6.   6.   0.   4.   3.   0.  21.   1.   6.   0.]]
(3480, 22)
Re-Shape Xtrain: [ 40.   1.   3.   0.   6.   0.   0.  13.   6.   6.   0.  60.   6.   6.   0.
   4.   3.   0.  21.   1.   6.   0.]
Pre-Binarizing: [1 2]
Post-Binarizing: [0 1 1 0 0 0 0 0 0 0 0 0 0 0 0]


In [7]:
#Create the benchmark predictions by multiplying the half-time scores by 2
#The benchmark prediction is that the same result of the first half will occur in the second half
benchmark = [[2*i, 2*ii] for i, ii in half_time]
print("Pre-Fit", benchmark[0:2])
benchmark = mlb.fit_transform(benchmark)

print("Post-fit Benchmark:", benchmark[0:2])


Pre-Fit [[4, 0], [6, 2]]
Post-fit Benchmark: [[1 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 1 0 0 0 0 0 0 0 0]]


### Create the Scoring Function

In [8]:
#Calculates the total goals in the match described by the outputted vector
def total_goals(goals):

    total_goals = sum([elem*i for i, elem in enumerate(goals)])
    return total_goals

#Calculates the score described by the outputted vector
def score_func(pred, actual):
    score = 0
    
    #10 points for exact score
    if(np.array_equal(pred, actual)):
        #print("10 Points:", pred, actual)
        return 10

    #2 points for 1 correct score
    #Subtract 1 array from the other, as they are not identical
    #the result is either 2, 3 or 4 nonzero elements, 2 or 3 indicates that 1 index matched
    if(np.count_nonzero(pred-actual) <= 3):
        return 2

    #5 points for correct total goals
    if(total_goals(pred) == total_goals(actual)):
        return 5

    #print(score, " Points:", pred, actual)
    return score

### Save and Display the Formatted Training/Testing data and Format the SGD training/testing data for last check before running the NN

In [9]:
#Make predictions using the SGD to feed into the NN
X_trainSGD = np.array(X_trainSGD)
X_testSGD = np.array(X_testSGD)
SGD_train = clf.predict(X_trainSGD)         
SGD_test = clf.predict(X_testSGD)

#Display a sample of each of these datas just to visually check everything 
#is going smoothly one last time
print(SGD_train.shape, SGD_test.shape)
print(X_train[0])
print(X_test[0])
print(y_train[0])
print(y_test[0])

#Save the NN training/testing data
with open('NN_Data_Struct.pkl', 'wb') as fid:
    pickle.dump((X_train, X_test, y_train, y_test), fid) 

(3480, 15) (1448, 15)
[ 40.   1.   3.   0.   6.   0.   0.  13.   6.   6.   0.  60.   6.   6.   0.
   4.   3.   0.  21.   1.   6.   0.]
[ 70.   3.   4.   2.   8.   5.   0.   0.   0.   2.   2.  30.   0.   1.   1.
   2.   2.   1.   0.   1.   7.   0.]
[0 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


### Get the Training/Testing data ready for NN

In [10]:
#Reload the complete training/testing data
with open('NN_Data_Struct.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)
 

#Add the predicted ouptut from the SGDClassifier to the 
#Training and testing feature vectors for the Neural Network
X_trainf = [np.append(i, ii) for i, ii in zip(X_train, SGD_train)]
X_testf = [np.append(i, ii) for i, ii in zip(X_test, SGD_test)]
scaler = MinMaxScaler()
scaler.fit(X_trainf)
X_train = scaler.transform(X_trainf)
X_test = scaler.transform(X_testf)


### Set all Nerual Network Parameters and Create the NN

In [11]:
#Set the number of nodes at each layer
nodes1 = 1024
nodes2 = 512
input_nodes = X_train.shape[1]
output_nodes = y_train.shape[1]
X = tf.placeholder("float", [None, input_nodes])
Y = tf.placeholder("float", [None, output_nodes])

# Create weights and biases for each layer
W1 = tf.Variable(tf.random_normal([input_nodes, nodes1]))
W2 = tf.Variable(tf.random_normal([nodes1, nodes2]))
W3 = tf.Variable(tf.random_normal([nodes2, output_nodes]))
B1 = tf.Variable(tf.random_normal([nodes1]))
B2 = tf.Variable(tf.random_normal([nodes2]))
B3 = tf.Variable(tf.random_normal([output_nodes]))

#Create 2 hidden layers and the output layer
def propogate(x):
    layer_1 = tf.add(tf.matmul(x, W1), B1)
    layer_2 = tf.add(tf.matmul(layer_1, W2), B2)
    output_layer = tf.matmul(layer_2, W3) + B3
    return output_layer

logits = propogate(X)

#Compute softmax cross entropy between logits and labels
cross_entropy= tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))

#Adagrad for gradient-based optimization
optimizer = tf.train.AdagradOptimizer(learning_rate=0.0001)

#Compute the gradients and apply them to the variables
optimize = optimizer.minimize(cross_entropy)


### Run Neural Net and Display Results

In [12]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    #For 45 epochs
    for epoch in range(40):
        avgCost = 0
        
        #For each feature/label combo in the training set
        for i in range(len(X_train)):
           
            #Set the current feature/label combo
            x, y = X_train[i: i + 1], y_train[i: i + 1]
            
            #Call the Loss function and AdagradOptimizer for it
            _, cost = sess.run([optimize, cross_entropy], feed_dict={X: x, Y: y})
            
            #Calculate average cost so we can monitor what the NN is up to
            avgCost+= cost / len(X_train)
            
        print("Epoch:", (epoch+1), "| Cost:", avgCost)

    #Find the predicted value for all X_test values by calling sess.run on the softmax of output node
    output = tf.nn.softmax(logits)
    pred = sess.run(output, feed_dict={X: X_test})
    
    #If there is only one 1 in the vector we should double it inorder to indicate both teams scored that amount
    benchmark = [2*elem if np.count_nonzero(elem) == 1 else elem for elem in benchmark]
    y_test = [2*elem if np.count_nonzero(elem) == 1 else elem for elem in y_test]
    
    
    points, temp = [0,0]
    score_dict = dict({0: 0, 2: 0, 5: 0, 10: 0})
    
    
    #Calculate the score for the NN & SGD model
    for i in range(len(pred)):
        p = pred[i]
        t = y_test[i]     
        s = SGD_test[i]

        #One last way of incorporating the SGD predictions was to
        #average them into the neural network output and then round off
        #the answer to the nearest whole number
        #The import thing is not the actual number but the magnitude of it in comparison the rest
        final = np.mean(np.array([p, s]), axis = 0)
        final = [round(elem) for elem in final]
        
        #If we have 3+ nonzero numbers, we need to get down to 1 or 2
        #This problem was discussed in the SGD and seems to be a byproduct of using
        #the average.
        while(np.count_nonzero(final) not in [0, 1, 2]):
             final[np.random.choice(np.nonzero(final)[0])] = 0
        
        #If any of the newly created vectors is all zeroes just use the original
        #Neural network output instead
        for ind in range(len(final)):
            if(np.count_nonzero(final[ind]) == 0):
                final = p
                
        #Only 1 index means both teams scored that amount so we must double it
        final = [2*elem if np.count_nonzero(elem) == 1 else elem for elem in final]
        
        
        
        #Perform the Scoring
        temp = score_func(final, t)
        score_dict[temp] +=1
        points += temp
    
   
    points2, temp2 = [0,0]
    score_dict2 = dict({0: 0, 2: 0, 5: 0, 10: 0})
    
    #Calculate the score for the benchmark model
    for t, p in zip(y_test, benchmark):
           
        #Perform the Scoring
        temp2 = score_func(p, t)
        score_dict2[temp2] +=1
        points2 += temp2

    
    #Display Results
    print("----------------")
    print("Total Score")
    print("----------------")
    print("NN & SGD:  ", points)
    print("Benchmark: ", points2)
    print("----------------")
    print("Stats: ")
    for k in score_dict.keys():
        print("NN & SGD:  Matches where", k, "points awarded: ", score_dict[k])
        print("Benchmark: Matches where", k, "points awarded: ", score_dict2[k], "\n")

Epoch: 1 | Cost: 1636.73171175
Epoch: 2 | Cost: 1123.17022609
Epoch: 3 | Cost: 1084.88874579
Epoch: 4 | Cost: 1070.78461277
Epoch: 5 | Cost: 1062.74074622
Epoch: 6 | Cost: 1057.42403716
Epoch: 7 | Cost: 1053.58618051
Epoch: 8 | Cost: 1050.71861345
Epoch: 9 | Cost: 1048.83935005
Epoch: 10 | Cost: 1047.67709721
Epoch: 11 | Cost: 1046.93288
Epoch: 12 | Cost: 1046.58740744
Epoch: 13 | Cost: 1046.6712303
Epoch: 14 | Cost: 1047.04009024
Epoch: 15 | Cost: 1047.6234768
Epoch: 16 | Cost: 1048.22991215
Epoch: 17 | Cost: 1048.92277569
Epoch: 18 | Cost: 1049.65118354
Epoch: 19 | Cost: 1050.51871088
Epoch: 20 | Cost: 1051.59815769
Epoch: 21 | Cost: 1052.873041
Epoch: 22 | Cost: 1054.1458487
Epoch: 23 | Cost: 1055.37768176
Epoch: 24 | Cost: 1056.73049932
Epoch: 25 | Cost: 1058.082202
Epoch: 26 | Cost: 1059.46961873
Epoch: 27 | Cost: 1060.95680835
Epoch: 28 | Cost: 1062.5410683
Epoch: 29 | Cost: 1064.14960839
Epoch: 30 | Cost: 1065.7932029
Epoch: 31 | Cost: 1067.49799814
Epoch: 32 | Cost: 1069.195628

### Conclusion

The combined NN & SGD model was still unable to surpass the benchmark model. It did however make a huge improvement compared to only the SGDClassifier. The original difference between the benchmark and SGDClassifier was 1515 points, now with the NN & SGD model it is only 765. Described differently the SGDClassifier model had 66.76% of the points of the benchmark, while the final model had 83.20% of the benchmark. The benchmark scores are slightly different between the two comparisons because there were fewer points avaiable in this most recent test as some halftime match data was not available. 

The only category where the NN & SGD model was able to surpass the benchmark was the 2 point category (where only 1 score matched the true result). Unfortunately, the most important categories for predicting the number of goals or the precise score correctly were dominated by the benchmark model. I tried a variety of methods to improve this (detailed in the report), however the main conclusion I reached is that for this problem, doubling the halftime score is an incredibly good benchmark. It was able to predict the precise correct fulltime score in 15.21% of matches. That's an incredibly high rate for such a simple statistic! In fact, 49.42% of the total points collected by the benchmark were earned that way. In comparison, only 29.30% of the NN & SGD model's points were collected through a precise fulltime score prediction. Maybe I will use the doubled halftimescore as part of the model the next time I approach this problem!