# SGD Classifier Implementation

### Import Statements

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
import re

import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
#from sklearn.svm import SVC

### Name Fixing Function for Preprocessing

In [2]:
#Gives a score based on how much temp_name matches each name in namelist
#The score is simply +1 for each consecutive matching letter
#The longest length of consecutive matching letters is the score for each name in namelist
def name_fix(temp_name, namelist):

    #Special case:
    if(temp_name == 'EspanyolBarcelona'): return 'RCDEspanyol'
    if(temp_name == 'ChievoVerona'): return 'Chievo'
    if(temp_name == 'HellasVerona'): return 'Verona'
    
    score = []
    for elem in namelist:
        count = 0
        i = 0
        curr_word = ''
        letters = list(temp_name)
        
        #While the score is less than the length of the word in question
        while i < len(letters):
            #Increase the segment you are checking for matches by 1 letter
            curr_word += letters[i]
            
            #If the new segment is in the name from namelist and it is longer than the
            #max segment so far, store this as the maximum
            if((curr_word in elem)): 
                if(count < len(curr_word)): 
                    count = len(curr_word)
                    
            #If the segment is not contained in the name, reset
            elif(len(curr_word) != 1): 
                curr_word = ''
                i-=1
            
            i+=1    

        score.append(count)
        
    #The name with the highest score is returned    
    return namelist[np.argmax(score)]

### Read in & Initialize all of the required data

In [3]:
home_path = 'sgd_home.csv'
away_path = 'sgd_away.csv'

#Read in fulltime match stats
home_df = pd.read_csv(home_path, index_col=0)
away_df = pd.read_csv(away_path, index_col=0)

#Read in match information
with open('match_dates_and_scores.pickle', 'rb') as f:
    results, match_dates = pickle.load(f)

count, total = 0, 0
x, y, X_train, X_test, y_train, y_test, half_time = [], [], [] ,[], [], [], []

#Set train/test split to 80/20
test_train_limit = match_dates[0]+ int((match_dates[-1]-match_dates[0])*(4/5))

### Use the Name Fixing function above to clean up the names

In [4]:
#Get list of team names according to the fulltime data
match_id_list_home = list(home_df.index)
match_id_list_away = list(away_df.index)
match_id_list_home = list(set([elem.split('_')[0] for elem in match_id_list_home]))
match_id_list_away = list(set([elem.split('_')[0] for elem in match_id_list_away]))

#Un-fixed names to verify visually
print(results[5331])

#For each date where a game was played
for date in match_dates:
    #For each match played that day
    for match in results[date]:
            #Replace the name in the schedule with the equivalent name in the fulltime data
            match[0] = name_fix(match[0], match_id_list_home)
            match[1] = name_fix(match[1], match_id_list_away)

#Save our fixed data
with open('fixed_match_dates_and_scores.pkl', 'wb') as fid:
    pickle.dump((results, match_dates), fid)    

#Fixed names to verify visually
print(results[5331])

[['SCBastia', 'OlympiqueMarseille', (1, 2), (3, 3)], ['ÉvianThononGaillard', 'SMCaen', (0, 3), (0, 3)], ['EAGuingamp', 'ASSaint-Étienne', (0, 1), (0, 2)], ['LilleOSC', 'FCMetz', (0, 0), (0, 0)], ['MontpellierHSC', 'GirondinsBordeaux', (0, 1), (0, 1)], ['FCNantes', 'RCLens', (0, 0), (1, 0)], ['OGCNice', 'ToulouseFC', (1, 2), (3, 2)]]
[['Bastia', 'Marseille', (1, 2), (3, 3)], ['EvianThononGaillard', 'Caen', (0, 3), (0, 3)], ['Guingamp', 'SaintEtienne', (0, 1), (0, 2)], ['Lille', 'Metz', (0, 0), (0, 0)], ['Montpellier', 'Bordeaux', (0, 1), (0, 1)], ['Nantes', 'Lens', (0, 0), (1, 0)], ['Nice', 'Toulouse', (1, 2), (3, 2)]]


### Reading in from the Dataframes, fill X_train, X_test, y_train, y_test with the appropriate data

In [5]:
#For each date where a game was played
for date in match_dates:
    #For each match played that day
    for match in results[date]:
        
        #Initialize for each match
        temp_vec = 0
        flag = [False, False]
        home_name = match[0]
        away_name = match[1]

        #Only look for data from prior to the match date for the Home team
        #This way we do not use data from the match in our prediction of the match
        temp_date = date - 1

        #Iterate through the previous match days until you find a prior set of fulltime data
        #In this dataframe the fulltime data is a rolling average of the previous fulltime data in the same season
        while(temp_date > match_dates[0]):
            #if there is data for the home team on temp_date day then load it
            try:
                temp_vec = [[float(elem) for elem in (home_df.loc[home_name+'_'+str(temp_date), :]).as_matrix()]]
                
                #When we find the data, indicate that we have found it and exit this loop
                flag[0] = True
                break
            #if there is no data, check the day before
            except:
                temp_date -=1
                

        #Reset and Reuse
        temp_date = date -1

        #Identical to above but for the Away Team
        while(temp_date > match_dates[0] and flag[0] == True): 
            try:    
                temp_vec.append([float(elem) for elem in away_df.loc[away_name+'_'+str(temp_date), :].as_matrix()])
                flag[1] = True
                break
            except:
                temp_date -=1
                
        #If we have found prior data for both teams (we won't for the first 2 weeks of our data)
        #In addition, 3 teams in each league each year are newly promoted from the 2nd division
        #they will have no prior data
        if(flag[0] and flag[1]):
            #Since match data is chronologically ordered, the earlier matches will be placed in 
            #the train category and the later matches in the test category
            if(date < test_train_limit):
                X_train.append(temp_vec)
                y_train.append(list(match[3]))
            else:
                X_test.append(temp_vec)
                y_test.append(list(match[3]))
                
                #Beginning creation of the Benchmark
                half_time.append(list(match[2]))
                
                
print(len(X_train), len(X_test), len(y_train), len(y_test))

3554 1482 3554 1482


### Create and Save the SGD Classifier

In [6]:
#Convert the data into numpy arrays
X_train, X_test = np.array(X_train), np.array(X_test) 
y_train, y_test = np.array(y_train), np.array(y_test)
half_time = np.array(half_time)

#Classifier Replace all NaN Values with 0
#This makes sense because some values (eg: red cards or yellow cards) when scraped were represented with 
#empty space rather than a 0
print("Pre-Shape:", X_train[0])
X_train = np.nan_to_num(X_train.reshape(X_train.shape[0], X_train.shape[1]*X_train.shape[2]))
X_test = np.nan_to_num(X_test.reshape(X_test.shape[0], X_test.shape[1]*X_test.shape[2]))
print("Re-Shape:", X_train[0])

#MultilabelBinarizer to convert scores into a vector
print("Pre-Binarizing:", y_train[0])
mlb = MultiLabelBinarizer(classes = range(0,15))
y_train = mlb.fit_transform(y_train)
y_test = mlb.fit_transform(y_test)
print("Post-Binarizing:", y_train[0])

#Create the benchmark predictions by multiplying the half-time scores by 2
#The benchmark prediction is that the same result of the first half will occur in the second half
benchmark = [[2*i, 2*ii] for i, ii in half_time]
benchmark = mlb.fit_transform(benchmark)
print(len(benchmark))

#Create OneVsRest SGDClassifier
clf = OneVsRestClassifier(SGDClassifier(loss='modified_huber', penalty='elasticnet',
                                          alpha=1e-4, n_iter=5, random_state=42,
                                          shuffle=True, n_jobs=-1) )

#Fit to our training data
clf.fit(X_train, y_train)

#Saved with '2' after the name because my report and data uses the original and I do not want to overwrite it
#as it makes understanding the report + results much easier
with open('saved_SGD_&_train_test_data2.pkl', 'wb') as fid:
    pickle.dump((clf, X_train, X_test, y_train, y_test, benchmark), fid,)

Pre-Shape: [[  32.    7.   14.    3.   10.    2.   nan   20.    9.    6.    1.   19.
     7.   21.  343.   78.]
 [  60.   14.   21.   11.    7.    7.    0.   33.   22.    3.    3.   21.
    13.   25.  504.   85.]]
Re-Shape: [  32.    7.   14.    3.   10.    2.    0.   20.    9.    6.    1.   19.
    7.   21.  343.   78.   60.   14.   21.   11.    7.    7.    0.   33.
   22.    3.    3.   21.   13.   25.  504.   85.]
Pre-Binarizing: [1 2]
Post-Binarizing: [0 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
1482


### Test SGDClassifier and compare to Benchmark

In [7]:
#Calculates the total goals in the match described by the outputted vector
def total_goals(goals):

    total_goals = sum([elem*i for i, elem in enumerate(goals)])
    return total_goals

#Calculates the score described by the outputted vector
def score_func(pred, actual):
    score = 0

    #10 points for exact score
    if(np.array_equal(pred, actual)):
        #print("10 Points:", pred, actual)
        return 10

    #2 points for 1 correct score
    #Subtract 1 array from the other, as they are not identical
    #the result is either 2, 3 or 4 nonzero elements, 2 or 3 indicates a matching index
    if(np.count_nonzero(pred-actual) <= 3):
        return 2

    #5 points for correct total goals
    if(total_goals(pred) == total_goals(actual)):
        return 5

    #print(score, " Points:", pred, actual)
    return score

In [8]:
#Load data from the original saved_SGD_&_train_test_data.pkl
with open('saved_SGD_&_train_test_data2.pkl', 'rb') as fid:
    clf, X_train, X_test, y_train, y_test, benchmark = pickle.load(fid)

#Make Predictions on X_test
pred = clf.predict(X_test)
#Since the output is a binary vector if there is only one 1 in it that means both teams scored that amount
#For example: 0100 means the game was 1-1, in this case we need to change the 1 to a 2 for our scoring method to be accurate
pred = [2*elem if np.count_nonzero(elem) == 1 else elem for elem in pred]
benchmark = [2*elem if np.count_nonzero(elem) == 1 else elem for elem in benchmark]
y_test = [2*elem if np.count_nonzero(elem) == 1 else elem for elem in y_test]

print(len(pred), len(benchmark), len(y_test), len(X_test), len(X_train))

#Initialize Scoring variables
benchmark_score = 0
score= 0
score_dict = dict({0: 0, 2: 0, 5: 0, 10: 0})
benchmark_dict = dict({0: 0, 2: 0, 5: 0, 10: 0})

for i in range(len(y_test)): 

    #One flaw in this model is that in a very small group of scenarios it indicates
    #three goal scoring values as 1. This is a way to fix it. 
    #In the future I would make improvement here as indicated in my report  
    if(np.count_nonzero(pred[i]) not in [1,2]):
        pred[i][np.random.choice(np.nonzero(pred[i])[0])] = 0

    if(np.count_nonzero(pred[i]) not in [1,2]):
        print("Pred:", i, pred[i])

    #Perform the Scoring
    temp = score_func(pred[i], y_test[i])
    score_dict[temp] +=1
    score += temp

    temp = score_func(benchmark[i], y_test[i])
    benchmark_dict[temp] +=1
    benchmark_score += temp


print("----------------")
print("Total Score")
print("----------------")
print("SGD:  ", score)
print("Benchmark: ", benchmark_score)
print("----------------")
print("Stats: ")
for k in score_dict.keys():
    print("SGD:  Matches where", k, "points awarded: ", score_dict[k])
    print("Benchmark: Matches where", k, "points awarded: ", benchmark_dict[k], "\n")

1482 1482 1482 1482 3554
----------------
Total Score
----------------
SGD:   3044
Benchmark:  4559
----------------
Stats: 
SGD:  Matches where 0 points awarded:  319
Benchmark: Matches where 0 points awarded:  146 

SGD:  Matches where 2 points awarded:  1027
Benchmark: Matches where 2 points awarded:  1082 

SGD:  Matches where 5 points awarded:  74
Benchmark: Matches where 5 points awarded:  29 

SGD:  Matches where 10 points awarded:  62
Benchmark: Matches where 10 points awarded:  225 



### Conclusion

The SGD was created and saved successfully. It seems like it really pales in comparison to the benchmark, as it was outperformed in all categories except the correct number of total goals metric (matches where 5 points were awarded). Overall it is 1515 points behind the benchmark or 66.76% of the benchmark score. Ideally the NN will assist in rectifying this.