In [3]:
import numpy as np
import sklearn
import time
import random

# random forest
from sklearn.ensemble import RandomForestClassifier

#Calculate accuracy
from sklearn import metrics

#KNN
from sklearn.neighbors import KNeighborsClassifier

#AdaBoost
from sklearn.ensemble import AdaBoostClassifier

#SVM
from sklearn import svm

#Neural Nets
from sklearn.neural_network import MLPClassifier

#Naive Bayes
from sklearn.naive_bayes import GaussianNB

import pandas as pd

In [4]:
###############################################################################
# We have to use and MODIFY a lot the pre-processing method borrowed from another library to maximum our use of the dataset
# because it is a huge raw dataset and it is not in the form we need for our project. 
# https://github.com/llSourcell/Predicting_Winning_Teams
# Except for loading the data, everything else is orginal
# There will be a clear mark where our pure original code starts

In [5]:
# Read data and drop redundant column.
data = pd.read_csv('Egfinal_dataset.csv')
data = data.filter(['FTR','HTP','ATP','HM1','HM2','HM3','AM1','AM2','AM3','HTGD','ATGD','DiffFormPts','DiffLP'], axis=1)

# Preview data.
display(data.head())


#Full Time Result (H=Home Win, D=Draw, A=Away Win)
#HTGD - Home team goal difference
#ATGD - away team goal difference
#HTP - Home team points
#ATP - Away team points
#DiffFormPts Diff in points
#DiffLP - Differnece in last years prediction

#Input - 12 other features (fouls, shots, goals, misses,corners, red card, yellow cards)
#Output - Full Time Result (H=Home Win, NH = Away win) 

Unnamed: 0,FTR,HTP,ATP,HM1,HM2,HM3,AM1,AM2,AM3,HTGD,ATGD,DiffFormPts,DiffLP
0,H,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,0.0
1,H,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,-4.0
2,NH,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,2.0
3,NH,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,1.0
4,H,0.0,0.0,M,M,M,M,M,M,0.0,0.0,0.0,-10.0


In [6]:
#what is the win rate for the home team?

# Total number of matches.
n_matches = data.shape[0]

# Calculate number of features. -1 because we are saving one as the target variable (win/lose/draw)
n_features = data.shape[1] - 1

# Calculate matches won by home team.
n_homewins = len(data[data.FTR == 'H'])

# Calculate win rate for home team.
win_rate = (float(n_homewins) / (n_matches)) * 100

# Print the results
print("Total number of matches: {}".format(n_matches))
print("Number of features: {}".format(n_features))
print("Number of matches won by home team: {}".format(n_homewins))
print("Win rate of home team: {:.2f}%".format(win_rate))

Total number of matches: 6080
Number of features: 12
Number of matches won by home team: 2816
Win rate of home team: 46.32%


In [7]:
# Visualising distribution of data
from pandas.tools.plotting import scatter_matrix

#the scatter matrix is plotting each of the columns specified against each other column.
#You would have observed that the diagonal graph is defined as a histogram, which means that in the 
#section of the plot matrix where the variable is against itself, a histogram is plotted.

#Scatter plots show how much one variable is affected by another. 
#The relationship between two variables is called their correlation
#negative vs positive correlation

#HTGD - Home team goal difference
#ATGD - away team goal difference
#HTP - Home team points
#ATP - Away team points
#DiffFormPts Diff in points
#DiffLP - Differnece in last years prediction

scatter_matrix(data[['HTGD','ATGD','HTP','ATP','DiffFormPts','DiffLP']], figsize=(10,10))



array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000244C0024438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C0348710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C0374BA8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C03A6278>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C03CE908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C03CE940>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000244C0424668>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C044ECF8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C047F3C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C04A3A58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C04D7128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000244C04FF7B8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000

In [8]:
# Separate into feature set and target variable
#FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)
X_all = data.drop(['FTR'],1)
y_all = data['FTR']

# Standardising the data.
from sklearn.preprocessing import scale

#Center to the mean and component wise scale to unit variance.
cols = [['HTGD','ATGD','HTP','ATP','DiffLP']]
for col in cols:
    X_all[col] = scale(X_all[col])

In [9]:
#last 3 wins for both sides
X_all.HM1 = X_all.HM1.astype('str')
X_all.HM2 = X_all.HM2.astype('str')
X_all.HM3 = X_all.HM3.astype('str')
X_all.AM1 = X_all.AM1.astype('str')
X_all.AM2 = X_all.AM2.astype('str')
X_all.AM3 = X_all.AM3.astype('str')

#we want continous vars that are integers for our input data, so lets remove any categorical vars
def preprocess_features(X):
    ''' Preprocesses the football data and converts catagorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix = col)
                    
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

Processed feature columns (30 total features):
['HTP', 'ATP', 'HM1_D', 'HM1_L', 'HM1_M', 'HM1_W', 'HM2_D', 'HM2_L', 'HM2_M', 'HM2_W', 'HM3_D', 'HM3_L', 'HM3_M', 'HM3_W', 'AM1_D', 'AM1_L', 'AM1_M', 'AM1_W', 'AM2_D', 'AM2_L', 'AM2_M', 'AM2_W', 'AM3_D', 'AM3_L', 'AM3_M', 'AM3_W', 'HTGD', 'ATGD', 'DiffFormPts', 'DiffLP']


In [10]:
############################################################################################################
#Here below is all original code

In [21]:
def AdaBoost_classifier(train_X,train_Y,test_X,test_Y):
    abc = AdaBoostClassifier(n_estimators=50)
    abc.fit(train_X,train_Y)
    trainAccuracy = abc.score(train_X,train_Y)
    pred = abc.predict(test_X)
    accuracy = metrics.accuracy_score(test_Y,pred)
    return accuracy,trainAccuracy
    
def RandomForest_classifier(train_X,train_Y,test_X,test_Y):
    
    clf=RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
    clf.fit(train_X,train_Y)
    trainAccuracy = clf.score(train_X,train_Y)
    y_pred = clf.predict(test_X)
    accuracy = metrics.accuracy_score(test_Y,y_pred)
    return accuracy, trainAccuracy

def SVM_classifier(train_X,train_Y,test_X,test_Y):
    clf = svm.SVC()
    clf.fit(train_X, train_Y)
    trainAccuracy = clf.score(train_X,train_Y)
    y_pred = clf.predict(test_X)
    accuracy = metrics.accuracy_score(test_Y,y_pred)
    return accuracy,trainAccuracy
    
def KNN_classifier(train_X,train_Y,test_X,test_Y):
    
    neigh = KNeighborsClassifier(n_neighbors=2)
    neigh.fit(train_X, train_Y)
    trainAccuracy = neigh.score(train_X,train_Y)
    y_pred = neigh.predict(test_X)
    accuracy = metrics.accuracy_score(test_Y,y_pred)
    return accuracy,trainAccuracy
    
def NeuralNets_classifier(train_X,train_Y,test_X,test_Y):
    
    clf = MLPClassifier(hidden_layer_sizes=(10, 2), learning_rate_init=0.0001,max_iter=200)
    clf.fit(train_X, train_Y)
    trainAccuracy = clf.score(train_X,train_Y)
    y_pred = clf.predict(test_X)
    accuracy = metrics.accuracy_score(test_Y,y_pred)
    return accuracy,trainAccuracy

def NaiveBayes_classifier(train_X,train_Y,test_X,test_Y):
    
    gnb = GaussianNB()
    gnb.fit(train_X, train_Y)
    trainAccuracy = gnb.score(train_X,train_Y)
    y_pred = gnb.predict(test_X)
    accuracy = metrics.accuracy_score(test_Y,y_pred)
    return accuracy,trainAccuracy
    

In [22]:
# Cross Validation # 80% 20%
from sklearn.model_selection import train_test_split

train_X, test_X,train_Y, test_Y = train_test_split(X_all,y_all,test_size=0.2)

accuracy_SVM = []
accuracy_AdaBoost = []
accuracy_RandomForest = []
accuracy_KNN = []
accuracy_NN = []
accuracy_NB = []

train_SVM = []
train_AdaBoost = []
train_RandomForest = []
train_KNN = []
train_NN = []
train_NB = []

for i in range(3):
        
        start_time = time.time()
        
        # AdaBoost Classifier
        testAccuracy1, trainAccuracy1 = AdaBoost_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_AdaBoost.append(testAccuracy1)
        train_AdaBoost.append(trainAccuracy1)

        time1 = time.time()
        if i == 1: 
            print("--- AdaBoost takes %s seconds ---" % (time1 - start_time))
            
            
        # RandomForest Classifier
        testAccuracy2,trainAccuracy2 = RandomForest_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_RandomForest.append(testAccuracy2)
        train_RandomForest.append(trainAccuracy2)
        
        time2 = time.time()
        if i== 1:
            print("--- RandomForest takes %s seconds ---" % (time2 - time1))
        
        # K-Nearest-Neighbour Classifier
        testAccuracy3,trainAccuracy3 = KNN_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_KNN.append(testAccuracy3)
        train_KNN.append(trainAccuracy3)
        
        time3 = time.time()
        if i== 1:
            print("--- KNN takes %s seconds ---" % (time3 - time2))
        
        #SVM classifier
        testAccuracy4, trainAccuracy4 = SVM_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_SVM.append(testAccuracy4)
        train_SVM.append(trainAccuracy4)
        
        time4 = time.time()
        if i==1:
            print("--- SVM takes %s seconds ---" % (time4 - time3))
        
        #Neural Nets classifier
        testAccuracy5,trainAccuracy5 = NeuralNets_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_NN.append(testAccuracy5)
        train_NN.append(trainAccuracy5)
        
        
        time5 = time.time()
        if i==1:
            print("--- Neural Nets takes %s seconds ---" % (time5 - time4))
            
        testAccuracy6,trainAccuracy6 = NaiveBayes_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_NB.append(testAccuracy6)
        train_NB.append(trainAccuracy6)
        
        time6 = time.time()
        if i==1:
            print("--- Naive Bayes takes %s seconds ---" % (time6 - time5))
        
print('TRAINING')        
print("Accuracy for AdaBoost:",np.mean(train_AdaBoost))
print("Accuracy for RandomForest:",np.mean(train_RandomForest))
print("Accuracy for KNN:",np.mean(train_KNN))
print("Accuracy for svm:",np.mean(train_SVM))
print("Accuracy for Neural Nets:",np.mean(train_NN))
print("Accuracy for Naive Bayes:",np.mean(train_NB))


print('TESTING')
print("Accuracy for AdaBoost:",np.mean(accuracy_AdaBoost))
print("Accuracy for RandomForest:",np.mean(accuracy_RandomForest))
print("Accuracy for KNN:",np.mean(accuracy_KNN))
print("Accuracy for svm:",np.mean(accuracy_SVM))
print("Accuracy for Neural Nets:",np.mean(accuracy_NN))
print("Accuracy for Naive Bayes:",np.mean(accuracy_NB))


--- AdaBoost takes 0.8012905120849609 seconds ---
--- RandomForest takes 0.139418363571167 seconds ---
--- KNN takes 1.2809033393859863 seconds ---
--- SVM takes 3.678386688232422 seconds ---
--- Neural Nets takes 2.4955196380615234 seconds ---
--- Naive Bayes takes 0.04683566093444824 seconds ---
TRAINING
Accuracy for AdaBoost: 0.6749588815789473
Accuracy for RandomForest: 0.6504934210526315
Accuracy for KNN: 0.7808388157894738
Accuracy for svm: 0.6823601973684209
Accuracy for Neural Nets: 0.6659128289473685
Accuracy for Naive Bayes: 0.6517269736842105
TESTING
Accuracy for AdaBoost: 0.625
Accuracy for RandomForest: 0.6488486842105263
Accuracy for KNN: 0.5699013157894737
Accuracy for svm: 0.6447368421052632
Accuracy for Neural Nets: 0.6337719298245613
Accuracy for Naive Bayes: 0.6414473684210527


In [13]:
# Cross Validation # 50% 50%
from sklearn.model_selection import train_test_split

train_X, test_X,train_Y, test_Y = train_test_split(X_all,y_all,test_size=0.5)

accuracy_SVM = []
accuracy_AdaBoost = []
accuracy_RandomForest = []
accuracy_KNN = []
accuracy_NN = []
accuracy_NB = []

train_SVM = []
train_AdaBoost = []
train_RandomForest = []
train_KNN = []
train_NN = []
train_NB = []

for i in range(3):
        
        start_time = time.time()
        
        # AdaBoost Classifier
        testAccuracy1, trainAccuracy1 = AdaBoost_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_AdaBoost.append(testAccuracy1)
        train_AdaBoost.append(trainAccuracy1)

        time1 = time.time()
        if i == 1: 
            print("--- AdaBoost takes %s seconds ---" % (time1 - start_time))
            
            
        # RandomForest Classifier
        testAccuracy2,trainAccuracy2 = RandomForest_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_RandomForest.append(testAccuracy2)
        train_RandomForest.append(trainAccuracy2)
        
        time2 = time.time()
        if i== 1:
            print("--- RandomForest takes %s seconds ---" % (time2 - time1))
        
        # K-Nearest-Neighbour Classifier
        testAccuracy3,trainAccuracy3 = KNN_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_KNN.append(testAccuracy3)
        train_KNN.append(trainAccuracy3)
        
        time3 = time.time()
        if i== 1:
            print("--- KNN takes %s seconds ---" % (time3 - time2))
        
        #SVM classifier
        testAccuracy4, trainAccuracy4 = SVM_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_SVM.append(testAccuracy4)
        train_SVM.append(trainAccuracy4)
        
        time4 = time.time()
        if i==1:
            print("--- SVM takes %s seconds ---" % (time4 - time3))
        
        #Neural Nets classifier
        testAccuracy5,trainAccuracy5 = NeuralNets_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_NN.append(testAccuracy5)
        train_NN.append(trainAccuracy5)
        
        
        time5 = time.time()
        if i==1:
            print("--- Neural Nets takes %s seconds ---" % (time5 - time4))
          
        # Naive Bayes
        testAccuracy6,trainAccuracy6 = NaiveBayes_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_NB.append(testAccuracy6)
        train_NB.append(trainAccuracy6)
        
        time6 = time.time()
        if i==1:
            print("--- Naive Bayes takes %s seconds ---" % (time6 - time5))
        
print('TRAINING')        
print("Accuracy for AdaBoost:",np.mean(train_AdaBoost))
print("Accuracy for RandomForest:",np.mean(train_RandomForest))
print("Accuracy for KNN:",np.mean(train_KNN))
print("Accuracy for svm:",np.mean(train_SVM))
print("Accuracy for Neural Nets:",np.mean(train_NN))
print("Accuracy for Naive Bayes:",np.mean(train_NB))


print('TESTING')
print("Accuracy for AdaBoost:",np.mean(accuracy_AdaBoost))
print("Accuracy for RandomForest:",np.mean(accuracy_RandomForest))
print("Accuracy for KNN:",np.mean(accuracy_KNN))
print("Accuracy for svm:",np.mean(accuracy_SVM))
print("Accuracy for Neural Nets:",np.mean(accuracy_NN))
print("Accuracy for Naive Bayes:",np.mean(accuracy_NB))




--- AdaBoost takes 0.7290513515472412 seconds ---
--- RandomForest takes 0.5465385913848877 seconds ---
--- KNN takes 1.3314409255981445 seconds ---
--- SVM takes 2.4534385204315186 seconds ---




--- Neural Nets takes 3.1824893951416016 seconds ---
--- Naive Bayes takes 0.03790092468261719 seconds ---
TRAINING
Accuracy for AdaBoost: 0.6723684210526316
Accuracy for RandomForest: 0.6513157894736842
Accuracy for KNN: 0.78125
Accuracy for svm: 0.6750000000000002
Accuracy for Neural Nets: 0.5508771929824562
Accuracy for Naive Bayes: 0.6371710526315789
TESTING
Accuracy for AdaBoost: 0.6526315789473685
Accuracy for RandomForest: 0.6434210526315789
Accuracy for KNN: 0.5555921052631579
Accuracy for svm: 0.6694078947368421
Accuracy for Neural Nets: 0.550548245614035
Accuracy for Naive Bayes: 0.6378289473684211




In [14]:
# Cross Validation # 20% 80%
from sklearn.model_selection import train_test_split

train_X, test_X,train_Y, test_Y = train_test_split(X_all,y_all,test_size=0.8)

accuracy_SVM = []
accuracy_AdaBoost = []
accuracy_RandomForest = []
accuracy_KNN = []
accuracy_NN = []
accuracy_NB = []

train_SVM = []
train_AdaBoost = []
train_RandomForest = []
train_KNN = []
train_NN = []
train_NB = []

for i in range(3):
        
        start_time = time.time()
        
        # AdaBoost Classifier
        testAccuracy1, trainAccuracy1 = AdaBoost_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_AdaBoost.append(testAccuracy1)
        train_AdaBoost.append(trainAccuracy1)

        time1 = time.time()
        if i == 1: 
            print("--- AdaBoost takes %s seconds ---" % (time1 - start_time))
            
            
        # RandomForest Classifier
        testAccuracy2,trainAccuracy2 = RandomForest_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_RandomForest.append(testAccuracy2)
        train_RandomForest.append(trainAccuracy2)
        
        time2 = time.time()
        if i== 1:
            print("--- RandomForest takes %s seconds ---" % (time2 - time1))
        
        # K-Nearest-Neighbour Classifier
        testAccuracy3,trainAccuracy3 = KNN_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_KNN.append(testAccuracy3)
        train_KNN.append(trainAccuracy3)
        
        time3 = time.time()
        if i== 1:
            print("--- KNN takes %s seconds ---" % (time3 - time2))
        
        #SVM classifier
        testAccuracy4, trainAccuracy4 = SVM_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_SVM.append(testAccuracy4)
        train_SVM.append(trainAccuracy4)
        
        time4 = time.time()
        if i==1:
            print("--- SVM takes %s seconds ---" % (time4 - time3))
        
        #Neural Nets classifier
        testAccuracy5,trainAccuracy5 = NeuralNets_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_NN.append(testAccuracy5)
        train_NN.append(trainAccuracy5)
        
        
        time5 = time.time()
        if i==1:
            print("--- Neural Nets takes %s seconds ---" % (time5 - time4))
        
        # Naive Bayes    
        testAccuracy6,trainAccuracy6 = NaiveBayes_classifier(train_X,train_Y,test_X,test_Y)
        accuracy_NB.append(testAccuracy6)
        train_NB.append(trainAccuracy6)
        
        time6 = time.time()
        if i==1:
            print("--- Naive Bayes takes %s seconds ---" % (time6 - time5))
        
print('TRAINING')        
print("Accuracy for AdaBoost:",np.mean(train_AdaBoost))
print("Accuracy for RandomForest:",np.mean(train_RandomForest))
print("Accuracy for KNN:",np.mean(train_KNN))
print("Accuracy for svm:",np.mean(train_SVM))
print("Accuracy for Neural Nets:",np.mean(train_NN))
print("Accuracy for Naive Bayes:",np.mean(train_NB))


print('TESTING')
print("Accuracy for AdaBoost:",np.mean(accuracy_AdaBoost))
print("Accuracy for RandomForest:",np.mean(accuracy_RandomForest))
print("Accuracy for KNN:",np.mean(accuracy_KNN))
print("Accuracy for svm:",np.mean(accuracy_SVM))
print("Accuracy for Neural Nets:",np.mean(accuracy_NN))
print("Accuracy for Naive Bayes:",np.mean(accuracy_NB))




--- AdaBoost takes 0.29324769973754883 seconds ---
--- RandomForest takes 0.28324198722839355 seconds ---
--- KNN takes 0.49564504623413086 seconds ---
--- SVM takes 0.5674805641174316 seconds ---




--- Neural Nets takes 1.0681450366973877 seconds ---
--- Naive Bayes takes 0.03291177749633789 seconds ---
TRAINING
Accuracy for AdaBoost: 0.6875
Accuracy for RandomForest: 0.6570723684210527
Accuracy for KNN: 0.7771381578947368
Accuracy for svm: 0.6899671052631579
Accuracy for Neural Nets: 0.5550986842105262
Accuracy for Naive Bayes: 0.6554276315789473
TESTING
Accuracy for AdaBoost: 0.6295230263157895
Accuracy for RandomForest: 0.6428865131578947
Accuracy for KNN: 0.5657894736842105
Accuracy for svm: 0.6543996710526315
Accuracy for Neural Nets: 0.551672149122807
Accuracy for Naive Bayes: 0.6435032894736842


