In [81]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [82]:
df_modelling = pd.read_csv("featured data", index_col = [0])
df_modelling.tail(5)

Unnamed: 0,match_id,year,round,division,points_local,wins_local,draws_local,losses_local,gf_local,ga_local,...,Reus Deportiu_visitor,Sabadell_visitor,Sevilla_visitor,Sevilla At._visitor,Tenerife_visitor,UCAM Murcia_visitor,UD Logroñés_visitor,Valencia_visitor,Villarreal_visitor,match_winner
3714,104795,2019,26,2,27,7.0,6.0,12.0,22,32,...,0,0,0,0,0,0,0,0,0,2
3715,104794,2019,26,2,34,7.0,13.0,5.0,30,27,...,0,0,0,0,0,0,0,0,0,0
3716,104805,2019,27,2,47,13.0,8.0,5.0,32,18,...,1,0,0,0,0,0,0,0,0,0
3717,104814,2019,27,2,27,5.0,12.0,9.0,20,29,...,0,0,0,0,0,0,0,0,0,1
3718,104798,2019,27,2,49,13.0,10.0,3.0,36,21,...,0,0,0,0,0,0,0,0,0,1


## Modelling

Once the dataset has been featurised, we model with different classifiers and analyse the results obtained for each of them. 

### Logistic Regression

In [83]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression(max_iter = 500)
model.fit(X_train_scaled, y_train)


X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.489247311827957

In [84]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.54      0.77      0.63       346
        Empate       0.39      0.17      0.24       224
Gana Visitante       0.40      0.34      0.37       174

      accuracy                           0.49       744
     macro avg       0.44      0.43      0.41       744
  weighted avg       0.46      0.49      0.45       744



In [85]:
confusion_matrix(y_test, y_pred)

array([[266,  32,  48],
       [143,  38,  43],
       [ 87,  27,  60]], dtype=int64)

In [86]:
prediction_LR = pd.DataFrame([y_pred, y_test]).T
prediction_LR

Unnamed: 0,0,1
0,0.0,0.0
1,2.0,2.0
2,1.0,2.0
3,0.0,0.0
4,0.0,1.0
...,...,...
739,1.0,1.0
740,0.0,1.0
741,1.0,2.0
742,0.0,1.0


In [87]:
prediction_LR = prediction_LR.rename(columns={0:'pred', 1:'test'})

In [88]:
def success(col):
    if col[0] == col[1]:
        return 1
    else:
        return 0

In [89]:
prediction_LR['success'] = prediction_LR.apply(lambda col: success (col),axis=1)
prediction_LR.tail()

Unnamed: 0,pred,test,success
739,1.0,1.0,1
740,0.0,1.0,0
741,1.0,2.0,0
742,0.0,1.0,0
743,0.0,2.0,0


In [90]:
prediction_LR.to_excel('LogisticRegression prediction.xls', )

### Decision Tree

In [91]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.3803763440860215

In [92]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.50      0.47      0.48       346
        Empate       0.29      0.28      0.29       224
Gana Visitante       0.29      0.33      0.31       174

      accuracy                           0.38       744
     macro avg       0.36      0.36      0.36       744
  weighted avg       0.39      0.38      0.38       744



In [93]:
confusion_matrix(y_test, y_pred)

array([[162, 103,  81],
       [100,  63,  61],
       [ 64,  52,  58]], dtype=int64)

In [94]:
prediction_DT = pd.DataFrame([y_pred, y_test]).T
prediction_DT

Unnamed: 0,0,1
0,0.0,0.0
1,1.0,2.0
2,0.0,2.0
3,0.0,0.0
4,1.0,1.0
...,...,...
739,0.0,1.0
740,0.0,1.0
741,0.0,2.0
742,0.0,1.0


In [95]:
prediction_DT = prediction_LR.rename(columns={0:'pred', 1:'test'})

In [96]:
prediction_DT['success'] = prediction_DT.apply(lambda col: success (col),axis=1)
prediction_DT.tail()

Unnamed: 0,pred,test,success
739,1.0,1.0,1
740,0.0,1.0,0
741,1.0,2.0,0
742,0.0,1.0,0
743,0.0,2.0,0


In [97]:
prediction_DT.to_excel('DecissionTree prediction.xls')

### Random Forest

In [98]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4717741935483871

In [99]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.51      0.81      0.63       346
        Empate       0.33      0.13      0.19       224
Gana Visitante       0.39      0.24      0.30       174

      accuracy                           0.47       744
     macro avg       0.41      0.39      0.37       744
  weighted avg       0.43      0.47      0.42       744



In [100]:
confusion_matrix(y_test, y_pred)

array([[280,  38,  28],
       [157,  29,  38],
       [110,  22,  42]], dtype=int64)

In [101]:
prediction_RF = pd.DataFrame([y_pred, y_test]).T
prediction_RF

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,2.0
2,0.0,2.0
3,0.0,0.0
4,0.0,1.0
...,...,...
739,0.0,1.0
740,0.0,1.0
741,0.0,2.0
742,0.0,1.0


In [102]:
prediction_RF = prediction_RF.rename(columns={0:'pred', 1:'test'})

In [103]:
prediction_RF['success'] = prediction_RF.apply(lambda col: success (col),axis=1)
prediction_RF.tail()

Unnamed: 0,pred,test,success
739,0.0,1.0,0
740,0.0,1.0,0
741,0.0,2.0,0
742,0.0,1.0,0
743,0.0,2.0,0


In [104]:
prediction_RF.to_excel('RandomForest prediction.xls')

### KNeighbors

In [105]:
from sklearn.neighbors import KNeighborsClassifier
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model =KNeighborsClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.43010752688172044

In [106]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.49      0.66      0.56       346
        Empate       0.30      0.22      0.26       224
Gana Visitante       0.37      0.24      0.29       174

      accuracy                           0.43       744
     macro avg       0.39      0.37      0.37       744
  weighted avg       0.41      0.43      0.41       744



In [107]:
confusion_matrix(y_test, y_pred)

array([[228,  78,  40],
       [143,  50,  31],
       [ 95,  37,  42]], dtype=int64)

In [108]:
prediction_KN = pd.DataFrame([y_pred, y_test]).T
prediction_KN

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,2.0
2,0.0,2.0
3,1.0,0.0
4,1.0,1.0
...,...,...
739,1.0,1.0
740,0.0,1.0
741,0.0,2.0
742,0.0,1.0


In [109]:
prediction_KN = prediction_KN.rename(columns={0:'pred', 1:'test'})

In [110]:
prediction_KN['success'] = prediction_KN.apply(lambda col: success (col),axis=1)
prediction_KN.tail()

Unnamed: 0,pred,test,success
739,1.0,1.0,1
740,0.0,1.0,0
741,0.0,2.0,0
742,0.0,1.0,0
743,0.0,2.0,0


In [111]:
prediction_KN.to_excel('KNeighbors prediction.xls')

### SVC

In [112]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = SVC()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.489247311827957

In [113]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.50      0.92      0.65       346
        Empate       0.39      0.03      0.06       224
Gana Visitante       0.46      0.21      0.29       174

      accuracy                           0.49       744
     macro avg       0.45      0.39      0.33       744
  weighted avg       0.46      0.49      0.39       744



In [114]:
confusion_matrix(y_test, y_pred)

array([[320,   7,  19],
       [193,   7,  24],
       [133,   4,  37]], dtype=int64)

In [115]:
prediction_SVC = pd.DataFrame([y_pred, y_test]).T
prediction_SVC

Unnamed: 0,0,1
0,0.0,0.0
1,2.0,2.0
2,0.0,2.0
3,0.0,0.0
4,0.0,1.0
...,...,...
739,0.0,1.0
740,0.0,1.0
741,0.0,2.0
742,0.0,1.0


In [116]:
prediction_SVC = prediction_SVC.rename(columns={0:'pred', 1:'test'})

In [117]:
prediction_SVC['success'] = prediction_SVC.apply(lambda col: success (col),axis=1)
prediction_SVC.tail()

Unnamed: 0,pred,test,success
739,0.0,1.0,0
740,0.0,1.0,0
741,0.0,2.0,0
742,0.0,1.0,0
743,0.0,2.0,0


In [118]:
prediction_SVC.to_excel('SVC prediction.xls')

In [119]:
prediction_SVC['pred'].unique()

array([0., 2., 1.])

### Gaussian Proccess Classifier

In [120]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model =GaussianProcessClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4852150537634409

In [121]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.50      0.90      0.64       346
        Empate       0.44      0.05      0.09       224
Gana Visitante       0.42      0.22      0.29       174

      accuracy                           0.49       744
     macro avg       0.45      0.39      0.34       744
  weighted avg       0.46      0.49      0.39       744



In [122]:
confusion_matrix(y_test, y_pred)

array([[312,  11,  23],
       [184,  11,  29],
       [133,   3,  38]], dtype=int64)

In [123]:
prediction_GPC = pd.DataFrame([y_pred, y_test]).T
prediction_GPC

Unnamed: 0,0,1
0,0.0,0.0
1,2.0,2.0
2,0.0,2.0
3,0.0,0.0
4,0.0,1.0
...,...,...
739,0.0,1.0
740,0.0,1.0
741,0.0,2.0
742,0.0,1.0


In [124]:
prediction_GPC = prediction_GPC.rename(columns={0:'pred', 1:'test'})

In [125]:
prediction_GPC['success'] = prediction_GPC.apply(lambda col: success (col),axis=1)
prediction_GPC.tail()

Unnamed: 0,pred,test,success
739,0.0,1.0,0
740,0.0,1.0,0
741,0.0,2.0,0
742,0.0,1.0,0
743,0.0,2.0,0


In [126]:
prediction_GPC.to_excel('GPC prediction.xls')

In [127]:
prediction_GPC['pred'].unique()

array([0., 2., 1.])

### MLP Classifier

In [128]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = MLPClassifier(max_iter=1000)
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4260752688172043

In [129]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.54      0.55      0.54       346
        Empate       0.31      0.26      0.28       224
Gana Visitante       0.34      0.40      0.37       174

      accuracy                           0.43       744
     macro avg       0.40      0.40      0.40       744
  weighted avg       0.42      0.43      0.42       744



In [130]:
confusion_matrix(y_test, y_pred)

array([[190,  84,  72],
       [103,  58,  63],
       [ 62,  43,  69]], dtype=int64)

In [131]:
prediction_MLP = pd.DataFrame([y_pred, y_test]).T
prediction_MLP

Unnamed: 0,0,1
0,0.0,0.0
1,2.0,2.0
2,1.0,2.0
3,0.0,0.0
4,1.0,1.0
...,...,...
739,2.0,1.0
740,1.0,1.0
741,1.0,2.0
742,2.0,1.0


In [132]:
prediction_MLP = prediction_MLP.rename(columns={0:'pred', 1:'test'})

In [133]:
prediction_MLP['success'] = prediction_MLP.apply(lambda col: success (col),axis=1)
prediction_MLP.tail()

Unnamed: 0,pred,test,success
739,2.0,1.0,0
740,1.0,1.0,1
741,1.0,2.0,0
742,2.0,1.0,0
743,0.0,2.0,0


In [134]:
prediction_MLP.to_excel('MLP prediction.xls')

### AdaBoost Classifier

In [135]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = AdaBoostClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4731182795698925

In [136]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.51      0.78      0.62       346
        Empate       0.36      0.11      0.17       224
Gana Visitante       0.39      0.33      0.35       174

      accuracy                           0.47       744
     macro avg       0.42      0.41      0.38       744
  weighted avg       0.44      0.47      0.42       744



In [137]:
confusion_matrix(y_test, y_pred)

array([[270,  29,  47],
       [155,  25,  44],
       [101,  16,  57]], dtype=int64)

In [138]:
prediction_ADA = pd.DataFrame([y_pred, y_test]).T
prediction_ADA

Unnamed: 0,0,1
0,0.0,0.0
1,2.0,2.0
2,0.0,2.0
3,0.0,0.0
4,1.0,1.0
...,...,...
739,0.0,1.0
740,0.0,1.0
741,0.0,2.0
742,2.0,1.0


In [139]:
prediction_ADA = prediction_ADA.rename(columns={0:'pred', 1:'test'})

In [140]:
prediction_ADA['success'] = prediction_ADA.apply(lambda col: success (col),axis=1)
prediction_ADA.tail()

Unnamed: 0,pred,test,success
739,0.0,1.0,0
740,0.0,1.0,0
741,0.0,2.0,0
742,2.0,1.0,0
743,0.0,2.0,0


In [141]:
prediction_ADA.to_excel('ADA prediction.xls')

###  GaussianNB 

In [142]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = GaussianNB()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.34946236559139787

In [143]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.64      0.26      0.37       346
        Empate       0.34      0.14      0.20       224
Gana Visitante       0.27      0.79      0.40       174

      accuracy                           0.35       744
     macro avg       0.42      0.40      0.32       744
  weighted avg       0.46      0.35      0.33       744



In [144]:
confusion_matrix(y_test, y_pred)

array([[ 91,  41, 214],
       [ 34,  31, 159],
       [ 17,  19, 138]], dtype=int64)

In [145]:
prediction_GNB = pd.DataFrame([y_pred, y_test]).T
prediction_GNB

Unnamed: 0,0,1
0,0.0,0.0
1,2.0,2.0
2,2.0,2.0
3,2.0,0.0
4,2.0,1.0
...,...,...
739,1.0,1.0
740,2.0,1.0
741,2.0,2.0
742,2.0,1.0


In [146]:
prediction_GNB = prediction_GNB.rename(columns={0:'pred', 1:'test'})

In [147]:
prediction_GNB['success'] = prediction_GNB.apply(lambda col: success (col),axis=1)
prediction_GNB.tail()

Unnamed: 0,pred,test,success
739,1.0,1.0,1
740,2.0,1.0,0
741,2.0,2.0,1
742,2.0,1.0,0
743,0.0,2.0,0


In [148]:
prediction_GNB.to_excel('GNB prediction.xls')

###  Quadratic Discriminant Analysis 

In [149]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = QuadraticDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)



0.34543010752688175

In [150]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.56      0.22      0.31       346
        Empate       0.30      0.40      0.34       224
Gana Visitante       0.29      0.53      0.38       174

      accuracy                           0.35       744
     macro avg       0.39      0.38      0.35       744
  weighted avg       0.42      0.35      0.34       744



In [151]:
confusion_matrix(y_test, y_pred)

array([[ 75, 148, 123],
       [ 37,  90,  97],
       [ 22,  60,  92]], dtype=int64)

In [152]:
prediction_QDA = pd.DataFrame([y_pred, y_test]).T
prediction_QDA

Unnamed: 0,0,1
0,0.0,0.0
1,2.0,2.0
2,0.0,2.0
3,2.0,0.0
4,2.0,1.0
...,...,...
739,1.0,1.0
740,2.0,1.0
741,2.0,2.0
742,0.0,1.0


In [153]:
prediction_QDA = prediction_QDA.rename(columns={0:'pred', 1:'test'})

In [154]:
prediction_QDA['success'] = prediction_QDA.apply(lambda col: success (col),axis=1)
prediction_QDA.tail()

Unnamed: 0,pred,test,success
739,1.0,1.0,1
740,2.0,1.0,0
741,2.0,2.0,1
742,0.0,1.0,0
743,0.0,2.0,0


In [155]:
prediction_QDA.to_excel('QDA prediction.xls')