In [1]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [2]:
# Read the match dataset; the first CSV column becomes the row index.
data = pd.read_csv('plotting_data', index_col=[0])
# Preview the last five rows.
data.tail()

Unnamed: 0,year,round,division,local_goals,visitor_goals,points_local,wins_local,draws_local,losses_local,gf_local,...,pos_local,points_visitor,wins_visitor,draws_visitor,losses_visitor,gf_visitor,ga_visitor,avg_visitor,pos_visitor,match_winner
4783,2021,38,2,2,1,44,9.0,17.0,11.0,38,...,14,40,9.0,13.0,15.0,35,42,-7.0,19,0
4784,2021,38,2,1,0,39,9.0,12.0,16.0,36,...,20,41,11.0,8.0,18.0,35,43,-8.0,18,0
4785,2021,38,2,1,4,41,10.0,11.0,16.0,25,...,17,58,16.0,10.0,11.0,39,34,5.0,6,2
4786,2021,38,2,1,1,57,16.0,9.0,12.0,43,...,7,62,18.0,8.0,11.0,41,31,10.0,4,1
4787,2021,38,2,1,0,58,15.0,13.0,9.0,35,...,5,37,8.0,13.0,16.0,32,49,-17.0,21,0


In [3]:
# Remove the final-score columns before modelling. In the preview above,
# match_winner appears to follow directly from the two goal counts
# (e.g. 1-1 -> 1), so keeping them would leak the target into the features.
data = data.drop(columns=['local_goals', 'visitor_goals'])

## Modelling non-featured

Since the classifiers trained on the featurised dataset did not improve on the results obtained before feature engineering, we now evaluate, on the non-featurised dataframe, the classifiers that had not yet been tested on it.

### SVC

In [4]:
# Predictors are every column but the last; the target is the last column
# (match_winner).
features, target = data.values[:, :-1], data.values[:, -1]
X, y = features, target

# Hold out 20% of the matches; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Fit the min-max scaler on the training rows only and reuse it for the test
# rows, so no test-set statistics reach the model.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Support vector classifier with its default settings.
model = SVC().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Accuracy: share of test matches whose predicted outcome equals the real one.
(y_pred == y_test).mean()

0.4780278670953912

In [5]:
# Display names for the outcome codes 0, 1 and 2, in that order.
target_names = ['Local win', 'Draw', 'Visitor win']
print(
    classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names)
)

              precision    recall  f1-score   support

   Local win       0.48      0.92      0.63       419
        Draw       0.29      0.02      0.03       270
 Visitor win       0.52      0.22      0.31       244

    accuracy                           0.48       933
   macro avg       0.43      0.39      0.32       933
weighted avg       0.43      0.48      0.37       933



In [6]:
# Rows are the true outcomes, columns the predicted ones
# (order: local win, draw, visitor win).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[387,   6,  26],
       [241,   5,  24],
       [184,   6,  54]], dtype=int64)

In [7]:
# One row per test match: column 0 holds the prediction, column 1 the true outcome.
prediction_SVC = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_SVC

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,1.0,2.0


In [8]:
# Label the two columns: model output ('pred') and ground truth ('test').
prediction_SVC = prediction_SVC.rename({0: 'pred', 1: 'test'}, axis='columns')

In [9]:
def success(col):
    """Return 1 when the first two values of ``col`` are equal, otherwise 0.

    Parameters
    ----------
    col : pandas.Series or sequence
        One row of a prediction frame, as passed by
        ``DataFrame.apply(success, axis=1)``; position 0 is compared with
        position 1. Plain sequences (list, tuple) are accepted as well.

    Returns
    -------
    int
        1 if both values match, 0 if they differ.
    """
    # The rows handed over by ``apply`` are labelled 'pred'/'test', so ``col[0]``
    # would be a label lookup that only works through pandas' deprecated
    # positional fallback. ``.iloc`` makes the positional access explicit.
    values = col.iloc if hasattr(col, 'iloc') else col
    return int(values[0] == values[1])

In [10]:
# Flag each row with 1 when prediction and true outcome agree, else 0.
prediction_SVC['success'] = prediction_SVC.apply(success, axis=1)
prediction_SVC.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,0.0,0.0,1


### Gaussian Process Classifier

In [11]:
# Predictors are every column but the last; the target is the last column.
features, target = data.values[:, :-1], data.values[:, -1]
X, y = features, target

# Same 80/20 split and seed as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Scale with statistics learned from the training rows only.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gaussian process classifier with its default settings.
model = GaussianProcessClassifier().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Accuracy on the held-out matches.
(y_pred == y_test).mean()

0.48017148981779206

In [12]:
# Display names for the outcome codes 0, 1 and 2, in that order.
target_names = ['Local win', 'Draw', 'Visitor win']
print(
    classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names)
)

              precision    recall  f1-score   support

   Local win       0.49      0.87      0.62       419
        Draw       0.40      0.06      0.11       270
 Visitor win       0.46      0.28      0.35       244

    accuracy                           0.48       933
   macro avg       0.45      0.40      0.36       933
weighted avg       0.46      0.48      0.40       933



In [13]:
# Rows are the true outcomes, columns the predicted ones
# (order: local win, draw, visitor win).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[363,  15,  41],
       [214,  17,  39],
       [166,  10,  68]], dtype=int64)

In [14]:
# One row per test match: column 0 holds the prediction, column 1 the true outcome.
prediction_GPC = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_GPC

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [15]:
# Label the two columns: model output ('pred') and ground truth ('test').
prediction_GPC = prediction_GPC.rename({0: 'pred', 1: 'test'}, axis='columns')

In [16]:
# Flag each row with 1 when prediction and true outcome agree, else 0.
prediction_GPC['success'] = prediction_GPC.apply(success, axis=1)
prediction_GPC.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


### MLP Classifier

In [17]:
# Predictors are every column but the last; the target is the last column
# (match_winner).
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target

# Hold out 20% of the matches; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Fit the min-max scaler on the training rows only and reuse it for the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The MLP starts from random weights and shuffles its training batches, so
# without a seed every run yields a different model and a different accuracy.
# Reusing the split seed makes the cell reproducible under Restart & Run All.
model = MLPClassifier(max_iter=1000, random_state=30)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# Accuracy: share of test matches whose predicted outcome equals the real one.
np.mean(y_pred == y_test)

0.46730975348338694

In [18]:
# Display names for the outcome codes 0, 1 and 2, in that order.
target_names = ['Local win', 'Draw', 'Visitor win']
print(
    classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names)
)

              precision    recall  f1-score   support

   Local win       0.50      0.77      0.60       419
        Draw       0.30      0.11      0.17       270
 Visitor win       0.47      0.33      0.39       244

    accuracy                           0.47       933
   macro avg       0.42      0.41      0.39       933
weighted avg       0.43      0.47      0.42       933



In [19]:
# Rows are the true outcomes, columns the predicted ones
# (order: local win, draw, visitor win).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[324,  48,  47],
       [193,  31,  46],
       [137,  26,  81]], dtype=int64)

In [20]:
# One row per test match: column 0 holds the prediction, column 1 the true outcome.
prediction_MLP = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_MLP

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,1.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [21]:
# Label the two columns: model output ('pred') and ground truth ('test').
prediction_MLP = prediction_MLP.rename({0: 'pred', 1: 'test'}, axis='columns')

In [22]:
# Flag each row with 1 when prediction and true outcome agree, else 0.
prediction_MLP['success'] = prediction_MLP.apply(success, axis=1)
prediction_MLP.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


### AdaBoost Classifier

In [23]:
# Predictors are every column but the last; the target is the last column
# (match_winner).
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target

# Hold out 20% of the matches; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Fit the min-max scaler on the training rows only and reuse it for the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# AdaBoost forwards a seed to each of its base estimators; pinning it
# (same value as the split seed) keeps the fitted ensemble, and therefore the
# reported accuracy, identical across runs.
model = AdaBoostClassifier(random_state=30)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# Accuracy: share of test matches whose predicted outcome equals the real one.
np.mean(y_pred == y_test)

0.4780278670953912

In [24]:
# Display names for the outcome codes 0, 1 and 2, in that order.
target_names = ['Local win', 'Draw', 'Visitor win']
print(
    classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names)
)

              precision    recall  f1-score   support

   Local win       0.49      0.85      0.62       419
        Draw       0.28      0.06      0.09       270
 Visitor win       0.48      0.31      0.38       244

    accuracy                           0.48       933
   macro avg       0.42      0.40      0.36       933
weighted avg       0.43      0.48      0.41       933



In [25]:
# Rows are the true outcomes, columns the predicted ones
# (order: local win, draw, visitor win).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[355,  24,  40],
       [212,  15,  43],
       [153,  15,  76]], dtype=int64)

In [26]:
# One row per test match: column 0 holds the prediction, column 1 the true outcome.
prediction_ADA = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_ADA

Unnamed: 0,0,1
0,0.0,0.0
1,1.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [27]:
# Label the two columns: model output ('pred') and ground truth ('test').
prediction_ADA = prediction_ADA.rename({0: 'pred', 1: 'test'}, axis='columns')

In [28]:
# Flag each row with 1 when prediction and true outcome agree, else 0.
prediction_ADA['success'] = prediction_ADA.apply(success, axis=1)
prediction_ADA.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


###  GaussianNB 

In [29]:
# Predictors are every column but the last; the target is the last column.
features, target = data.values[:, :-1], data.values[:, -1]
X, y = features, target

# Same 80/20 split and seed as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Scale with statistics learned from the training rows only.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gaussian naive Bayes with its default settings.
model = GaussianNB().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Accuracy on the held-out matches.
(y_pred == y_test).mean()

0.4362272240085745

In [30]:
# Display names for the outcome codes 0, 1 and 2, in that order.
target_names = ['Local win', 'Draw', 'Visitor win']
print(
    classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names)
)

              precision    recall  f1-score   support

   Local win       0.55      0.49      0.52       419
        Draw       0.32      0.44      0.37       270
 Visitor win       0.44      0.34      0.38       244

    accuracy                           0.44       933
   macro avg       0.44      0.42      0.42       933
weighted avg       0.46      0.44      0.44       933



In [31]:
# Rows are the true outcomes, columns the predicted ones
# (order: local win, draw, visitor win).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[204, 159,  56],
       [100, 120,  50],
       [ 64,  97,  83]], dtype=int64)

In [32]:
# One row per test match: column 0 holds the prediction, column 1 the true outcome.
prediction_GNB = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_GNB

Unnamed: 0,0,1
0,2.0,0.0
1,1.0,1.0
2,1.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,1.0,0.0
931,2.0,2.0


In [33]:
# Label the two columns: model output ('pred') and ground truth ('test').
prediction_GNB = prediction_GNB.rename({0: 'pred', 1: 'test'}, axis='columns')

In [34]:
# Flag each row with 1 when prediction and true outcome agree, else 0.
prediction_GNB['success'] = prediction_GNB.apply(success, axis=1)
prediction_GNB.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,1.0,0.0,0
931,2.0,2.0,1
932,0.0,0.0,1


###  Quadratic Discriminant Analysis 

In [35]:
# Predictors are every column but the last; the target is the last column.
features, target = data.values[:, :-1], data.values[:, -1]
X, y = features, target

# Same 80/20 split and seed as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Scale with statistics learned from the training rows only.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Quadratic discriminant analysis with its default settings.
model = QuadraticDiscriminantAnalysis().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Accuracy on the held-out matches.
(y_pred == y_test).mean()



0.45766345123258306

In [36]:
# Display names for the outcome codes 0, 1 and 2, in that order.
target_names = ['Local win', 'Draw', 'Visitor win']
print(
    classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names)
)

              precision    recall  f1-score   support

   Local win       0.46      0.93      0.62       419
        Draw       0.20      0.01      0.03       270
 Visitor win       0.47      0.13      0.21       244

    accuracy                           0.46       933
   macro avg       0.38      0.36      0.28       933
weighted avg       0.39      0.46      0.34       933



In [37]:
# Rows are the true outcomes, columns the predicted ones
# (order: local win, draw, visitor win).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[391,   7,  21],
       [251,   4,  15],
       [203,   9,  32]], dtype=int64)

In [38]:
# One row per test match: column 0 holds the prediction, column 1 the true outcome.
prediction_QDA = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_QDA

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,0.0,2.0


In [39]:
# Label the two columns: model output ('pred') and ground truth ('test').
prediction_QDA = prediction_QDA.rename({0: 'pred', 1: 'test'}, axis='columns')

In [40]:
# Flag each row with 1 when prediction and true outcome agree, else 0.
prediction_QDA['success'] = prediction_QDA.apply(success, axis=1)
prediction_QDA.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,0.0,2.0,0
932,0.0,0.0,1


## Final Results 

As in the previous notebook, these models also give worse results with the featurised dataframe.

One possible explanation for this result is overfitting: the engineered features give the model a large amount of very specific information while the volume of data is not very large, so it may learn too much detail.
On the other hand, an error may have occurred when featurising, but this section has been reviewed and the results are logical and there is no evidence of any error having been made.
The conclusion is that the feature engineering has most likely introduced noise into the system, which is why the percentages are slightly lower.

Although this may seem strange, it is logical: before featurising, the model that gave the best result was the simplest one, logistic regression, so it is consistent that more sophisticated models fed with more information may give worse results.

The best results obtained will be those corresponding to Logistic Regression without featurising the data and therefore will be the final results of the model.

### Logistic Regression non-featured results

In [41]:
# Predictors are every column but the last; the target is the last column.
features, target = data.values[:, :-1], data.values[:, -1]
X, y = features, target

# NOTE(review): this cell holds out 10% of the rows, whereas the classifier
# cells above hold out 20%, so its accuracy is measured on a different
# (smaller) test set than theirs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=30
)

# Scale with statistics learned from the training rows only.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression, allowing up to 500 solver iterations.
model = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Accuracy on the held-out matches.
(y_pred == y_test).mean()

0.5074946466809421

In [42]:
# Rows are the true outcomes, columns the predicted ones
# (order: local win, draw, visitor win).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[191,   5,  22],
       [108,   9,  14],
       [ 79,   2,  37]], dtype=int64)

In [43]:
# Display names for the outcome codes 0, 1 and 2, in that order.
target_names = ['Local win', 'Draw', 'Visitor win']
print(
    classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names)
)

              precision    recall  f1-score   support

   Local win       0.51      0.88      0.64       218
        Draw       0.56      0.07      0.12       131
 Visitor win       0.51      0.31      0.39       118

    accuracy                           0.51       467
   macro avg       0.52      0.42      0.38       467
weighted avg       0.52      0.51      0.43       467



### Saving predictions for Tableau

In [44]:
# One row per test match: column 0 holds the prediction, column 1 the true outcome.
prediction_LR = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_LR

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
462,0.0,1.0
463,0.0,1.0
464,0.0,2.0
465,0.0,1.0


In [45]:
# Label the two columns: model output ('pred') and ground truth ('test').
prediction_LR = prediction_LR.rename({0: 'pred', 1: 'test'}, axis='columns')

In [46]:
# Flag each row with 1 when prediction and true outcome agree, else 0.
prediction_LR['success'] = prediction_LR.apply(success, axis=1)
prediction_LR.tail()

Unnamed: 0,pred,test,success
462,0.0,1.0,0
463,0.0,1.0,0
464,0.0,2.0,0
465,0.0,1.0,0
466,1.0,1.0,1


In [47]:
# Rebuild the held-out feature rows as a labelled frame (the column names of
# `data` minus the target) and place them next to the predictions. Both frames
# carry a default 0..n-1 index, so the rows line up by position.
recreated_X_test = pd.DataFrame(X_test, columns=data.columns[:-1])
prediction_LR = pd.concat((prediction_LR, recreated_X_test), axis='columns')

In [48]:
# Preview the first five rows of the combined predictions + features frame.
prediction_LR.head(5)

Unnamed: 0,pred,test,success,year,round,division,points_local,wins_local,draws_local,losses_local,...,avg_local,pos_local,points_visitor,wins_visitor,draws_visitor,losses_visitor,gf_visitor,ga_visitor,avg_visitor,pos_visitor
0,0.0,0.0,1,2021.0,37.0,1.0,34.0,8.0,10.0,18.0,...,-16.0,16.0,40.0,9.0,13.0,14.0,43.0,53.0,-10.0,13.0
1,0.0,1.0,0,2020.0,6.0,2.0,1.0,0.0,1.0,4.0,...,-5.0,22.0,2.0,0.0,2.0,3.0,3.0,7.0,-4.0,21.0
2,0.0,2.0,0,2018.0,10.0,2.0,9.0,3.0,0.0,6.0,...,-5.0,17.0,9.0,2.0,3.0,4.0,7.0,12.0,-5.0,19.0
3,0.0,0.0,1,2021.0,29.0,1.0,60.0,18.0,6.0,4.0,...,26.0,3.0,23.0,4.0,11.0,13.0,22.0,33.0,-11.0,18.0
4,0.0,0.0,1,2018.0,27.0,1.0,35.0,10.0,5.0,11.0,...,5.0,11.0,20.0,5.0,5.0,16.0,19.0,53.0,-34.0,18.0


In [49]:
# Export the predictions together with the test features for use in Tableau.
# NOTE(review): writing the legacy .xls format depends on the xlwt engine,
# which recent pandas releases no longer support; confirm the installed
# pandas version or consider exporting to .xlsx instead.
prediction_LR.to_excel('LogisticRegression_prediction.xls')

This model obtains very good results for home and away wins compared to other models, however it sacrifices the accuracy of the draw category, which has proven to be the most difficult to predict for most models due to the difficulty of finding patterns or situations that induce a draw.

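The overall accuracy is 51%, which is a significant improvement over the 33% chance of being correct when guessing uniformly among the 3 possible categories without applying any model. Note, however, that always predicting a local win would already be correct for about 47% of this test set (218 of 467 matches), so the gain over that majority-class baseline is smaller. Reliability, as already mentioned, improves considerably when the prediction of this model is not a draw.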

The results of this model will be used in Tableau, and are saved in an .xls file. The individual values of the classification report will be discussed in more detail in the project report (pdf file) attached to the repository.