In [7]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [8]:
# Load the match table; the first CSV column is used as the row index.
data = pd.read_csv(filepath_or_buffer='plotting_data', index_col=[0])
# Preview the last five rows.
data.tail()

Unnamed: 0,year,round,division,local_goals,visitor_goals,points_local,wins_local,draws_local,losses_local,gf_local,...,pos_local,points_visitor,wins_visitor,draws_visitor,losses_visitor,gf_visitor,ga_visitor,avg_visitor,pos_visitor,match_winner
4657,2021,38,2,2,1,44,9.0,17.0,11.0,38,...,14,40,9.0,13.0,15.0,35,42,-7.0,19,0
4658,2021,38,2,1,0,39,9.0,12.0,16.0,36,...,20,41,11.0,8.0,18.0,35,43,-8.0,18,0
4659,2021,38,2,1,4,41,10.0,11.0,16.0,25,...,17,58,16.0,10.0,11.0,39,34,5.0,6,2
4660,2021,38,2,1,1,57,16.0,9.0,12.0,43,...,7,62,18.0,8.0,11.0,41,31,10.0,4,1
4661,2021,38,2,1,0,58,15.0,13.0,9.0,35,...,5,37,8.0,13.0,16.0,32,49,-17.0,21,0


In [9]:
# Remove the final score columns before modelling.
# NOTE(review): presumably these determine `match_winner` and would leak the label — confirm.
data = data.drop(columns=['local_goals', 'visitor_goals'])

## Modelling non-featured

Since the classifiers trained on the featurised dataset did not improve on the results obtained before feature engineering, we now evaluate, on the non-featured dataframe, the classifiers that had not yet been tested on it.

### SVC

In [10]:
# Predictors are every column except the last; the last column is the label.
features, target = data.values[:, :-1], data.values[:, -1]
X = features
y = target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Fit the scaler on the training rows only and reuse it for the held-out rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Support-vector classifier with default hyper-parameters.
model = SVC()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Share of held-out rows whose prediction equals the true label.
(y_pred == y_test).mean()

0.4780278670953912

In [11]:
# Display names for the encoded classes, listed in label order (0, 1, 2).
target_names = ['Local win', 'Draw', 'Visitor win']
# Per-class precision / recall / F1 of the SVC on the held-out split.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.48      0.92      0.63       419
        Draw       0.29      0.02      0.03       270
 Visitor win       0.52      0.22      0.31       244

    accuracy                           0.48       933
   macro avg       0.43      0.39      0.32       933
weighted avg       0.43      0.48      0.37       933



In [12]:
# Confusion matrix of the SVC: one row per true class, one column per predicted class.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[387,   6,  26],
       [241,   5,  24],
       [184,   6,  54]], dtype=int64)

In [13]:
# One row per held-out match: column 0 is the SVC prediction, column 1 the true label.
prediction_SVC = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_SVC

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,1.0,2.0


In [14]:
# Name the positional columns: 0 holds the predictions, 1 the true labels.
prediction_SVC = prediction_SVC.rename({0: 'pred', 1: 'test'}, axis='columns')

In [17]:
def success(col):
    """Return 1 when the first two entries of ``col`` are equal, otherwise 0.

    Parameters
    ----------
    col : pandas.Series or sequence
        One row of a predictions table, read by position: entry 0 is
        compared with entry 1. Any further entries are ignored.

    Returns
    -------
    int
        1 if the two entries match, 0 otherwise.
    """
    # pandas objects are read by position through ``.iloc``. Plain ``col[0]``
    # on a labelled Series is a label lookup, and its positional fallback for
    # non-integer labels is deprecated.
    entries = col.iloc if hasattr(col, "iloc") else col
    return 1 if entries[0] == entries[1] else 0

In [18]:
# Flag every row with 1 when the SVC prediction equals the true label, else 0.
# A vectorised comparison replaces the row-wise apply: it yields the same
# int64 0/1 column without positional lookups on a string-labelled row.
prediction_SVC['success'] = (prediction_SVC['pred'] == prediction_SVC['test']).astype('int64')
prediction_SVC.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,0.0,0.0,1


In [21]:
# Write the SVC prediction table to a spreadsheet.
# NOTE(review): the legacy .xls format needs the xlwt writer, which recent pandas
# releases no longer ship — confirm the target environment or move to .xlsx.
prediction_SVC.to_excel(excel_writer='SVC prediction.xls')

In [22]:
# Distinct classes the SVC actually predicted, in order of first appearance.
pd.unique(prediction_SVC['pred'])

array([0., 2., 1.])

### Gaussian Process Classifier

In [23]:
# Predictors are every column except the last; the last column is the label.
features, target = data.values[:, :-1], data.values[:, -1]
X = features
y = target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Fit the scaler on the training rows only and reuse it for the held-out rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gaussian process classifier with default hyper-parameters.
model = GaussianProcessClassifier()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Share of held-out rows whose prediction equals the true label.
(y_pred == y_test).mean()

0.48017148981779206

In [24]:
# Display names for the encoded classes, listed in label order (0, 1, 2).
target_names = ['Local win', 'Draw', 'Visitor win']
# Per-class precision / recall / F1 of the Gaussian process classifier.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.49      0.87      0.62       419
        Draw       0.40      0.06      0.11       270
 Visitor win       0.46      0.28      0.35       244

    accuracy                           0.48       933
   macro avg       0.45      0.40      0.36       933
weighted avg       0.46      0.48      0.40       933



In [25]:
# Confusion matrix of the GPC: one row per true class, one column per predicted class.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[363,  15,  41],
       [214,  17,  39],
       [166,  10,  68]], dtype=int64)

In [26]:
# One row per held-out match: column 0 is the GPC prediction, column 1 the true label.
prediction_GPC = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_GPC

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [27]:
# Name the positional columns: 0 holds the predictions, 1 the true labels.
prediction_GPC = prediction_GPC.rename({0: 'pred', 1: 'test'}, axis='columns')

In [28]:
# Flag every row with 1 when the GPC prediction equals the true label, else 0.
# A vectorised comparison replaces the row-wise apply: it yields the same
# int64 0/1 column without positional lookups on a string-labelled row.
prediction_GPC['success'] = (prediction_GPC['pred'] == prediction_GPC['test']).astype('int64')
prediction_GPC.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


In [29]:
# Write the GPC prediction table to a spreadsheet.
# NOTE(review): the legacy .xls format needs the xlwt writer, which recent pandas
# releases no longer ship — confirm the target environment or move to .xlsx.
prediction_GPC.to_excel(excel_writer='GPC prediction.xls')

In [30]:
# Distinct classes the GPC actually predicted, in order of first appearance.
pd.unique(prediction_GPC['pred'])

array([0., 2., 1.])

### MLP Classifier

In [31]:
# Predictors are every column except the last; the last column is the label.
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Fit the scaler on the training rows only and reuse it for the held-out rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The network's weight initialisation and batch shuffling are random, so the
# seed is pinned (same value as the split) to make the reported scores
# reproducible across kernel restarts.
model = MLPClassifier(max_iter=1000, random_state=30)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# Share of held-out rows whose prediction equals the true label.
np.mean(y_pred == y_test)

0.46730975348338694

In [32]:
# Display names for the encoded classes, listed in label order (0, 1, 2).
target_names = ['Local win', 'Draw', 'Visitor win']
# Per-class precision / recall / F1 of the MLP classifier.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.49      0.84      0.62       419
        Draw       0.26      0.05      0.08       270
 Visitor win       0.44      0.29      0.35       244

    accuracy                           0.47       933
   macro avg       0.40      0.39      0.35       933
weighted avg       0.41      0.47      0.39       933



In [33]:
# Confusion matrix of the MLP: one row per true class, one column per predicted class.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[352,  20,  47],
       [214,  13,  43],
       [156,  17,  71]], dtype=int64)

In [34]:
# One row per held-out match: column 0 is the MLP prediction, column 1 the true label.
prediction_MLP = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_MLP

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [35]:
# Name the positional columns: 0 holds the predictions, 1 the true labels.
prediction_MLP = prediction_MLP.rename({0: 'pred', 1: 'test'}, axis='columns')

In [36]:
# Flag every row with 1 when the MLP prediction equals the true label, else 0.
# A vectorised comparison replaces the row-wise apply: it yields the same
# int64 0/1 column without positional lookups on a string-labelled row.
prediction_MLP['success'] = (prediction_MLP['pred'] == prediction_MLP['test']).astype('int64')
prediction_MLP.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


In [37]:
# Write the MLP prediction table to a spreadsheet.
# NOTE(review): the legacy .xls format needs the xlwt writer, which recent pandas
# releases no longer ship — confirm the target environment or move to .xlsx.
prediction_MLP.to_excel(excel_writer='MLP prediction.xls')

### AdaBoost Classifier

In [38]:
# Predictors are every column except the last; the last column is the label.
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Fit the scaler on the training rows only and reuse it for the held-out rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Pin the ensemble's seed (same value as the split) so that the boosted
# estimators, and therefore the reported scores, are reproducible on rerun.
model = AdaBoostClassifier(random_state=30)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# Share of held-out rows whose prediction equals the true label.
np.mean(y_pred == y_test)

0.4780278670953912

In [39]:
# Display names for the encoded classes, listed in label order (0, 1, 2).
target_names = ['Local win', 'Draw', 'Visitor win']
# Per-class precision / recall / F1 of the AdaBoost classifier.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.49      0.85      0.62       419
        Draw       0.28      0.06      0.09       270
 Visitor win       0.48      0.31      0.38       244

    accuracy                           0.48       933
   macro avg       0.42      0.40      0.36       933
weighted avg       0.43      0.48      0.41       933



In [40]:
# Confusion matrix of AdaBoost: one row per true class, one column per predicted class.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[355,  24,  40],
       [212,  15,  43],
       [153,  15,  76]], dtype=int64)

In [41]:
# One row per held-out match: column 0 is the AdaBoost prediction, column 1 the true label.
prediction_ADA = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_ADA

Unnamed: 0,0,1
0,0.0,0.0
1,1.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [42]:
# Name the positional columns: 0 holds the predictions, 1 the true labels.
prediction_ADA = prediction_ADA.rename({0: 'pred', 1: 'test'}, axis='columns')

In [43]:
# Flag every row with 1 when the AdaBoost prediction equals the true label, else 0.
# A vectorised comparison replaces the row-wise apply: it yields the same
# int64 0/1 column without positional lookups on a string-labelled row.
prediction_ADA['success'] = (prediction_ADA['pred'] == prediction_ADA['test']).astype('int64')
prediction_ADA.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


In [44]:
# Write the AdaBoost prediction table to a spreadsheet.
# NOTE(review): the legacy .xls format needs the xlwt writer, which recent pandas
# releases no longer ship — confirm the target environment or move to .xlsx.
prediction_ADA.to_excel(excel_writer='ADA prediction.xls')

###  GaussianNB 

In [45]:
# Predictors are every column except the last; the last column is the label.
features, target = data.values[:, :-1], data.values[:, -1]
X = features
y = target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Fit the scaler on the training rows only and reuse it for the held-out rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gaussian naive Bayes with default hyper-parameters.
model = GaussianNB()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Share of held-out rows whose prediction equals the true label.
(y_pred == y_test).mean()

0.4362272240085745

In [46]:
# Display names for the encoded classes, listed in label order (0, 1, 2).
target_names = ['Local win', 'Draw', 'Visitor win']
# Per-class precision / recall / F1 of Gaussian naive Bayes.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.55      0.49      0.52       419
        Draw       0.32      0.44      0.37       270
 Visitor win       0.44      0.34      0.38       244

    accuracy                           0.44       933
   macro avg       0.44      0.42      0.42       933
weighted avg       0.46      0.44      0.44       933



In [47]:
# Confusion matrix of GaussianNB: one row per true class, one column per predicted class.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[204, 159,  56],
       [100, 120,  50],
       [ 64,  97,  83]], dtype=int64)

In [48]:
# One row per held-out match: column 0 is the GaussianNB prediction, column 1 the true label.
prediction_GNB = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_GNB

Unnamed: 0,0,1
0,2.0,0.0
1,1.0,1.0
2,1.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,1.0,0.0
931,2.0,2.0


In [49]:
# Name the positional columns: 0 holds the predictions, 1 the true labels.
prediction_GNB = prediction_GNB.rename({0: 'pred', 1: 'test'}, axis='columns')

In [50]:
# Flag every row with 1 when the GaussianNB prediction equals the true label, else 0.
# A vectorised comparison replaces the row-wise apply: it yields the same
# int64 0/1 column without positional lookups on a string-labelled row.
prediction_GNB['success'] = (prediction_GNB['pred'] == prediction_GNB['test']).astype('int64')
prediction_GNB.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,1.0,0.0,0
931,2.0,2.0,1
932,0.0,0.0,1


In [51]:
# Write the GaussianNB prediction table to a spreadsheet.
# NOTE(review): the legacy .xls format needs the xlwt writer, which recent pandas
# releases no longer ship — confirm the target environment or move to .xlsx.
prediction_GNB.to_excel(excel_writer='GNB prediction.xls')

###  Quadratic Discriminant Analysis 

In [52]:
# Predictors are every column except the last; the last column is the label.
features, target = data.values[:, :-1], data.values[:, -1]
X = features
y = target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Fit the scaler on the training rows only and reuse it for the held-out rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Quadratic discriminant analysis with default hyper-parameters.
model = QuadraticDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Share of held-out rows whose prediction equals the true label.
(y_pred == y_test).mean()



0.45766345123258306

In [53]:
# Display names for the encoded classes, listed in label order (0, 1, 2).
target_names = ['Local win', 'Draw', 'Visitor win']
# Per-class precision / recall / F1 of quadratic discriminant analysis.
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.46      0.93      0.62       419
        Draw       0.20      0.01      0.03       270
 Visitor win       0.47      0.13      0.21       244

    accuracy                           0.46       933
   macro avg       0.38      0.36      0.28       933
weighted avg       0.39      0.46      0.34       933



In [54]:
# Confusion matrix of QDA: one row per true class, one column per predicted class.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[391,   7,  21],
       [251,   4,  15],
       [203,   9,  32]], dtype=int64)

In [55]:
# One row per held-out match: column 0 is the QDA prediction, column 1 the true label.
prediction_QDA = pd.DataFrame(np.column_stack((y_pred, y_test)))
prediction_QDA

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,0.0,2.0


In [56]:
# Name the positional columns: 0 holds the predictions, 1 the true labels.
prediction_QDA = prediction_QDA.rename({0: 'pred', 1: 'test'}, axis='columns')

In [57]:
# Flag every row with 1 when the QDA prediction equals the true label, else 0.
# A vectorised comparison replaces the row-wise apply: it yields the same
# int64 0/1 column without positional lookups on a string-labelled row.
prediction_QDA['success'] = (prediction_QDA['pred'] == prediction_QDA['test']).astype('int64')
prediction_QDA.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,0.0,2.0,0
932,0.0,0.0,1


In [58]:
# Write the QDA prediction table to a spreadsheet.
# NOTE(review): the legacy .xls format needs the xlwt writer, which recent pandas
# releases no longer ship — confirm the target environment or move to .xlsx.
prediction_QDA.to_excel(excel_writer='QDA prediction.xls')

We create a dataframe that collects all these results so that they can be used in Tableau for the front-end.

Surprisingly, we find that all models improve their results with the non-featured dataset. 

There are two possible explanations for this result. On the one hand, there may be an overfitting problem: the featurised dataset adds a large amount of very specific match information while the volume of data is not very large, so the models may learn too much detail.
On the other hand, an error may have occurred when featurising, but this section has been reviewed and the results are logical and there is no evidence of any error having been made.

Strange as it may seem, the result is consistent with what we saw earlier: before featurising, the best-performing model was the simplest one, logistic regression, so it is not surprising that more sophisticated models fed with more information may give worse results.