In [1]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [2]:
df_modelling = pd.read_csv("featured data", index_col = [0])


## Modelling

Once the dataset has been featurised, we model with different classifiers and analyse the results obtained for each of them. 

### Logistic Regression

In [3]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression(max_iter = 500)
model.fit(X_train_scaled, y_train)


X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4769560557341908

In [4]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.50      0.75      0.60       419
        Draw       0.36      0.19      0.25       270
 Visitor win       0.47      0.34      0.39       244

    accuracy                           0.48       933
   macro avg       0.45      0.42      0.41       933
weighted avg       0.45      0.48      0.44       933



In [5]:
confusion_matrix(y_test, y_pred)

array([[313,  57,  49],
       [178,  50,  42],
       [131,  31,  82]], dtype=int64)

In [6]:
prediction_LR = pd.DataFrame([y_pred, y_test]).T
prediction_LR

Unnamed: 0,0,1
0,0.0,0.0
1,1.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,1.0,2.0


In [7]:
prediction_LR = prediction_LR.rename(columns={0:'pred', 1:'test'})

In [8]:
def success(col):
    if col[0] == col[1]:
        return 1
    else:
        return 0

In [9]:
prediction_LR['success'] = prediction_LR.apply(lambda col: success (col),axis=1)
prediction_LR.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,1.0,0.0,0


### Decision Tree

In [10]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.39978563772775993

In [11]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.50      0.48      0.49       419
        Draw       0.32      0.32      0.32       270
 Visitor win       0.33      0.34      0.34       244

    accuracy                           0.40       933
   macro avg       0.38      0.38      0.38       933
weighted avg       0.40      0.40      0.40       933



In [12]:
confusion_matrix(y_test, y_pred)

array([[203, 113, 103],
       [116,  86,  68],
       [ 90,  70,  84]], dtype=int64)

In [13]:
prediction_DT = pd.DataFrame([y_pred, y_test]).T
prediction_DT

Unnamed: 0,0,1
0,1.0,0.0
1,1.0,1.0
2,1.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,1.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [14]:
prediction_DT = prediction_LR.rename(columns={0:'pred', 1:'test'})

In [15]:
prediction_DT['success'] = prediction_DT.apply(lambda col: success (col),axis=1)
prediction_DT.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,1.0,0.0,0


### Random Forest

In [16]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4737406216505895

In [17]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.50      0.76      0.61       419
        Draw       0.35      0.17      0.23       270
 Visitor win       0.45      0.31      0.37       244

    accuracy                           0.47       933
   macro avg       0.44      0.42      0.40       933
weighted avg       0.45      0.47      0.44       933



In [18]:
confusion_matrix(y_test, y_pred)

array([[320,  47,  52],
       [185,  47,  38],
       [129,  40,  75]], dtype=int64)

In [19]:
prediction_RF = pd.DataFrame([y_pred, y_test]).T
prediction_RF

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [20]:
prediction_RF = prediction_RF.rename(columns={0:'pred', 1:'test'})

In [21]:
prediction_RF['success'] = prediction_RF.apply(lambda col: success (col),axis=1)
prediction_RF.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


### KNeighbors

In [22]:
from sklearn.neighbors import KNeighborsClassifier
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model =KNeighborsClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.40943193997856375

In [23]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.47      0.62      0.53       419
        Draw       0.30      0.26      0.28       270
 Visitor win       0.38      0.22      0.28       244

    accuracy                           0.41       933
   macro avg       0.38      0.36      0.36       933
weighted avg       0.39      0.41      0.39       933



In [24]:
confusion_matrix(y_test, y_pred)

array([[259, 106,  54],
       [166,  70,  34],
       [130,  61,  53]], dtype=int64)

In [25]:
prediction_KN = pd.DataFrame([y_pred, y_test]).T
prediction_KN

Unnamed: 0,0,1
0,2.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,1.0,0.0
...,...,...
928,2.0,0.0
929,0.0,1.0
930,0.0,0.0
931,1.0,2.0


In [26]:
prediction_KN = prediction_KN.rename(columns={0:'pred', 1:'test'})

In [27]:
prediction_KN['success'] = prediction_KN.apply(lambda col: success (col),axis=1)
prediction_KN.tail()

Unnamed: 0,pred,test,success
928,2.0,0.0,0
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,0.0,0.0,1


### SVC

In [28]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = SVC()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4683815648445874

In [29]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.48      0.88      0.62       419
        Draw       0.34      0.09      0.14       270
 Visitor win       0.51      0.18      0.26       244

    accuracy                           0.47       933
   macro avg       0.44      0.38      0.34       933
weighted avg       0.45      0.47      0.39       933



In [30]:
confusion_matrix(y_test, y_pred)

array([[370,  29,  20],
       [225,  24,  21],
       [183,  18,  43]], dtype=int64)

In [31]:
prediction_SVC = pd.DataFrame([y_pred, y_test]).T
prediction_SVC

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,0.0,2.0


In [32]:
prediction_SVC = prediction_SVC.rename(columns={0:'pred', 1:'test'})

In [33]:
prediction_SVC['success'] = prediction_SVC.apply(lambda col: success (col),axis=1)
prediction_SVC.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,0.0,2.0,0
932,0.0,0.0,1


To check that there is no error and that there are more predictions than 0 

In [34]:
prediction_SVC['pred'].unique()

array([0., 2., 1.])

### Gaussian Proccess Classifier

In [35]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model =GaussianProcessClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4533762057877814

In [36]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.46      0.84      0.60       419
        Draw       0.33      0.10      0.15       270
 Visitor win       0.47      0.19      0.27       244

    accuracy                           0.45       933
   macro avg       0.42      0.37      0.34       933
weighted avg       0.43      0.45      0.38       933



In [37]:
confusion_matrix(y_test, y_pred)

array([[350,  39,  30],
       [221,  26,  23],
       [183,  14,  47]], dtype=int64)

In [38]:
prediction_GPC = pd.DataFrame([y_pred, y_test]).T
prediction_GPC

Unnamed: 0,0,1
0,2.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,1.0,2.0


In [39]:
prediction_GPC = prediction_GPC.rename(columns={0:'pred', 1:'test'})

In [40]:
prediction_GPC['success'] = prediction_GPC.apply(lambda col: success (col),axis=1)
prediction_GPC.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,0.0,0.0,1


### MLP Classifier

In [41]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = MLPClassifier(max_iter=1000)
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.41264737406216506

In [42]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.51      0.49      0.50       419
        Draw       0.32      0.33      0.32       270
 Visitor win       0.37      0.37      0.37       244

    accuracy                           0.41       933
   macro avg       0.40      0.40      0.40       933
weighted avg       0.41      0.41      0.41       933



In [43]:
confusion_matrix(y_test, y_pred)

array([[207, 121,  91],
       [117,  88,  65],
       [ 84,  70,  90]], dtype=int64)

In [44]:
prediction_MLP = pd.DataFrame([y_pred, y_test]).T
prediction_MLP

Unnamed: 0,0,1
0,2.0,0.0
1,1.0,1.0
2,1.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,1.0,2.0


In [45]:
prediction_MLP = prediction_MLP.rename(columns={0:'pred', 1:'test'})

In [46]:
prediction_MLP['success'] = prediction_MLP.apply(lambda col: success (col),axis=1)
prediction_MLP.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,0.0,0.0,1


### AdaBoost Classifier

In [47]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = AdaBoostClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4630225080385852

In [48]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.49      0.78      0.60       419
        Draw       0.30      0.14      0.19       270
 Visitor win       0.47      0.29      0.36       244

    accuracy                           0.46       933
   macro avg       0.42      0.40      0.38       933
weighted avg       0.43      0.46      0.42       933



In [49]:
confusion_matrix(y_test, y_pred)

array([[325,  54,  40],
       [193,  37,  40],
       [140,  34,  70]], dtype=int64)

In [50]:
prediction_ADA = pd.DataFrame([y_pred, y_test]).T
prediction_ADA

Unnamed: 0,0,1
0,2.0,0.0
1,2.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [51]:
prediction_ADA = prediction_ADA.rename(columns={0:'pred', 1:'test'})

In [52]:
prediction_ADA['success'] = prediction_ADA.apply(lambda col: success (col),axis=1)
prediction_ADA.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


###  GaussianNB 

In [53]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = GaussianNB()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.40514469453376206

In [54]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.65      0.26      0.38       419
        Draw       0.33      0.44      0.38       270
 Visitor win       0.36      0.61      0.46       244

    accuracy                           0.41       933
   macro avg       0.45      0.44      0.40       933
weighted avg       0.49      0.41      0.40       933



In [55]:
confusion_matrix(y_test, y_pred)

array([[111, 161, 147],
       [ 38, 118, 114],
       [ 21,  74, 149]], dtype=int64)

In [56]:
prediction_GNB = pd.DataFrame([y_pred, y_test]).T
prediction_GNB

Unnamed: 0,0,1
0,2.0,0.0
1,1.0,1.0
2,1.0,2.0
3,0.0,0.0
4,1.0,0.0
...,...,...
928,2.0,0.0
929,0.0,1.0
930,1.0,0.0
931,2.0,2.0


In [57]:
prediction_GNB = prediction_GNB.rename(columns={0:'pred', 1:'test'})

In [58]:
prediction_GNB['success'] = prediction_GNB.apply(lambda col: success (col),axis=1)
prediction_GNB.tail()

Unnamed: 0,pred,test,success
928,2.0,0.0,0
929,0.0,1.0,0
930,1.0,0.0,0
931,2.0,2.0,1
932,1.0,0.0,0


###  Quadratic Discriminant Analysis 

In [59]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = QuadraticDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)



0.3815648445873526

In [60]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.51      0.36      0.42       419
        Draw       0.32      0.25      0.28       270
 Visitor win       0.32      0.57      0.41       244

    accuracy                           0.38       933
   macro avg       0.38      0.39      0.37       933
weighted avg       0.41      0.38      0.38       933



In [61]:
confusion_matrix(y_test, y_pred)

array([[150,  95, 174],
       [ 87,  67, 116],
       [ 55,  50, 139]], dtype=int64)

In [62]:
prediction_QDA = pd.DataFrame([y_pred, y_test]).T
prediction_QDA

Unnamed: 0,0,1
0,1.0,0.0
1,2.0,1.0
2,2.0,2.0
3,2.0,0.0
4,2.0,0.0
...,...,...
928,1.0,0.0
929,0.0,1.0
930,2.0,0.0
931,1.0,2.0


In [63]:
prediction_QDA = prediction_QDA.rename(columns={0:'pred', 1:'test'})

In [64]:
prediction_QDA['success'] = prediction_QDA.apply(lambda col: success (col),axis=1)
prediction_QDA.tail()

Unnamed: 0,pred,test,success
928,1.0,0.0,0
929,0.0,1.0,0
930,2.0,0.0,0
931,1.0,2.0,0
932,1.0,0.0,0


The results obtained do not improve on those prior to feature engineering. 

Possible explanations for this result are, on the one hand, a possible overfitting problem. The model may incur this overfitting problem due to an excess of information that is too concrete, without having too large a volume of data and, therefore, learning too many details. 

On the other hand, an error may have occurred in the featurisation, but this section has been checked and the results are logical and there is no evidence that any error has been made.
The conclusion is that noise has most likely been introduced into the system with the featurisation and that is why the percentages are slightly lower.