In [23]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


## Loading data

For this notebook we resort again to the data set used for plotting, as it will also be used for modelling in this section.

In [24]:
data = pd.read_csv('plotting_data', index_col = [0])
data.head()

Unnamed: 0,year,round,division,local_goals,visitor_goals,points_local,wins_local,draws_local,losses_local,gf_local,...,pos_local,points_visitor,wins_visitor,draws_visitor,losses_visitor,gf_visitor,ga_visitor,avg_visitor,pos_visitor,match_winner
0,2016,2,1,3,1,1,0.0,1.0,0.0,1,...,7,3,1.0,0.0,0.0,1,0,1.0,5,0
1,2016,2,1,0,0,1,0.0,1.0,0.0,0,...,10,1,0.0,1.0,0.0,0,0,0.0,13,1
2,2016,2,1,1,0,3,1.0,0.0,0.0,1,...,4,1,0.0,1.0,0.0,0,0,0.0,9,0
3,2016,2,1,3,0,3,1.0,0.0,0.0,2,...,2,1,0.0,1.0,0.0,0,0,0.0,11,0
4,2016,2,1,5,0,1,0.0,1.0,0.0,0,...,12,1,0.0,1.0,0.0,1,1,0.0,6,0


It is important to drop the columns referred to the goals from the match in order to do not give this information to the model, as these columns contain information related to the result of the match, namely the goals scored by each team.

In [26]:
data = data.drop(['local_goals', 'visitor_goals'], axis=1)

## Modelling

We start with models based on this data before performing feature engineering. We will observe which classifiers obtain the best prediction values, and based on them, we will have a base prediction result that we will try to improve.

### Logistic Regression

In [27]:
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression(max_iter = 500)
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.5074946466809421

In [28]:
confusion_matrix(y_test, y_pred)

array([[191,   5,  22],
       [108,   9,  14],
       [ 79,   2,  37]], dtype=int64)

In [29]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.51      0.88      0.64       218
        Draw       0.56      0.07      0.12       131
 Visitor win       0.51      0.31      0.39       118

    accuracy                           0.51       467
   macro avg       0.52      0.42      0.38       467
weighted avg       0.52      0.51      0.43       467



### Decision Tree

In [30]:
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train_scaled, y_train)

predictions_dt = model.predict(X_test)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.3747323340471092

In [31]:
confusion_matrix(y_test, y_pred)

array([[90, 72, 56],
       [51, 39, 41],
       [37, 35, 46]], dtype=int64)

In [32]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, predictions_dt, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.49      0.21      0.29       218
        Draw       0.25      0.22      0.24       131
 Visitor win       0.19      0.42      0.26       118

    accuracy                           0.27       467
   macro avg       0.31      0.28      0.26       467
weighted avg       0.35      0.27      0.27       467



### Random Forest

In [33]:
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.44539614561027835

In [34]:
confusion_matrix(y_test, y_pred)

array([[151,  38,  29],
       [ 85,  25,  21],
       [ 61,  25,  32]], dtype=int64)

In [35]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.51      0.69      0.59       218
        Draw       0.28      0.19      0.23       131
 Visitor win       0.39      0.27      0.32       118

    accuracy                           0.45       467
   macro avg       0.39      0.38      0.38       467
weighted avg       0.42      0.45      0.42       467



### KNeighbors

In [36]:
from sklearn.neighbors import KNeighborsClassifier
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model =KNeighborsClassifier()
model.fit(X_train_scaled, y_train)

predictions = model.predict(X_test)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.45610278372591007

In [37]:
confusion_matrix(y_test, y_pred)

array([[147,  51,  20],
       [ 80,  35,  16],
       [ 66,  21,  31]], dtype=int64)

In [38]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   Local win       0.50      0.67      0.58       218
        Draw       0.33      0.27      0.29       131
 Visitor win       0.46      0.26      0.34       118

    accuracy                           0.46       467
   macro avg       0.43      0.40      0.40       467
weighted avg       0.44      0.46      0.44       467



The best prediction result is obtained by one of the simplest models, Logistic Regression. With this model a 51% success rate is obtained in predicting results. Logistic Regression also obtains the best results predicting the categories of 'Local win' and 'Visitor win', obtaining also a very low accuracy in the prediction of 'Draw' (only 12%). 

It can be observed that for all the classifiers the most difficult category to predict is 'Draw', the model that obtains the best results for this category is KNeighbors (29%), which is the second classifier with the best overall results (46% prediction accuracy).

The logic of these results may be due to the fact that the dataset is not very large, and therefore does not have a sufficiently large number of examples to learn more complex patterns. That is why the model that gives the best results is the simplest one, such as logistic regression.

