In [2]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


In [3]:
data = pd.read_csv('plotting_data', index_col = [0])
data.head()

Unnamed: 0,year,round,division,local_goals,visitor_goals,points_local,wins_local,draws_local,losses_local,gf_local,...,pos_local,points_visitor,wins_visitor,draws_visitor,losses_visitor,gf_visitor,ga_visitor,avg_visitor,pos_visitor,match_winner
0,2016,2,1,3,1,1,0.0,1.0,0.0,1,...,7,3,1.0,0.0,0.0,1,0,1.0,5,0
1,2016,2,1,0,0,1,0.0,1.0,0.0,0,...,10,1,0.0,1.0,0.0,0,0,0.0,13,1
2,2016,2,1,1,0,3,1.0,0.0,0.0,1,...,4,1,0.0,1.0,0.0,0,0,0.0,9,0
3,2016,2,1,3,0,3,1.0,0.0,0.0,2,...,2,1,0.0,1.0,0.0,0,0,0.0,11,0
4,2016,2,1,5,0,1,0.0,1.0,0.0,0,...,12,1,0.0,1.0,0.0,1,1,0.0,6,0


In [4]:
data['match_winner'].astype(str)

0       0
1       1
2       0
3       0
4       0
       ..
4657    0
4658    0
4659    2
4660    1
4661    0
Name: match_winner, Length: 4662, dtype: object

In [5]:
data

Unnamed: 0,year,round,division,local_goals,visitor_goals,points_local,wins_local,draws_local,losses_local,gf_local,...,pos_local,points_visitor,wins_visitor,draws_visitor,losses_visitor,gf_visitor,ga_visitor,avg_visitor,pos_visitor,match_winner
0,2016,2,1,3,1,1,0.0,1.0,0.0,1,...,7,3,1.0,0.0,0.0,1,0,1.0,5,0
1,2016,2,1,0,0,1,0.0,1.0,0.0,0,...,10,1,0.0,1.0,0.0,0,0,0.0,13,1
2,2016,2,1,1,0,3,1.0,0.0,0.0,1,...,4,1,0.0,1.0,0.0,0,0,0.0,9,0
3,2016,2,1,3,0,3,1.0,0.0,0.0,2,...,2,1,0.0,1.0,0.0,0,0,0.0,11,0
4,2016,2,1,5,0,1,0.0,1.0,0.0,0,...,12,1,0.0,1.0,0.0,1,1,0.0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,2021,38,2,2,1,44,9.0,17.0,11.0,38,...,14,40,9.0,13.0,15.0,35,42,-7.0,19,0
4658,2021,38,2,1,0,39,9.0,12.0,16.0,36,...,20,41,11.0,8.0,18.0,35,43,-8.0,18,0
4659,2021,38,2,1,4,41,10.0,11.0,16.0,25,...,17,58,16.0,10.0,11.0,39,34,5.0,6,2
4660,2021,38,2,1,1,57,16.0,9.0,12.0,43,...,7,62,18.0,8.0,11.0,41,31,10.0,4,1


It is important to drop the columns referred to the goals from the match in order to do not give that infromation to the model.

In [6]:
data = data.drop(['local_goals', 'visitor_goals'], axis=1)

## Modelling

We start with models based on this data before performing feature engineering. We will observe which classifiers obtain the best prediction values, and based on them, we will have a base prediction result that we will try to improve.

In [7]:
#### Orden Cosas #

#-> Separar en train test split lo primero
#-> Hacer mix_max_scaler para X_train solo
#-> Entrenar con X_train y y_train
#-> Usar min_max_scaler con X_test
#-> Predecir con X_test para tener predictions y comparar con y_test


### Logistic Regression

In [8]:
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression(max_iter = 500)
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.5074946466809421

In [9]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.51      0.88      0.64       218
        Empate       0.56      0.07      0.12       131
Gana Visitante       0.51      0.31      0.39       118

      accuracy                           0.51       467
     macro avg       0.52      0.42      0.38       467
  weighted avg       0.52      0.51      0.43       467



### Decision Tree

In [10]:
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train_scaled, y_train)

predictions_dt = model.predict(X_test)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.37259100642398285

In [11]:
confusion_matrix(y_test, predictions_dt)

array([[ 45,  47, 126],
       [ 24,  31,  76],
       [ 24,  44,  50]], dtype=int64)

In [12]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, predictions_dt, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.48      0.21      0.29       218
        Empate       0.25      0.24      0.25       131
Gana Visitante       0.20      0.42      0.27       118

      accuracy                           0.27       467
     macro avg       0.31      0.29      0.27       467
  weighted avg       0.35      0.27      0.27       467



### Random Forest

In [17]:
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4475374732334047

In [18]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.52      0.70      0.59       218
        Empate       0.29      0.21      0.24       131
Gana Visitante       0.38      0.25      0.30       118

      accuracy                           0.45       467
     macro avg       0.39      0.39      0.38       467
  weighted avg       0.42      0.45      0.42       467



### KNeighbors

In [21]:
from sklearn.neighbors import KNeighborsClassifier
features = data.values[:, :-1]
target = data.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model =KNeighborsClassifier()
model.fit(X_train_scaled, y_train)

predictions = model.predict(X_test)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.45610278372591007

In [22]:
target_names = ['Gana Local', 'Empate', 'Gana Visitante']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.50      0.67      0.58       218
        Empate       0.33      0.27      0.29       131
Gana Visitante       0.46      0.26      0.34       118

      accuracy                           0.46       467
     macro avg       0.43      0.40      0.40       467
  weighted avg       0.44      0.46      0.44       467



The best prediction result is obtained by one of the simplest models, Logistic Regression. With this model a 51% success rate is obtained in predicting results. Logistic Regression also obtains the best results predicting the categories of 'Local win' and 'Visitor win', obtaining also a very low accuracy in the prediction of 'Draw' (only 12%). 

It can be observed that for all the classifiers the most difficult category to predict is 'Draw', the model that obtains the best results for this category is KNeighbors (29%), which is the second classifier with the best overall results (46% prediction accuracy).



In [None]:
#se debe a que logsitic regression funciona mejor con modelos mas basicos, ante la falta de datos funciona mejor en nuestro modelo