In [2]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [3]:
df_modelling = pd.read_csv("featured data", index_col = [0])


## Modelling

Once the dataset has been featurised, we model with different classifiers and analyse the results obtained for each of them. 

### Logistic Regression

In [3]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression(max_iter = 500)
model.fit(X_train_scaled, y_train)


X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4790996784565916

In [4]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.51      0.75      0.61       419
        Empate       0.36      0.20      0.25       270
Gana Visitante       0.47      0.33      0.39       244

      accuracy                           0.48       933
     macro avg       0.45      0.43      0.42       933
  weighted avg       0.46      0.48      0.45       933



In [5]:
confusion_matrix(y_test, y_pred)

array([[313,  57,  49],
       [176,  53,  41],
       [126,  37,  81]], dtype=int64)

In [6]:
prediction_LR = pd.DataFrame([y_pred, y_test]).T
prediction_LR

Unnamed: 0,0,1
0,0.0,0.0
1,1.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,1.0,2.0


In [7]:
prediction_LR = prediction_LR.rename(columns={0:'pred', 1:'test'})

In [8]:
def success(col):
    if col[0] == col[1]:
        return 1
    else:
        return 0

In [9]:
prediction_LR['success'] = prediction_LR.apply(lambda col: success (col),axis=1)
prediction_LR.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,1.0,0.0,0


In [10]:
prediction_LR.to_excel('LogisticRegression prediction.xls', )

### Decision Tree

In [11]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.3965702036441586

In [12]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.50      0.50      0.50       419
        Empate       0.29      0.29      0.29       270
Gana Visitante       0.33      0.34      0.34       244

      accuracy                           0.40       933
     macro avg       0.38      0.38      0.38       933
  weighted avg       0.40      0.40      0.40       933



In [13]:
confusion_matrix(y_test, y_pred)

array([[209, 110, 100],
       [126,  77,  67],
       [ 82,  78,  84]], dtype=int64)

In [14]:
prediction_DT = pd.DataFrame([y_pred, y_test]).T
prediction_DT

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,1.0,0.0
931,2.0,2.0


In [15]:
prediction_DT = prediction_LR.rename(columns={0:'pred', 1:'test'})

In [16]:
prediction_DT['success'] = prediction_DT.apply(lambda col: success (col),axis=1)
prediction_DT.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,1.0,0.0,0


In [17]:
prediction_DT.to_excel('DecissionTree prediction.xls')

### Random Forest

In [18]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4683815648445874

In [19]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.50      0.77      0.61       419
        Empate       0.38      0.17      0.24       270
Gana Visitante       0.42      0.27      0.33       244

      accuracy                           0.47       933
     macro avg       0.43      0.41      0.39       933
  weighted avg       0.44      0.47      0.43       933



In [20]:
confusion_matrix(y_test, y_pred)

array([[324,  39,  56],
       [186,  46,  38],
       [141,  36,  67]], dtype=int64)

In [21]:
prediction_RF = pd.DataFrame([y_pred, y_test]).T
prediction_RF

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,1.0,0.0
931,2.0,2.0


In [22]:
prediction_RF = prediction_RF.rename(columns={0:'pred', 1:'test'})

In [23]:
prediction_RF['success'] = prediction_RF.apply(lambda col: success (col),axis=1)
prediction_RF.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,1.0,0.0,0
931,2.0,2.0,1
932,1.0,0.0,0


In [24]:
prediction_RF.to_excel('RandomForest prediction.xls')

### KNeighbors

In [25]:
from sklearn.neighbors import KNeighborsClassifier
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model =KNeighborsClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4255091103965702

In [26]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.48      0.64      0.55       419
        Empate       0.33      0.28      0.30       270
Gana Visitante       0.38      0.22      0.28       244

      accuracy                           0.43       933
     macro avg       0.39      0.38      0.37       933
  weighted avg       0.41      0.43      0.40       933



In [27]:
confusion_matrix(y_test, y_pred)

array([[269,  99,  51],
       [158,  75,  37],
       [137,  54,  53]], dtype=int64)

In [28]:
prediction_KN = pd.DataFrame([y_pred, y_test]).T
prediction_KN

Unnamed: 0,0,1
0,2.0,0.0
1,2.0,1.0
2,0.0,2.0
3,0.0,0.0
4,1.0,0.0
...,...,...
928,2.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [29]:
prediction_KN = prediction_KN.rename(columns={0:'pred', 1:'test'})

In [30]:
prediction_KN['success'] = prediction_KN.apply(lambda col: success (col),axis=1)
prediction_KN.tail()

Unnamed: 0,pred,test,success
928,2.0,0.0,0
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


In [31]:
prediction_KN.to_excel('KNeighbors prediction.xls')

### SVC

In [32]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = SVC()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.46195069667738475

In [33]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.47      0.87      0.61       419
        Empate       0.30      0.08      0.13       270
Gana Visitante       0.52      0.18      0.27       244

      accuracy                           0.46       933
     macro avg       0.43      0.38      0.34       933
  weighted avg       0.43      0.46      0.38       933



In [34]:
confusion_matrix(y_test, y_pred)

array([[365,  33,  21],
       [229,  22,  19],
       [181,  19,  44]], dtype=int64)

In [35]:
prediction_SVC = pd.DataFrame([y_pred, y_test]).T
prediction_SVC

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,0.0,2.0


In [36]:
prediction_SVC = prediction_SVC.rename(columns={0:'pred', 1:'test'})

In [37]:
prediction_SVC['success'] = prediction_SVC.apply(lambda col: success (col),axis=1)
prediction_SVC.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,0.0,2.0,0
932,0.0,0.0,1


In [38]:
prediction_SVC.to_excel('SVC prediction.xls')

In [39]:
prediction_SVC['pred'].unique()

array([0., 2., 1.])

### Gaussian Proccess Classifier

In [40]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model =GaussianProcessClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4640943193997856

In [41]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.47      0.85      0.61       419
        Empate       0.35      0.10      0.15       270
Gana Visitante       0.51      0.20      0.29       244

      accuracy                           0.46       933
     macro avg       0.44      0.38      0.35       933
  weighted avg       0.44      0.46      0.39       933



In [42]:
confusion_matrix(y_test, y_pred)

array([[358,  36,  25],
       [221,  26,  23],
       [182,  13,  49]], dtype=int64)

In [43]:
prediction_GPC = pd.DataFrame([y_pred, y_test]).T
prediction_GPC

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,1.0,2.0


In [44]:
prediction_GPC = prediction_GPC.rename(columns={0:'pred', 1:'test'})

In [45]:
prediction_GPC['success'] = prediction_GPC.apply(lambda col: success (col),axis=1)
prediction_GPC.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,1.0,2.0,0
932,0.0,0.0,1


In [46]:
prediction_GPC.to_excel('GPC prediction.xls')

In [47]:
prediction_GPC['pred'].unique()

array([0., 2., 1.])

### MLP Classifier

In [48]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = MLPClassifier(max_iter=1000)
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.43729903536977494

In [214]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.49      0.51      0.50       419
        Empate       0.30      0.28      0.29       270
Gana Visitante       0.34      0.35      0.35       244

      accuracy                           0.40       933
     macro avg       0.38      0.38      0.38       933
  weighted avg       0.40      0.40      0.40       933



In [215]:
confusion_matrix(y_test, y_pred)

array([[213, 114,  92],
       [123,  76,  71],
       [ 99,  60,  85]], dtype=int64)

In [216]:
prediction_MLP = pd.DataFrame([y_pred, y_test]).T
prediction_MLP

Unnamed: 0,0,1
0,2.0,0.0
1,2.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,2.0,0.0
929,2.0,1.0
930,0.0,0.0
931,2.0,2.0


In [217]:
prediction_MLP = prediction_MLP.rename(columns={0:'pred', 1:'test'})

In [218]:
prediction_MLP['success'] = prediction_MLP.apply(lambda col: success (col),axis=1)
prediction_MLP.tail()

Unnamed: 0,pred,test,success
928,2.0,0.0,0
929,2.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,2.0,0.0,0


In [219]:
prediction_MLP.to_excel('MLP prediction.xls')

### AdaBoost Classifier

In [220]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = AdaBoostClassifier()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4737406216505895

In [221]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.50      0.80      0.61       419
        Empate       0.31      0.13      0.19       270
Gana Visitante       0.50      0.30      0.37       244

      accuracy                           0.47       933
     macro avg       0.44      0.41      0.39       933
  weighted avg       0.44      0.47      0.43       933



In [222]:
confusion_matrix(y_test, y_pred)

array([[334,  49,  36],
       [198,  36,  36],
       [141,  31,  72]], dtype=int64)

In [223]:
prediction_ADA = pd.DataFrame([y_pred, y_test]).T
prediction_ADA

Unnamed: 0,0,1
0,0.0,0.0
1,2.0,1.0
2,0.0,2.0
3,0.0,0.0
4,0.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,0.0,0.0
931,2.0,2.0


In [224]:
prediction_ADA = prediction_ADA.rename(columns={0:'pred', 1:'test'})

In [225]:
prediction_ADA['success'] = prediction_ADA.apply(lambda col: success (col),axis=1)
prediction_ADA.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,0.0,0.0,1
931,2.0,2.0,1
932,0.0,0.0,1


In [226]:
prediction_ADA.to_excel('ADA prediction.xls')

###  GaussianNB 

In [227]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = GaussianNB()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)

0.4040728831725616

In [228]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.65      0.26      0.38       419
        Empate       0.33      0.43      0.38       270
Gana Visitante       0.37      0.61      0.46       244

      accuracy                           0.40       933
     macro avg       0.45      0.44      0.40       933
  weighted avg       0.48      0.40      0.40       933



In [229]:
confusion_matrix(y_test, y_pred)

array([[111, 163, 145],
       [ 39, 117, 114],
       [ 21,  74, 149]], dtype=int64)

In [230]:
prediction_GNB = pd.DataFrame([y_pred, y_test]).T
prediction_GNB

Unnamed: 0,0,1
0,2.0,0.0
1,1.0,1.0
2,1.0,2.0
3,0.0,0.0
4,1.0,0.0
...,...,...
928,2.0,0.0
929,0.0,1.0
930,1.0,0.0
931,2.0,2.0


In [231]:
prediction_GNB = prediction_GNB.rename(columns={0:'pred', 1:'test'})

In [232]:
prediction_GNB['success'] = prediction_GNB.apply(lambda col: success (col),axis=1)
prediction_GNB.tail()

Unnamed: 0,pred,test,success
928,2.0,0.0,0
929,0.0,1.0,0
930,1.0,0.0,0
931,2.0,2.0,1
932,1.0,0.0,0


In [233]:
prediction_GNB.to_excel('GNB prediction.xls')

###  Quadratic Discriminant Analysis 

In [234]:
features = df_modelling.values[:, :-1]
target = df_modelling.values[:, -1]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = QuadraticDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

y_pred = model.predict(X_test_scaled)

np.mean(y_pred == y_test)



0.3783494105037513

In [235]:
target_names = ['Local win', 'Draw', 'Visitor win']
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

    Gana Local       0.58      0.27      0.37       419
        Empate       0.31      0.53      0.39       270
Gana Visitante       0.35      0.40      0.37       244

      accuracy                           0.38       933
     macro avg       0.41      0.40      0.38       933
  weighted avg       0.44      0.38      0.38       933



In [236]:
confusion_matrix(y_test, y_pred)

array([[113, 207,  99],
       [ 44, 142,  84],
       [ 38, 108,  98]], dtype=int64)

In [237]:
prediction_QDA = pd.DataFrame([y_pred, y_test]).T
prediction_QDA

Unnamed: 0,0,1
0,2.0,0.0
1,1.0,1.0
2,2.0,2.0
3,0.0,0.0
4,1.0,0.0
...,...,...
928,0.0,0.0
929,0.0,1.0
930,1.0,0.0
931,2.0,2.0


In [238]:
prediction_QDA = prediction_QDA.rename(columns={0:'pred', 1:'test'})

In [239]:
prediction_QDA['success'] = prediction_QDA.apply(lambda col: success (col),axis=1)
prediction_QDA.tail()

Unnamed: 0,pred,test,success
928,0.0,0.0,1
929,0.0,1.0,0
930,1.0,0.0,0
931,2.0,2.0,1
932,1.0,0.0,0


In [240]:
prediction_QDA.to_excel('QDA prediction.xls')

We  create a dataframe which to collect all this results in order to be used in Tableau during the front-end. 

In [241]:
Model_results = pd.DataFrame()

In [242]:
report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
df = pd.DataFrame(report).T
df

Unnamed: 0,precision,recall,f1-score,support
Gana Local,0.579487,0.26969,0.368078,419.0
Empate,0.310722,0.525926,0.390646,270.0
Gana Visitante,0.348754,0.401639,0.373333,244.0
accuracy,0.378349,0.378349,0.378349,0.378349
macro avg,0.412988,0.399085,0.377353,933.0
weighted avg,0.441368,0.378349,0.375984,933.0


In [243]:
QDA_results = df.rename_axis('class').reset_index()
QDA_results['model'] = 'QDA' 
QDA_results

Unnamed: 0,class,precision,recall,f1-score,support,model
0,Gana Local,0.579487,0.26969,0.368078,419.0,QDA
1,Empate,0.310722,0.525926,0.390646,270.0,QDA
2,Gana Visitante,0.348754,0.401639,0.373333,244.0,QDA
3,accuracy,0.378349,0.378349,0.378349,0.378349,QDA
4,macro avg,0.412988,0.399085,0.377353,933.0,QDA
5,weighted avg,0.441368,0.378349,0.375984,933.0,QDA
