In [1]:
import pandas as pd
import numpy as np

from statistics import mean

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import classification_report

# Import dataset

In [2]:
compressed_final = pd.read_csv("../DataFormating/compressed_final.csv")


# Setup `X`, `y` data for training / testing

In [3]:
compressed_final.columns[:10]

Index(['Stage', 'Home Team Name', 'Home Team Goals', 'Away Team Goals',
       'Away Team Name', 'Attendance', 'Half-time Home Goals',
       'Half-time Away Goals', 'Home Team Initials', 'Away Team Initials'],
      dtype='object')

In [4]:
X = compressed_final.drop(["Home Team Goals", "Away Team Goals", 
                           "Half-time Home Goals", "Half-time Away Goals", 
                           "Home Team Initials", "Away Team Initials"], axis=1)

y = []

In [5]:
for i in range(len(compressed_final)):
    home_team_goals = compressed_final.iloc[i]["Home Team Goals"]
    away_team_goals = compressed_final.iloc[i]["Away Team Goals"]
    
    if home_team_goals > away_team_goals:
        y.append(1)
    elif home_team_goals < away_team_goals:
        y.append(2)
    else:
        y.append(0)

In [6]:
# Test
assert len(X) == len(y)

### Encode textual features from the `X` dataset

In [7]:
X["Stage"] = LabelEncoder().fit_transform(X["Stage"])
X["Home Team Name"] = LabelEncoder().fit_transform(X["Home Team Name"])
X["Away Team Name"] = LabelEncoder().fit_transform(X["Away Team Name"])

In [8]:
len(X.columns)

413

# Fast testing

In [9]:
mean(
    cross_validate(
        MLPClassifier(hidden_layer_sizes=(500, 500)),
        X, y, cv=50
    )["test_score"]
)



0.35966666666666663

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [11]:
model = RandomForestClassifier(n_estimators=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [12]:
print( classification_report(y_test, y_pred) )

             precision    recall  f1-score   support

          0       0.50      0.09      0.15        11
          1       0.32      0.92      0.47        13
          2       0.71      0.22      0.33        23

avg / total       0.55      0.38      0.33        47

