In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def target(FTHG, FTAG):
    """Return the binary label for one match.

    FTHG and FTAG are compared with ``>`` (presumably full-time home and
    away goals, going by the column names they are fed from). The result is
    1 when FTHG is strictly greater than FTAG and 0 in every other case, so
    equal scores map to 0 as well.
    """
    return 1 if FTHG > FTAG else 0

# File name stem interpolated into the season URLs below. The identifier's
# spelling is kept as-is so anything else referring to it keeps working.
leauge = "E0"

# One CSV per season, listed oldest first. The rows are concatenated in this
# order, and the positional train/validation/test split further down takes
# its slices from that order.
data = [
    f"https://www.football-data.co.uk/mmz4281/{season}/{leauge}.csv"
    for season in ("1920", "2021", "2122")
]

# A comprehension keeps its loop variable local, so `leauge` still holds the
# setting above afterwards (a `for leauge in data:` loop would leave it bound
# to the last URL).
li = [pd.read_csv(url) for url in data]

# ignore_index=True yields a single 0..n-1 index across all seasons.
df = pd.concat(li, ignore_index=True)

# Explicit copy so that adding `target` below writes to an independent frame
# rather than to a column subset of the concatenated one.
df = df[["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5"]].copy()
df["target"] = np.vectorize(target)(df["FTHG"], df["FTAG"])

# Show how many cells are missing, then drop every row that has any.
print(df.isnull().sum().sum())
df = df.dropna()

df.tail()

0


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365CH,B365CD,B365CA,B365C>2.5,B365C<2.5,target
1039,13/03/2022,Arsenal,Leicester,2,0,1.4,4.75,7.0,1.66,2.2,1
1040,14/03/2022,Crystal Palace,Man City,0,0,9.0,5.25,1.33,1.66,2.2,0
1041,16/03/2022,Brighton,Tottenham,0,2,3.5,3.4,2.1,2.1,1.72,0
1042,16/03/2022,Arsenal,Liverpool,0,2,3.5,3.75,2.0,1.72,2.1,0
1043,17/03/2022,Everton,Newcastle,1,0,2.4,3.25,3.1,2.2,1.66,1


In [2]:
# Features: one-hot columns for the home and away team, followed by the five
# odds columns. Label: the binary `target` column.
team_data = pd.get_dummies(df[["HomeTeam", "AwayTeam"]])
odds = df[["B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5"]]

X = pd.concat([team_data, odds], axis=1)
y = df["target"]

# NOTE: despite the names, test_size_1 is the share of rows that goes to the
# training set, and test_size_2 is the share of the remaining rows that goes
# to the validation set (the rest becomes the test set).
test_size_1 = 0.7
test_size_2 = 0.5

# The split is positional (no shuffling): the first rows train, later rows
# validate and test.
train_end = int(test_size_1 * len(X))
X_train, X_other = X[:train_end], X[train_end:]
y_train, y_other = y[:train_end], y[train_end:]

print(X_train.shape, X_other.shape, y_train.shape, y_other.shape)

val_end = int(test_size_2 * len(X_other))
X_val, X_test = X_other[:val_end], X_other[val_end:]
y_val, y_test = y_other[:val_end], y_other[val_end:]

print(X_val.shape, X_test.shape, y_val.shape, y_test.shape)

(730, 53) (314, 53) (730,) (314,)
(157, 53) (157, 53) (157,) (157,)


In [3]:
# Validation accuracy for forests of 1..300 trees; scores[k] belongs to the
# forest with k + 1 trees.
scores = []

# warm_start=True keeps the trees that are already fitted and only grows the
# missing ones on each fit() call, so the whole sweep builds 300 trees instead
# of 1 + 2 + ... + 300.
# NOTE(review): with a fixed integer random_state the grown forest is expected
# to match one fitted from scratch at the same size; confirm on the installed
# scikit-learn version that the printed optimum is unchanged.
model = RandomForestClassifier(random_state=101, warm_start=True)
for i in range(1, 301):
    model.set_params(n_estimators=i)
    model.fit(X_train, y_train)
    score = model.score(X_val, y_val)
    scores.append(score)

# np.argmax is zero-based and returns the first maximum, hence the + 1 to get
# the (smallest) best number of trees.
print(np.argmax(scores) + 1)

24


In [4]:
# Refit with the forest size that scored best in the validation sweep. The
# value is read from `scores` rather than copied by hand from the printed
# output, so this cell stays consistent when the data or the sweep changes.
best_n_estimators = int(np.argmax(scores)) + 1
model = RandomForestClassifier(random_state=101, n_estimators=best_n_estimators)
model.fit(X_train, y_train)

# The forest size was chosen on this same validation split, so this report is
# an optimistic estimate; the test-split report is the unbiased one.
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

           0       0.74      0.77      0.76        92
           1       0.66      0.62      0.63        65

    accuracy                           0.71       157
   macro avg       0.70      0.69      0.70       157
weighted avg       0.70      0.71      0.71       157



In [5]:
# Share of each target value in the validation split (baseline to compare the
# accuracy above against).
y_val.value_counts().div(len(X_val))

0    0.585987
1    0.414013
Name: target, dtype: float64

In [6]:
# Evaluate the fitted model on the test split, which was not used for fitting
# or for picking the number of trees.
test_predictions = model.predict(X_test)
print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

           0       0.68      0.80      0.73        91
           1       0.63      0.47      0.54        66

    accuracy                           0.66       157
   macro avg       0.65      0.64      0.64       157
weighted avg       0.66      0.66      0.65       157



In [8]:
# Share of each target value in the test split.
y_test.value_counts().div(len(X_test))

0    0.579618
1    0.420382
Name: target, dtype: float64

In [13]:
# Single all-zero row with one column per one-hot team feature; later cells
# fill it in to describe one fixture.
n_team_features = len(team_data.columns)
test = pd.DataFrame(np.zeros((1, n_team_features)), columns=team_data.columns)

In [49]:
# Fixture to score.
home_team, away_team = "Napoli", "Udinese"

# Clear team flags left over from an earlier run of this cell, so re-running
# it with other teams never leaves two home (or away) teams switched on.
test[team_data.columns] = 0.0

for team_col in (f"HomeTeam_{home_team}", f"AwayTeam_{away_team}"):
    # Assigning to a name that is not an existing column would silently append
    # a new feature the model was never trained on, so fail early instead.
    if team_col not in team_data.columns:
        raise KeyError(f"{team_col!r} is not one of the one-hot team columns in team_data")
    test[team_col] = 1

# Odds for the fixture, added in the same order as the odds columns of X.
test["B365CH"] = 1.53
test["B365CD"] = 4.2
test["B365CA"] = 6
test["B365C>2.5"] = 1.84
test["B365C<2.5"] = 2.06

# NOTE(review): the recorded output of this cell has three probability
# columns, while `target` in this notebook only produces 0/1, and the
# execution count is far out of sequence. The output presumably comes from a
# different kernel state (other league / other label); confirm and re-run.
model.predict_proba(test)

array([[0.64220183, 0.28440367, 0.0733945 ]])

In [50]:
# Reciprocal of each predicted probability (presumably read as the implied
# decimal odds, to compare against the bookmaker odds entered above).
np.true_divide(1, model.predict_proba(test))

array([[ 1.55714286,  3.51612903, 13.625     ]])