In [50]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def target(FTHG, FTAG):
    """Return the binary label for one match.

    Args:
        FTHG: first goal count (presumably full-time home goals, going by
            the column of the same name passed in by the caller).
        FTAG: second goal count (presumably full-time away goals).

    Returns:
        1 when ``FTHG`` is strictly greater than ``FTAG``; 0 otherwise.
        Equal values (a draw) therefore map to 0, as does any comparison
        involving NaN, because ``NaN > x`` is False.
    """
    return 1 if FTHG > FTAG else 0

# Division code interpolated into the football-data.co.uk URLs below.
# (The misspelt name is kept so anything else that reads `leauge` still works.)
leauge = "N1"

# One CSV per URL, in the order 1920, 2021, 2122. Rows keep this order after
# concatenation, which matters to any later split that slices by position.
data = [
    f"https://www.football-data.co.uk/mmz4281/1920/{leauge}.csv",
    f"https://www.football-data.co.uk/mmz4281/2021/{leauge}.csv",
    f"https://www.football-data.co.uk/mmz4281/2122/{leauge}.csv"
]

# Columns kept for modelling: match identity, goals and the B365C* odds columns.
columns = [
    "Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG",
    "B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5",
]

# The loop variable is `url`, not `leauge`: the original loop reused `leauge`
# and left the league code overwritten with the last URL.
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
df = pd.concat([pd.read_csv(url) for url in data], ignore_index=True)

# Take an explicit copy of the column subset so that adding `target` below
# writes to an independent frame and cannot raise SettingWithCopyWarning.
df = df[columns].copy()

# 1 when FTHG > FTAG, else 0. This is the same rule as target(), applied to
# whole columns at once instead of through np.vectorize. Rows with a missing
# goal count compare as False (0) and are removed by dropna() below anyway.
df["target"] = (df["FTHG"] > df["FTAG"]).astype(int)

# Report how many cells are missing before discarding the affected rows.
print(df.isnull().sum().sum())
df = df.dropna().reset_index(drop=True)

df.tail(10)

2


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365CH,B365CD,B365CA,B365C>2.5,B365C<2.5,target
768,13/03/2022,AZ Alkmaar,Twente,0,1,1.85,3.4,4.5,2.0,1.85,0
769,18/03/2022,Heerenveen,Heracles,2,0,2.2,3.3,3.3,2.1,1.7,1
770,19/03/2022,Go Ahead Eagles,Cambuur,3,0,1.9,3.6,3.75,1.88,1.98,1
771,19/03/2022,Twente,Zwolle,1,0,1.57,4.0,6.0,1.9,1.95,1
772,19/03/2022,Nijmegen,Sparta Rotterdam,0,0,2.45,3.3,2.9,2.1,1.7,0
773,20/03/2022,Utrecht,Groningen,1,3,1.85,3.6,4.33,1.98,1.88,0
774,20/03/2022,Ajax,Feyenoord,3,2,1.4,5.0,7.5,1.44,2.7,1
775,20/03/2022,PSV Eindhoven,For Sittard,5,0,1.12,8.5,19.0,1.25,4.0,1
776,20/03/2022,Vitesse,Waalwijk,1,2,1.53,4.33,5.75,1.72,2.07,0
777,20/03/2022,Willem II,AZ Alkmaar,2,2,5.5,4.0,1.61,2.05,1.8,0


In [51]:
# One-hot encode both team columns; get_dummies prefixes each new column with
# the source column name (e.g. "HomeTeam_<name>", "AwayTeam_<name>").
team_data = pd.get_dummies(df[["HomeTeam", "AwayTeam"]])
odds = df[["B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5"]]

# Feature matrix = team indicators followed by the five odds columns.
X = pd.concat([team_data, odds], axis=1)
y = df["target"]

# Positional (unshuffled) split: the first 80% of rows are used for training.
# Despite its name, test_size_1 is the number of *training* rows.
test_size_1 = int(0.8 * len(X))

X_train, X_other = X.iloc[:test_size_1], X.iloc[test_size_1:]
y_train, y_other = y.iloc[:test_size_1], y.iloc[test_size_1:]

print(X_train.shape, y_train.shape, X_other.shape, y_other.shape)

# The remaining rows are halved, again by position: first half is the
# validation set, second half is the test set.
test_size_2 = int(0.5 * len(X_other))

X_val, X_test = X_other.iloc[:test_size_2], X_other.iloc[test_size_2:]
y_val, y_test = y_other.iloc[:test_size_2], y_other.iloc[test_size_2:]

print(X_val.shape, y_val.shape, X_test.shape, y_test.shape)

# Learn the min/max of every feature from the training rows only, then apply
# that same scaling to all three partitions. From here on X_train, X_val and
# X_test are NumPy arrays rather than DataFrames.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (scaler.transform(part) for part in (X_train, X_val, X_test))

(622, 47) (622,) (156, 47) (156,)
(78, 47) (78,) (78, 47) (78,)


In [52]:
# Inclusive range of inverse-regularisation strengths (C) to try, and the
# spacing between consecutive candidates.
start = 0.001
end = 2
jump = 0.001

# Build the grid with linspace rather than np.arange(start, end + jump, jump):
# with a floating-point step, arange's length depends on rounding and may or
# may not include an extra point past `end`. linspace takes the point count
# explicitly and always ends exactly on `end`.
n_steps = int(round((end - start) / jump)) + 1
c_grid = np.linspace(start, end, n_steps)

# Fit one model per candidate C and record its accuracy on the validation set.
# NOTE(review): this picks C from many candidates using one small validation
# split, so the winning validation score is an optimistic estimate.
scores = []
for c in c_grid:
    model = LogisticRegression(C=c)
    model.fit(X_train, y_train)
    scores.append(model.score(X_val, y_val))

# np.argmax returns the first maximum, so ties resolve to the smallest C.
best_index = np.argmax(scores)

# Read the winning C straight from the grid instead of rebuilding it from the
# index with float arithmetic.
best_c = c_grid[best_index]

print(best_index + (start / jump))  # winning C expressed as a multiple of `jump`
print(best_c)
print(scores[best_index])

43.0
0.043000000000000003
0.7692307692307693


In [53]:
# Refit a single model with C fixed at 0.043 (the value printed by the grid
# search over C in the preceding cell's output).
model = LogisticRegression(C=0.043).fit(X_train, y_train)

# Per-class precision / recall / F1 on the validation rows.
val_predictions = model.predict(X_val)
print(classification_report(y_val, val_predictions))

# Share of each class in the validation labels, for comparison with accuracy.
y_val.value_counts().div(len(y_val))

              precision    recall  f1-score   support

           0       0.74      0.93      0.83        46
           1       0.85      0.53      0.65        32

    accuracy                           0.77        78
   macro avg       0.80      0.73      0.74        78
weighted avg       0.79      0.77      0.76        78



0    0.589744
1    0.410256
Name: target, dtype: float64

In [54]:
# Same report as for validation, now on the held-out test rows.
test_predictions = model.predict(X_test)
print(classification_report(y_test, test_predictions))

# Share of each class in the test labels, for comparison with accuracy.
y_test.value_counts().div(len(y_test))

              precision    recall  f1-score   support

           0       0.67      0.89      0.77        46
           1       0.71      0.38      0.49        32

    accuracy                           0.68        78
   macro avg       0.69      0.63      0.63        78
weighted avg       0.69      0.68      0.65        78



0    0.589744
1    0.410256
Name: target, dtype: float64

In [40]:
# A single all-zero row with one column per one-hot team feature; the fixture
# to predict is filled in on top of this template.
n_team_features = len(team_data.columns)
test = pd.DataFrame(np.zeros((1, n_team_features)), columns=team_data.columns)

In [31]:
# Fixture to score: the two teams and the five odds values the model uses.
# NOTE(review): the loading cell requests league "N1" and its displayed rows
# are all Dutch clubs, so these two teams are presumably absent from the
# fitted features. Confirm the league code or change the fixture.
home_team = "Napoli"
away_team = "Udinese"
fixture_odds = {
    "B365CH": 1.53,
    "B365CD": 4.2,
    "B365CA": 6,
    "B365C>2.5": 1.84,
    "B365C<2.5": 2.06,
}

home_col = f"HomeTeam_{home_team}"
away_col = f"AwayTeam_{away_team}"

# Assigning to a column that does not exist would silently append it, giving
# the row more features than the scaler and model were fitted on. Fail early
# with an explicit message instead.
missing = [col for col in (home_col, away_col) if col not in team_data.columns]
if missing:
    raise ValueError(
        f"Team column(s) {missing} are not among the one-hot features built "
        "from the loaded matches; pick teams that appear in the loaded data."
    )

# Build the row from scratch on every run (rather than mutating the existing
# `test` frame and then overwriting it), so the cell can be re-executed.
fixture = pd.DataFrame(0.0, index=[0], columns=team_data.columns)
fixture.loc[0, [home_col, away_col]] = 1.0

# Append the odds after the team columns, in the same order used for training.
for column, price in fixture_odds.items():
    fixture[column] = price

# `test` ends up as the scaled NumPy row, which later cells pass to the model.
test = scaler.transform(fixture)

model.predict_proba(test)

array([[0.47244433, 0.52755567]])

In [32]:
# Reciprocal of each predicted class probability, i.e. the break-even decimal
# odds implied by the model for class 0 and class 1 respectively.
np.reciprocal(model.predict_proba(test))

array([[2.11665152, 1.89553454]])