In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

data = [
    # England
    "https://www.football-data.co.uk/mmz4281/1920/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2021/E0.csv",
    "https://www.football-data.co.uk/mmz4281/2122/E0.csv",
    # Spain
    "https://www.football-data.co.uk/mmz4281/1920/SP1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/SP1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/SP1.csv",
    # Germany
    "https://www.football-data.co.uk/mmz4281/1920/D1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/D1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/D1.csv",
    # Italy
    "https://www.football-data.co.uk/mmz4281/1920/I1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/I1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/I1.csv",
    # France
    "https://www.football-data.co.uk/mmz4281/1920/F1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/F1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/F1.csv",
    # Portugal
    "https://www.football-data.co.uk/mmz4281/1920/P1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/P1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/P1.csv",
    # Holland
    "https://www.football-data.co.uk/mmz4281/1920/N1.csv",
    "https://www.football-data.co.uk/mmz4281/2021/N1.csv",
    "https://www.football-data.co.uk/mmz4281/2122/N1.csv",
]

def target(FTHG, FTAG):
    if (FTHG > FTAG):
        return 1
    return 0

li = []

for leauge in data:
    df = pd.read_csv(leauge)
    li.append(df)

df = pd.concat(li)

df = df[["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5"]]

print(df.isnull().sum().sum())
df = df.dropna()

df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
df = df.sort_values("Date")

df["target"] = np.vectorize(target)(df["FTHG"], df["FTAG"])

df = df.reset_index(drop=True)

df.tail(10)

7


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365CH,B365CD,B365CA,B365C>2.5,B365C<2.5,target
6575,2022-03-20,Bordeaux,Montpellier,0,2,2.45,3.4,2.87,1.92,1.98,0
6576,2022-03-20,Lorient,Strasbourg,0,0,3.4,3.3,2.2,2.37,1.57,0
6577,2022-03-20,Rennes,Metz,6,1,1.25,5.75,13.0,1.66,2.2,1
6578,2022-03-20,Reims,Lyon,0,0,4.2,3.6,1.85,1.99,1.91,0
6579,2022-03-20,Marseille,Nice,2,1,1.9,3.5,4.2,2.01,1.89,1
6580,2022-03-20,Gil Vicente,Maritimo,1,1,1.66,3.6,5.0,2.0,1.85,0
6581,2022-03-20,Pacos Ferreira,Moreirense,2,1,2.4,3.2,2.9,2.4,1.53,1
6582,2022-03-20,Benfica,Estoril,2,1,1.33,4.75,9.0,1.57,2.35,1
6583,2022-03-20,Boavista,Porto,0,1,8.5,4.75,1.33,1.66,2.15,0
6584,2022-03-20,Willem II,AZ Alkmaar,2,2,5.5,4.0,1.61,2.05,1.8,0


In [2]:
team_data = pd.get_dummies(df[["HomeTeam", "AwayTeam"]])
odds = df[["B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5"]]

X = pd.concat([team_data, odds], axis=1)
y = df["target"]

test_size_1 = int(0.8 * len(X))

X_train = X[:test_size_1]
y_train = y[:test_size_1]

X_other = X[test_size_1:]
y_other = y[test_size_1:]

print(X_train.shape, y_train.shape, X_other.shape, y_other.shape)

test_size_2 = int(0.5 * len(X_other))

X_val = X_other[:test_size_2]
y_val = y_other[:test_size_2]

X_test = X_other[test_size_2:]
y_test = y_other[test_size_2:]

print(X_val.shape, y_val.shape, X_test.shape, y_test.shape)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

(5268, 333) (5268,) (1317, 333) (1317,)
(658, 333) (658,) (659, 333) (659,)


In [3]:
start = 0.001
end = 0.5
jump = 0.001

scores = []
for c in np.arange(start, end + jump, jump):
    model = LogisticRegression(C=c)
    model.fit(X_train, y_train);
    score = model.score(X_val, y_val)
    scores.append(score)
    
best_index = np.argmax(scores)

print(best_index + (start / jump))
print((best_index + (start / jump)) * jump)
print(scores[best_index])

62.0
0.062
0.6884498480243161


In [4]:
model = LogisticRegression(C=0.062)
model.fit(X_train, y_train);

print(classification_report(y_val, model.predict(X_val)))

y_val.value_counts() / len(y_val)

              precision    recall  f1-score   support

           0       0.68      0.89      0.77       381
           1       0.73      0.42      0.53       277

    accuracy                           0.69       658
   macro avg       0.70      0.65      0.65       658
weighted avg       0.70      0.69      0.67       658



0    0.579027
1    0.420973
Name: target, dtype: float64

In [5]:
print(classification_report(y_test, model.predict(X_test)))

y_test.value_counts() / len(y_test)

              precision    recall  f1-score   support

           0       0.67      0.90      0.77       385
           1       0.73      0.37      0.49       274

    accuracy                           0.68       659
   macro avg       0.70      0.64      0.63       659
weighted avg       0.69      0.68      0.65       659



0    0.584219
1    0.415781
Name: target, dtype: float64

In [6]:
test = pd.DataFrame(columns=team_data.columns, data=np.zeros(len(team_data.columns)).reshape(1, len(team_data.columns)))

In [7]:
test["HomeTeam_Man United"] = 1
test["AwayTeam_Leicester"] = 1

test["B365CH"] = 1.53
test["B365CD"] = 4.33
test["B365CA"] = 5.5
test["B365C>2.5"] = 1.53
test["B365C<2.5"] = 2.5

test = scaler.transform(test)

model.predict_proba(test)

array([[0.54363624, 0.45636376]])

In [23]:
1 / model.predict_proba(test)

array([[1.38681538, 3.58521264]])