In [32]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

leagues_data = {
    "England1": [
        "https://www.football-data.co.uk/mmz4281/1920/E0.csv",
        "https://www.football-data.co.uk/mmz4281/2021/E0.csv",
        "https://www.football-data.co.uk/mmz4281/2122/E0.csv",
    ],

    "Spain1": [
        "https://www.football-data.co.uk/mmz4281/1920/SP1.csv",
        "https://www.football-data.co.uk/mmz4281/2021/SP1.csv",
        "https://www.football-data.co.uk/mmz4281/2122/SP1.csv",
    ],

    "Spain2": [
        "https://www.football-data.co.uk/mmz4281/1920/SP2.csv",
        "https://www.football-data.co.uk/mmz4281/2021/SP2.csv",
        "https://www.football-data.co.uk/mmz4281/2122/SP2.csv",
    ],

    "Germany1": [
        "https://www.football-data.co.uk/mmz4281/1920/D1.csv",
        "https://www.football-data.co.uk/mmz4281/2021/D1.csv",
        "https://www.football-data.co.uk/mmz4281/2122/D1.csv",
    ],

    "Italy1": [
        "https://www.football-data.co.uk/mmz4281/1920/I1.csv",
        "https://www.football-data.co.uk/mmz4281/2021/I1.csv",
        "https://www.football-data.co.uk/mmz4281/2122/I1.csv",
    ],

    "France1": [
        "https://www.football-data.co.uk/mmz4281/1920/F1.csv",
        "https://www.football-data.co.uk/mmz4281/2021/F1.csv",
        "https://www.football-data.co.uk/mmz4281/2122/F1.csv",
    ],

    "Portugal1": [
        "https://www.football-data.co.uk/mmz4281/1920/P1.csv",
        "https://www.football-data.co.uk/mmz4281/2021/P1.csv",
        "https://www.football-data.co.uk/mmz4281/2122/P1.csv",
    ],

    "Holland1": [
        "https://www.football-data.co.uk/mmz4281/1920/N1.csv",
        "https://www.football-data.co.uk/mmz4281/2021/N1.csv",
        "https://www.football-data.co.uk/mmz4281/2122/N1.csv",
    ],

    "Belgium1": [
        "https://www.football-data.co.uk/mmz4281/1920/B1.csv",
        "https://www.football-data.co.uk/mmz4281/2021/B1.csv",
        "https://www.football-data.co.uk/mmz4281/2122/B1.csv",
    ]
}


def target(home_goals, away_goals):
    if home_goals > away_goals:
        return 1
    return 0

league_name = "Germany1"
data = leagues_data[league_name]
li = []
for season in data:
    df = pd.read_csv(season)
    li.append(df)

df = pd.concat(li)
df = df[["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5"]]
print(f"number of null: {df.isnull().sum().sum()}\n")
df = df.dropna()

df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
df = df.sort_values("Date")

df["target"] = np.vectorize(target)(df["FTHG"], df["FTAG"])

df = df.reset_index(drop=True)

df.tail()

number of null: 0



Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365CH,B365CD,B365CA,B365C>2.5,B365C<2.5,target
848,2022-03-19,Mainz,Bielefeld,4,0,1.45,4.33,7.0,1.86,2.04,1
849,2022-03-19,Bayern Munich,Union Berlin,4,0,1.12,9.0,17.0,1.28,3.75,1
850,2022-03-20,Wolfsburg,Leverkusen,0,2,3.1,3.6,2.15,1.8,2.0,0
851,2022-03-20,RB Leipzig,Ein Frankfurt,0,0,1.4,4.75,8.0,1.5,2.62,0
852,2022-03-20,FC Koln,Dortmund,1,1,3.4,4.0,1.95,1.5,2.62,0


In [33]:
team_data = pd.get_dummies(df[["HomeTeam", "AwayTeam"]])
team_columns = team_data.columns

odds = df[["B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5"]]

x = pd.concat([team_data, odds], axis=1)
y = df["target"]

test_size_1 = int(0.7 * len(x))

x_train = x[:test_size_1]
y_train = y[:test_size_1]

x_other = x[test_size_1:]
y_other = y[test_size_1:]

test_size_2 = int(0.5 * len(x_other))

x_val = x_other[:test_size_2]
y_val = y_other[:test_size_2]

x_test = x_other[test_size_2:]
y_test = y_other[test_size_2:]

scaler = MinMaxScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_val = scaler.transform(x_val)
scaled_x_test = scaler.transform(x_test)

print(f"Train : {x_train.shape}, {y_train.shape}")
print(f"Validation : {x_val.shape}, {y_val.shape}")
print(f"Test : {x_test.shape}, {y_test.shape}\n")

Train : (597, 49), (597,)
Validation : (128, 49), (128,)
Test : (128, 49), (128,)



In [34]:
start = 0.001
end = 1
jump = 0.001

scores = []
for c in np.arange(start, end + jump, jump):
    model = LogisticRegression(C=c)
    model.fit(scaled_x_train, y_train)
    score = model.score(scaled_x_val, y_val)
    scores.append(score)

best_index = np.argmax(scores)
best_c = round((best_index + start / jump) * jump, 5)

print(f"Best C: {best_c}, Accuracy: {scores[best_index]}\n")

model = LogisticRegression(C=best_c)
model.fit(scaled_x_train, y_train)

print("******************************************** - Validation - ********************************************")
print(classification_report(y_val, model.predict(scaled_x_val)))
print(y_val.value_counts() / len(y_val))

print("*********************************************** - Test - ***********************************************")
print(classification_report(y_test, model.predict(scaled_x_test)))
print(y_test.value_counts() / len(y_test))

Best C: 0.276, Accuracy: 0.671875

******************************************** - Validation - ********************************************
              precision    recall  f1-score   support

           0       0.63      0.86      0.73        66
           1       0.76      0.47      0.58        62

    accuracy                           0.67       128
   macro avg       0.70      0.67      0.66       128
weighted avg       0.70      0.67      0.66       128

0    0.515625
1    0.484375
Name: target, dtype: float64
*********************************************** - Test - ***********************************************
              precision    recall  f1-score   support

           0       0.57      0.84      0.67        67
           1       0.62      0.30      0.40        61

    accuracy                           0.58       128
   macro avg       0.59      0.57      0.54       128
weighted avg       0.59      0.58      0.54       128

0    0.523438
1    0.476562
Name: target, dt

In [35]:
x_test

Unnamed: 0,HomeTeam_Augsburg,HomeTeam_Bayern Munich,HomeTeam_Bielefeld,HomeTeam_Bochum,HomeTeam_Dortmund,HomeTeam_Ein Frankfurt,HomeTeam_FC Koln,HomeTeam_Fortuna Dusseldorf,HomeTeam_Freiburg,HomeTeam_Greuther Furth,...,AwayTeam_Schalke 04,AwayTeam_Stuttgart,AwayTeam_Union Berlin,AwayTeam_Werder Bremen,AwayTeam_Wolfsburg,B365CH,B365CD,B365CA,B365C>2.5,B365C<2.5
725,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,3.10,3.30,2.35,2.10,1.72
726,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,3.00,3.60,2.25,1.80,2.00
727,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,2.30,3.30,3.20,2.10,1.72
728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.95,4.00,3.60,1.44,2.75
729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3.60,3.50,2.00,1.88,2.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.45,4.33,7.00,1.86,2.04
849,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1.12,9.00,17.00,1.28,3.75
850,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3.10,3.60,2.15,1.80,2.00
851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.40,4.75,8.00,1.50,2.62


In [36]:
i = 0
total = 0

while i < len(x_test):
    odd = x_test.iloc[i]["B365CH"]
    chance = model.predict_proba([x_test.iloc[i]])[0][1]
    kelly = (chance - (1 - chance) / (odd - 1)) * 100
    result = y_test.iloc[i]
    
    if kelly > 0:
        if 1 == result:
            total += kelly * (odd - 1)
        if 0 == result:
            total -= kelly
            
        print(kelly, odd, chance, result, total)
    
    i = i + 1
    
print(total)

13.48740664533778 3.1 0.41394694824261075 1 28.323553955209338
3.954466942816859 3.0 0.3596964462854457 1 36.232487840843056
67.2170521883737 2.3 0.81470507758646 1 123.61465568572885
76.12893202307929 1.95 0.8837050534457709 0 47.48572366264956
51.44256453297338 2.15 0.7402741823856716 0 -3.9568408703238163
31.361012238190806 3.2 0.5281069591375618 0 -35.317853108514626
77.17462409795291 1.9 0.8918797983587243 1 34.139308579642986
99.95798361190518 1.2 0.9999299726865086 1 54.13090530202402
20.821729766467634 2.2 0.5681185259989143 1 79.11698102178518
27.04722145872912 3.5 0.478908724705208 0 52.06975956305606
87.02402152850655 1.8 0.9423289845711402 0 -34.95426196545049
59.63406056568865 2.0 0.7981703028284433 0 -94.58832253113914
85.52754420243988 1.66 0.9424589106843996 0 -180.11586673357903
99.96008952735713 1.2 0.9999334825455952 1 -160.1238488281076
51.72536493696391 2.2 0.7366838087470758 0 -211.8492137650715
62.4748366354528 1.95 0.8171851015573341 1 -152.49811896139136
78.898