In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def target(FTHG, FTAG):
    """Return 1 when FTHG is strictly greater than FTAG, otherwise 0.

    Equal values and FTAG-larger values both map to 0, so the label is
    binary ("first argument wins" vs. everything else). Judging by the
    dataset column names the arguments are presumably full-time home and
    away goals -- confirm against the data source.
    """
    return 1 if FTHG > FTAG else 0

# League code that is interpolated into every CSV file name below.
leauge = "I1"

# One CSV per season folder. The order of this list fixes the row order of
# the concatenated frame, which the later positional train/val/test slices
# depend on.
data = [
    f"https://www.football-data.co.uk/mmz4281/1920/{leauge}.csv",
    f"https://www.football-data.co.uk/mmz4281/2021/{leauge}.csv",
    f"https://www.football-data.co.uk/mmz4281/2122/{leauge}.csv"
]

# Read every season file. The iteration variable has its own name so that
# `leauge` keeps holding the league code instead of ending up as a URL.
li = [pd.read_csv(season_url) for season_url in data]

# ignore_index avoids repeated 0..n-1 labels from the individual files.
df = pd.concat(li, ignore_index=True)

# Explicit copy: a column is added right below, and writing to a column
# subset of another frame can trigger SettingWithCopyWarning on pandas
# versions without copy-on-write.
df = df[["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5"]].copy()
df["target"] = np.vectorize(target)(df["FTHG"], df["FTAG"])

# Report the total number of missing cells, then drop every row that has one.
print(df.isnull().sum().sum())
df = df.dropna()

# Re-number rows 0..n-1 after the drop.
df = df.reset_index(drop=True)

df.tail()

2


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365CH,B365CD,B365CA,B365C>2.5,B365C<2.5,target
1039,13/03/2022,Verona,Napoli,1,2,3.4,3.5,2.1,2.02,1.88,0
1040,13/03/2022,Atalanta,Genoa,0,0,1.33,5.25,8.5,1.66,2.2,0
1041,13/03/2022,Udinese,Roma,1,1,3.1,3.4,2.3,2.0,1.9,0
1042,13/03/2022,Torino,Inter,1,1,5.0,3.5,1.75,1.84,2.06,0
1043,14/03/2022,Lazio,Venezia,1,0,1.33,5.0,9.0,1.57,2.37,1


In [3]:
# Features: one-hot team indicators side by side with the five odds columns.
team_data = pd.get_dummies(df[["HomeTeam", "AwayTeam"]])
odds = df[["B365CH", "B365CD", "B365CA", "B365C>2.5", "B365C<2.5"]]

X = pd.concat([team_data, odds], axis=1)
y = df["target"]

# Hold out the last 200 rows; everything before them is training data.
# (A proportional cut, int(0.8 * len(X)), was the alternative considered.)
test_size_1 = -200

X_train, y_train = X.iloc[:test_size_1], y.iloc[:test_size_1]
X_other, y_other = X.iloc[test_size_1:], y.iloc[test_size_1:]

print(X_train.shape, y_train.shape, X_other.shape, y_other.shape)

# Split the held-out rows in half: first half validation, second half test.
test_size_2 = len(X_other) // 2

X_val, X_test = X_other.iloc[:test_size_2], X_other.iloc[test_size_2:]
y_val, y_test = y_other.iloc[:test_size_2], y_other.iloc[test_size_2:]

print(X_val.shape, y_val.shape, X_test.shape, y_test.shape)

# Fit the scaler on the training rows only, then apply it to all three sets.
# From here on X_train / X_val / X_test hold the scaler's output rather than
# the DataFrame slices.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

(844, 57) (844,) (200, 57) (200,)
(100, 57) (100,) (100, 57) (100,)


In [5]:
# Bounds and step of the grid of C values to try.
start = 0.001
end = 0.2
jump = 0.001

# NOTE(review): np.arange with a float step may or may not include the last
# point because of rounding -- confirm the grid length if it matters.
c_grid = np.arange(start, end + jump, jump)

# Fit one model per C on the training set and record its validation accuracy.
scores = []
for c in c_grid:
    model = LogisticRegression(random_state=101, C=c).fit(X_train, y_train)
    score = model.score(X_val, y_val)
    scores.append(score)

# Position of the first best score, expressed in units of `jump`
# (index + start/jump), followed by the corresponding C value.
best_step = np.argmax(scores) + (start / jump)
print(best_step)
print(best_step * jump)

70.0
0.07


In [6]:
# Refit on the training set with C fixed at 0.07, the value shown in the
# saved output of the sweep.
model = LogisticRegression(random_state=101, C=0.07)
model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the validation rows.
val_pred = model.predict(X_val)
print(classification_report(y_val, val_pred))

# Share of each class in the validation set, as a baseline for the accuracy.
y_val.value_counts().div(len(X_val))

              precision    recall  f1-score   support

           0       0.72      0.92      0.81        60
           1       0.79      0.47      0.59        40

    accuracy                           0.74       100
   macro avg       0.76      0.70      0.70       100
weighted avg       0.75      0.74      0.72       100



0    0.6
1    0.4
Name: target, dtype: float64

In [7]:
# Per-class precision / recall / F1 of the fitted model on the test rows.
test_pred = model.predict(X_test)
print(classification_report(y_test, test_pred))

# Share of each class in the test set, as a baseline for the accuracy.
y_test.value_counts().div(len(X_test))

              precision    recall  f1-score   support

           0       0.75      0.84      0.79        68
           1       0.54      0.41      0.46        32

    accuracy                           0.70       100
   macro avg       0.65      0.62      0.63       100
weighted avg       0.68      0.70      0.69       100



0    0.68
1    0.32
Name: target, dtype: float64

In [8]:
# Single all-zero row with one column per team dummy; the match to score is
# filled in afterwards.
n_team_cols = len(team_data.columns)
test = pd.DataFrame(np.zeros((1, n_team_cols)), columns=team_data.columns)

In [9]:
# Feature values for the one fixture to score: the two team indicator
# columns to switch on, plus the five odds columns.
fixture_values = {
    "HomeTeam_Napoli": 1,
    "AwayTeam_Udinese": 1,
    "B365CH": 1.53,
    "B365CD": 4.2,
    "B365CA": 6,
    "B365C>2.5": 1.84,
    "B365C<2.5": 2.06,
}

# Fail loudly on a misspelled or unseen column instead of silently adding a
# new one that the scaler and model were never fitted on.
unknown = [name for name in fixture_values if name not in X.columns]
if unknown:
    raise KeyError(f"columns not present in the training features: {unknown}")

# Build the row from scratch, in the training column order, on every run.
# The cell ends by rebinding `test` to the scaler's array output, so starting
# from the previous value of `test` would make a second run of this cell fail.
fixture = pd.DataFrame(np.zeros((1, len(X.columns))), columns=X.columns)
for name, value in fixture_values.items():
    fixture[name] = value

# `test` holds the scaled row (not a DataFrame) from here on.
test = scaler.transform(fixture)

model.predict_proba(test)

array([[0.47248872, 0.52751128]])

In [13]:
# Element-wise reciprocal of the predicted class probabilities (presumably
# read as break-even decimal odds -- confirm intent).
np.reciprocal(model.predict_proba(test))

array([[2.27288584, 1.7856164 ]])

In [None]:
# NOTE(review): this looks like a pasted array repr (three values in one row)
# rather than computed output -- confirm where it came from. The bare name
# `array` is not defined in this notebook, so it is spelled `np.array` and
# bound to a name to keep the cell runnable.
pasted_probs = np.array([[0.5984822 , 0.17946433, 0.22205347]])
pasted_probs