In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score

In [28]:
df_results = pd.read_csv("files/input/kaggle/results.csv")

In [29]:
df_results['date'] = pd.to_datetime(df_results['date'])

In [30]:
# Neue Spalten initialisieren
df_results['home_team_goals_letzte_Begegnung'] = None
df_results['home_team_goals_vorletzte_Begegnung'] = None
df_results['home_team_goals_vorvorletzte_Begegnung'] = None
df_results['away_team_goals_letzte_Begegnung'] = None
df_results['away_team_goals_vorletzte_Begegnung'] = None
df_results['away_team_goals_vorvorletzte_Begegnung'] = None

# Verwenden eines Wörterbuchs, um die letzten Begegnungen zu speichern
last_encounters = {}

# Für jedes Spiel die Anzahl der Tore des Heim- und Auswärtsteams in den letzten drei Begegnungen herausfinden
for i, row in df_results.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']
    current_date = row['date']
    
    # Erstellen eines Schlüssels für die Paarung der Teams
    match_key = tuple(sorted([home_team, away_team]))
    
    if match_key in last_encounters:
        previous_matches = last_encounters[match_key]
        
        if len(previous_matches) >= 1:
            last_match = previous_matches[-1]
            if last_match['home_team'] == home_team:
                df_results.at[i, 'home_team_goals_letzte_Begegnung'] = last_match['home_score']
                df_results.at[i, 'away_team_goals_letzte_Begegnung'] = last_match['away_score']
            else:
                df_results.at[i, 'home_team_goals_letzte_Begegnung'] = last_match['away_score']
                df_results.at[i, 'away_team_goals_letzte_Begegnung'] = last_match['home_score']
                
        if len(previous_matches) >= 2:
            second_last_match = previous_matches[-2]
            if second_last_match['home_team'] == home_team:
                df_results.at[i, 'home_team_goals_vorletzte_Begegnung'] = second_last_match['home_score']
                df_results.at[i, 'away_team_goals_vorletzte_Begegnung'] = second_last_match['away_score']
            else:
                df_results.at[i, 'home_team_goals_vorletzte_Begegnung'] = second_last_match['away_score']
                df_results.at[i, 'away_team_goals_vorletzte_Begegnung'] = second_last_match['home_score']
                
        if len(previous_matches) >= 3:
            third_last_match = previous_matches[-3]
            if third_last_match['home_team'] == home_team:
                df_results.at[i, 'home_team_goals_vorvorletzte_Begegnung'] = third_last_match['home_score']
                df_results.at[i, 'away_team_goals_vorvorletzte_Begegnung'] = third_last_match['away_score']
            else:
                df_results.at[i, 'home_team_goals_vorvorletzte_Begegnung'] = third_last_match['away_score']
                df_results.at[i, 'away_team_goals_vorvorletzte_Begegnung'] = third_last_match['home_score']
                
    # Aktualisieren der letzten Begegnung für das Team-Paar
    if match_key not in last_encounters:
        last_encounters[match_key] = []
    last_encounters[match_key].append(row)

In [31]:

# Neue Spalten initialisieren
df_results['home_team_geschossen_letztes_Spiel'] = None
df_results['home_team_kassiert_letztes_Spiel'] = None
df_results['home_team_geschossen_vorletztes_Spiel'] = None
df_results['home_team_kassiert_vorletztes_Spiel'] = None
df_results['home_team_geschossen_vorvorletztes_Spiel'] = None
df_results['home_team_kassiert_vorvorletztes_Spiel'] = None

df_results['away_team_geschossen_letztes_Spiel'] = None
df_results['away_team_kassiert_letztes_Spiel'] = None
df_results['away_team_geschossen_vorletztes_Spiel'] = None
df_results['away_team_kassiert_vorletztes_Spiel'] = None
df_results['away_team_geschossen_vorvorletztes_Spiel'] = None
df_results['away_team_kassiert_vorvorletztes_Spiel'] = None

# Verwenden von Wörterbüchern, um die letzten Spiele zu speichern
last_matches = {}

# Funktion zum Aktualisieren der letzten Spiele eines Teams
def update_last_matches(team, scored, conceded, last_matches):
    if team not in last_matches:
        last_matches[team] = []
    last_matches[team].append((scored, conceded))
    if len(last_matches[team]) > 3:
        last_matches[team].pop(0)

# Für jedes Spiel die Tore aus den letzten drei Spielen finden und die neuen Spalten füllen
for i, row in df_results.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']
    home_score = row['home_score']
    away_score = row['away_score']
    
    # Home Team letzte Spiele
    if home_team in last_matches:
        previous_matches = last_matches[home_team]
        if len(previous_matches) >= 1:
            df_results.at[i, 'home_team_geschossen_letztes_Spiel'] = previous_matches[-1][0]
            df_results.at[i, 'home_team_kassiert_letztes_Spiel'] = previous_matches[-1][1]
        if len(previous_matches) >= 2:
            df_results.at[i, 'home_team_geschossen_vorletztes_Spiel'] = previous_matches[-2][0]
            df_results.at[i, 'home_team_kassiert_vorletztes_Spiel'] = previous_matches[-2][1]
        if len(previous_matches) >= 3:
            df_results.at[i, 'home_team_geschossen_vorvorletztes_Spiel'] = previous_matches[-3][0]
            df_results.at[i, 'home_team_kassiert_vorvorletztes_Spiel'] = previous_matches[-3][1]
    
    # Away Team letzte Spiele
    if away_team in last_matches:
        previous_matches = last_matches[away_team]
        if len(previous_matches) >= 1:
            df_results.at[i, 'away_team_geschossen_letztes_Spiel'] = previous_matches[-1][0]
            df_results.at[i, 'away_team_kassiert_letztes_Spiel'] = previous_matches[-1][1]
        if len(previous_matches) >= 2:
            df_results.at[i, 'away_team_geschossen_vorletztes_Spiel'] = previous_matches[-2][0]
            df_results.at[i, 'away_team_kassiert_vorletztes_Spiel'] = previous_matches[-2][1]
        if len(previous_matches) >= 3:
            df_results.at[i, 'away_team_geschossen_vorvorletztes_Spiel'] = previous_matches[-3][0]
            df_results.at[i, 'away_team_kassiert_vorvorletztes_Spiel'] = previous_matches[-3][1]
    
    # Aktualisieren der letzten Spiele für Home und Away Team
    update_last_matches(home_team, home_score, away_score, last_matches)
    update_last_matches(away_team, away_score, home_score, last_matches)

In [32]:
df_results

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_team_goals_letzte_Begegnung,...,home_team_geschossen_vorletztes_Spiel,home_team_kassiert_vorletztes_Spiel,home_team_geschossen_vorvorletztes_Spiel,home_team_kassiert_vorvorletztes_Spiel,away_team_geschossen_letztes_Spiel,away_team_kassiert_letztes_Spiel,away_team_geschossen_vorletztes_Spiel,away_team_kassiert_vorletztes_Spiel,away_team_geschossen_vorvorletztes_Spiel,away_team_kassiert_vorvorletztes_Spiel
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False,,...,,,,,,,,,,
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False,0.0,...,,,,,0.0,0.0,,,,
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False,2.0,...,0.0,0.0,,,4.0,2.0,0.0,0.0,,
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False,1.0,...,4.0,2.0,0.0,0.0,2.0,1.0,2.0,4.0,0.0,0.0
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False,2.0,...,2.0,1.0,2.0,4.0,2.0,2.0,1.0,2.0,4.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47121,2024-07-06,,,,,UEFA Euro,Düsseldorf,Germany,True,,...,,,,,,,,,,
47122,2024-07-06,,,,,UEFA Euro,Berlin,Germany,True,,...,,,,,,,,,,
47123,2024-07-09,,,,,UEFA Euro,Munich,Germany,True,,...,,,,,,,,,,
47124,2024-07-10,,,,,UEFA Euro,Dortmund,Germany,True,,...,,,,,,,,,,


In [33]:
df_model = pd.read_csv("files/temp/results_fifa_ranking.csv")
df_model = df_model[df_model["home_score"] < 5]
df_model

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_win,home_team_strength,away_team_strength,home_team_form_goals,home_team_form_points,away_team_form_goals,away_team_form_points,home_advantage
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False,draw,1497.46,1794.90,0.0,0,0.0,0,1
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False,True,1794.90,1497.46,0.0,1,0.0,1,1
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False,True,1497.46,1794.90,0.4,1,0.8,4,1
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False,draw,1794.90,1497.46,1.0,4,0.8,4,1
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False,True,1497.46,1794.90,1.2,5,1.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47070,2024-03-26,Scotland,Northern Ireland,0.0,1.0,Friendly,Glasgow,Scotland,False,False,1497.46,1341.05,1.2,2,1.2,7,1
47071,2024-03-26,Senegal,Benin,1.0,0.0,Friendly,Amiens,France,True,True,1624.73,1225.68,2.4,13,1.0,3,0
47072,2024-03-26,Slovenia,Portugal,2.0,0.0,Friendly,Ljubljana,Slovenia,False,True,1427.84,1748.11,1.4,10,3.4,15,1
47073,2024-03-26,Spain,Brazil,3.0,3.0,Friendly,Madrid,Spain,False,draw,1727.50,1788.65,1.8,12,0.6,4,1


In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
import statsmodels.api as sm

# Assume df_model is already defined and loaded with data

# Define features and target variable
features = ['home_team_strength', 'away_team_strength', 'home_team_form_goals', 'home_team_form_points', 'away_team_form_goals', 'away_team_form_points', 'home_advantage']
# If additional features are to be included based on 'home_team_' or 'away_team_', they should be added here
# features.extend([col for col in df_model.columns if 'home_team_' in col or 'away_team_' in col])

# Drop rows with missing values
df_model = df_model.dropna(how="any")

# Convert features to numeric
X = df_model[features].apply(pd.to_numeric, errors='raise')

# Convert target variable to int
y = df_model["home_score"].astype(int)  # 1 for home win, 0 for away win or draw

# Add constant to X
X = sm.add_constant(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to numpy arrays and ensure correct dtype
X_train = np.asarray(X_train).astype(np.float64)
X_test = np.asarray(X_test).astype(np.float64)
y_train = np.asarray(y_train).astype(np.float64)
y_test = np.asarray(y_test).astype(np.float64)

# Train the model
log_reg = sm.OLS(y_train, X_train)
model = log_reg.fit()

# Make predictions on the test set
y_pred = model.predict(X_test)
df = pd.DataFrame(data=y_pred,columns=["y_pred"])
df['rounded_scores'] = df['y_pred'].apply(lambda x: round(x, 0))

# Access the rounded scores
rounded_scores = df['rounded_scores']

# Calculate evaluation metrics
accuracy = mean_squared_error(y_test, rounded_scores)

accuracy

1.3331007535584705

In [35]:
def rsquared_robust(y, y_pred, weights):
    numerator = np.sum(weights * (y - y_pred) ** 2)
    yw_bar = np.sum(weights * y)  / np.sum(weights)
    denominator = np.sum(weights * (y -yw_bar) ** 2)
    r2_robust = 1 - (numerator / denominator)
    return r2_robust

def adj_r2_robust(x,y,r_squared):
    n = len(x)
    q = x.shape[1]
    if n-q <= 0:
        return pd.NA
    
    r2_adj_robust = 1-(1-r_squared) * ((n-1) / (n-q))

    return r2_adj_robust

In [36]:
display(y)

0        0
1        4
2        2
3        2
4        3
        ..
47070    0
47071    1
47072    2
47073    3
47074    2
Name: home_score, Length: 35828, dtype: int32

In [37]:
from statsmodels.robust.norms import HuberT
import math
model = sm.RLM(y, X, M=HuberT())
result = model.fit()

y_pred = result.predict(X)

rsquard = rsquared_robust(y.squeeze(), y_pred, result.weights)
adj_r2 = adj_r2_robust(X,y,rsquard)
condition_nr = np.linalg.cond(X.mul(np.sqrt(result.weights), axis=0))
jb, jb_pv, skew, kurtosis = sm.stats.jarque_bera(result.resid, axis=0)


In [38]:
display(rsquard,adj_r2, condition_nr, jb,jb_pv,skew,kurtosis,result.summary())

0.1110963664327439

0.1109226555049111

12204.264730028373

1629.9658202391058

0.0

0.4840953443332647

2.606976670415397

0,1,2,3
Dep. Variable:,home_score,No. Observations:,35828.0
Model:,RLM,Df Residuals:,35820.0
Method:,IRLS,Df Model:,7.0
Norm:,HuberT,,
Scale Est.:,mad,,
Cov Type:,H1,,
Date:,"Thu, 04 Jul 2024",,
Time:,14:13:11,,
No. Iterations:,18,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.2140,0.038,31.736,0.000,1.139,1.289
home_team_strength,0.0008,2.53e-05,32.860,0.000,0.001,0.001
away_team_strength,-0.0008,2.49e-05,-32.960,0.000,-0.001,-0.001
home_team_form_goals,0.1548,0.010,15.263,0.000,0.135,0.175
home_team_form_points,0.0101,0.002,4.186,0.000,0.005,0.015
away_team_form_goals,0.0705,0.010,7.149,0.000,0.051,0.090
away_team_form_points,-0.0552,0.002,-22.683,0.000,-0.060,-0.050
home_advantage,0.1671,0.014,11.606,0.000,0.139,0.195


In [39]:
df_results_new = pd.read_csv("files/temp/results_fifa_ranking.csv")

In [40]:
df_results_new['date'] = pd.to_datetime(df_results_new['date'])

In [41]:
# Neue Spalten initialisieren
df_results_new['home_team_goals_letzte_Begegnung'] = None
df_results_new['home_team_goals_vorletzte_Begegnung'] = None
df_results_new['home_team_goals_vorvorletzte_Begegnung'] = None
df_results_new['away_team_goals_letzte_Begegnung'] = None
df_results_new['away_team_goals_vorletzte_Begegnung'] = None
df_results_new['away_team_goals_vorvorletzte_Begegnung'] = None

# Verwenden eines Wörterbuchs, um die letzten Begegnungen zu speichern
last_encounters = {}

# Für jedes Spiel die Anzahl der Tore des Heim- und Auswärtsteams in den letzten drei Begegnungen herausfinden
for i, row in df_results_new.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']
    current_date = row['date']
    
    # Erstellen eines Schlüssels für die Paarung der Teams
    match_key = tuple(sorted([home_team, away_team]))
    
    if match_key in last_encounters:
        previous_matches = last_encounters[match_key]
        
        if len(previous_matches) >= 1:
            last_match = previous_matches[-1]
            if last_match['home_team'] == home_team:
                df_results_new.at[i, 'home_team_goals_letzte_Begegnung'] = last_match['home_score']
                df_results_new.at[i, 'away_team_goals_letzte_Begegnung'] = last_match['away_score']
            else:
                df_results_new.at[i, 'home_team_goals_letzte_Begegnung'] = last_match['away_score']
                df_results_new.at[i, 'away_team_goals_letzte_Begegnung'] = last_match['home_score']
                
        if len(previous_matches) >= 2:
            second_last_match = previous_matches[-2]
            if second_last_match['home_team'] == home_team:
                df_results_new.at[i, 'home_team_goals_vorletzte_Begegnung'] = second_last_match['home_score']
                df_results_new.at[i, 'away_team_goals_vorletzte_Begegnung'] = second_last_match['away_score']
            else:
                df_results_new.at[i, 'home_team_goals_vorletzte_Begegnung'] = second_last_match['away_score']
                df_results_new.at[i, 'away_team_goals_vorletzte_Begegnung'] = second_last_match['home_score']
                
        if len(previous_matches) >= 3:
            third_last_match = previous_matches[-3]
            if third_last_match['home_team'] == home_team:
                df_results_new.at[i, 'home_team_goals_vorvorletzte_Begegnung'] = third_last_match['home_score']
                df_results_new.at[i, 'away_team_goals_vorvorletzte_Begegnung'] = third_last_match['away_score']
            else:
                df_results_new.at[i, 'home_team_goals_vorvorletzte_Begegnung'] = third_last_match['away_score']
                df_results_new.at[i, 'away_team_goals_vorvorletzte_Begegnung'] = third_last_match['home_score']
                
    # Aktualisieren der letzten Begegnung für das Team-Paar
    if match_key not in last_encounters:
        last_encounters[match_key] = []
    last_encounters[match_key].append(row)

In [42]:
# Neue Spalten initialisieren
df_results_new['home_team_geschossen_letztes_Spiel'] = None
df_results_new['home_team_kassiert_letztes_Spiel'] = None
df_results_new['home_team_geschossen_vorletztes_Spiel'] = None
df_results_new['home_team_kassiert_vorletztes_Spiel'] = None
df_results_new['home_team_geschossen_vorvorletztes_Spiel'] = None
df_results_new['home_team_kassiert_vorvorletztes_Spiel'] = None

df_results_new['away_team_geschossen_letztes_Spiel'] = None
df_results_new['away_team_kassiert_letztes_Spiel'] = None
df_results_new['away_team_geschossen_vorletztes_Spiel'] = None
df_results_new['away_team_kassiert_vorletztes_Spiel'] = None
df_results_new['away_team_geschossen_vorvorletztes_Spiel'] = None
df_results_new['away_team_kassiert_vorvorletztes_Spiel'] = None

# Verwenden von Wörterbüchern, um die letzten Spiele zu speichern
last_matches = {}

# Funktion zum Aktualisieren der letzten Spiele eines Teams
def update_last_matches(team, scored, conceded, last_matches):
    if team not in last_matches:
        last_matches[team] = []
    last_matches[team].append((scored, conceded))
    if len(last_matches[team]) > 3:
        last_matches[team].pop(0)

# Für jedes Spiel die Tore aus den letzten drei Spielen finden und die neuen Spalten füllen
for i, row in df_results_new.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']
    home_score = row['home_score']
    away_score = row['away_score']
    
    # Home Team letzte Spiele
    if home_team in last_matches:
        previous_matches = last_matches[home_team]
        if len(previous_matches) >= 1:
            df_results_new.at[i, 'home_team_geschossen_letztes_Spiel'] = previous_matches[-1][0]
            df_results_new.at[i, 'home_team_kassiert_letztes_Spiel'] = previous_matches[-1][1]
        if len(previous_matches) >= 2:
            df_results_new.at[i, 'home_team_geschossen_vorletztes_Spiel'] = previous_matches[-2][0]
            df_results_new.at[i, 'home_team_kassiert_vorletztes_Spiel'] = previous_matches[-2][1]
        if len(previous_matches) >= 3:
            df_results_new.at[i, 'home_team_geschossen_vorvorletztes_Spiel'] = previous_matches[-3][0]
            df_results_new.at[i, 'home_team_kassiert_vorvorletztes_Spiel'] = previous_matches[-3][1]
    
    # Away Team letzte Spiele
    if away_team in last_matches:
        previous_matches = last_matches[away_team]
        if len(previous_matches) >= 1:
            df_results_new.at[i, 'away_team_geschossen_letztes_Spiel'] = previous_matches[-1][0]
            df_results_new.at[i, 'away_team_kassiert_letztes_Spiel'] = previous_matches[-1][1]
        if len(previous_matches) >= 2:
            df_results_new.at[i, 'away_team_geschossen_vorletztes_Spiel'] = previous_matches[-2][0]
            df_results_new.at[i, 'away_team_kassiert_vorletztes_Spiel'] = previous_matches[-2][1]
        if len(previous_matches) >= 3:
            df_results_new.at[i, 'away_team_geschossen_vorvorletztes_Spiel'] = previous_matches[-3][0]
            df_results_new.at[i, 'away_team_kassiert_vorvorletztes_Spiel'] = previous_matches[-3][1]
    
    # Aktualisieren der letzten Spiele für Home und Away Team
    update_last_matches(home_team, home_score, away_score, last_matches)
    update_last_matches(away_team, away_score, home_score, last_matches)

In [43]:
df_results_new.to_csv("files/temp/fifa&head2head.csv")

In [44]:
df_results_new

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_win,...,home_team_geschossen_vorletztes_Spiel,home_team_kassiert_vorletztes_Spiel,home_team_geschossen_vorvorletztes_Spiel,home_team_kassiert_vorvorletztes_Spiel,away_team_geschossen_letztes_Spiel,away_team_kassiert_letztes_Spiel,away_team_geschossen_vorletztes_Spiel,away_team_kassiert_vorletztes_Spiel,away_team_geschossen_vorvorletztes_Spiel,away_team_kassiert_vorvorletztes_Spiel
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False,draw,...,,,,,,,,,,
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False,True,...,,,,,0.0,0.0,,,,
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False,True,...,0.0,0.0,,,4.0,2.0,0.0,0.0,,
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False,draw,...,4.0,2.0,0.0,0.0,2.0,1.0,2.0,4.0,0.0,0.0
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False,True,...,2.0,1.0,2.0,4.0,2.0,2.0,1.0,2.0,4.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47121,2024-07-06,,,,,UEFA Euro,Düsseldorf,Germany,True,False,...,,,,,,,,,,
47122,2024-07-06,,,,,UEFA Euro,Berlin,Germany,True,False,...,,,,,,,,,,
47123,2024-07-09,,,,,UEFA Euro,Munich,Germany,True,False,...,,,,,,,,,,
47124,2024-07-10,,,,,UEFA Euro,Dortmund,Germany,True,False,...,,,,,,,,,,


In [45]:
df_results_new.columns

Index(['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral', 'home_win',
       'home_team_strength', 'away_team_strength', 'home_team_form_goals',
       'home_team_form_points', 'away_team_form_goals',
       'away_team_form_points', 'home_advantage',
       'home_team_goals_letzte_Begegnung',
       'home_team_goals_vorletzte_Begegnung',
       'home_team_goals_vorvorletzte_Begegnung',
       'away_team_goals_letzte_Begegnung',
       'away_team_goals_vorletzte_Begegnung',
       'away_team_goals_vorvorletzte_Begegnung',
       'home_team_geschossen_letztes_Spiel',
       'home_team_kassiert_letztes_Spiel',
       'home_team_geschossen_vorletztes_Spiel',
       'home_team_kassiert_vorletztes_Spiel',
       'home_team_geschossen_vorvorletztes_Spiel',
       'home_team_kassiert_vorvorletztes_Spiel',
       'away_team_geschossen_letztes_Spiel',
       'away_team_kassiert_letztes_Spiel',
       'away_team_geschossen_vorletztes_S

In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import mean_squared_error, mean_absolute_error



df = df_results_new
df.dropna(inplace=True)

# Feature und Zielvariablen aufteilen
X = df[[
       'home_team_strength', 'away_team_strength', 'home_team_form_goals',
       'home_team_form_points', 'away_team_form_goals',
       'away_team_form_points', 'home_advantage',
       'year',
       'home_team_encoded', 'away_team_encoded',
      ]].values

y = df[['home_score']].values

# Daten skalieren
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Funktionen zur Erstellung des Modells mit variabler Anzahl von LSTM-Schichten
def create_model(num_layers=1, units=50, input_shape=(X_train.shape[1], 1)):
    model = Sequential()
    model.add(LSTM(units=units, return_sequences=(num_layers > 1), input_shape=input_shape))
    
    for _ in range(1, num_layers):
        model.add(LSTM(units=units, return_sequences=(_ < num_layers - 1)))
        
    model.add(Dense(units=1))
    
    model.compile(optimizer='adam', loss='mse')
    return model

# Modell erstellen
num_layers = 10  # Hier können Sie die Anzahl der LSTM-Schichten anpassen
model = create_model(num_layers=num_layers, units=50, input_shape=(X_train.shape[1], 1))

# Checkpointing
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True, mode='min')

# Modell trainieren
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[checkpoint])

# Vorhersagen machen
model.load_weights('best_model.keras')  # Laden der besten Modellgewichte
y_pred = model.predict(X_test)

# Performance Metriken
mse_home = mean_squared_error(y_test, y_pred)
mae_home = mean_absolute_error(y_test, y_pred)

print(f"Home Score - MSE: {mse_home}, MAE: {mae_home}")


Exception ignored in: 'zmq.backend.cython._zmq.Frame.__del__'
Traceback (most recent call last):
  File "_zmq.py", line 141, in zmq.backend.cython._zmq._check_rc
KeyboardInterrupt: 


KeyError: "['year', 'home_team_encoded', 'away_team_encoded'] not in index"

In [None]:
df_em2024 = pd.read_pickle("df_em2024.pkl")

In [None]:
'home_team_strength', 'away_team_strength', 'home_team_form_goals',
       'home_team_form_points', 'away_team_form_goals',
       'away_team_form_points', 'home_advantage',
       'year',
       'home_team_encoded', 'away_team_encoded',

In [None]:
df_em2024.drop(columns=['away_score','home_score','winning_continent','away_continent','winning_country','home_continent',"city","country",'tournament',"neutral"])

Unnamed: 0,date,home_team,away_team
47075,2024-06-14,Germany,Scotland
47076,2024-06-15,Hungary,Switzerland
47077,2024-06-15,Spain,Croatia
47078,2024-06-15,Italy,Albania
47079,2024-06-16,Slovenia,Denmark
47080,2024-06-16,Serbia,England
47081,2024-06-16,Poland,Netherlands
47082,2024-06-17,Austria,France
47083,2024-06-17,Romania,Ukraine
47084,2024-06-17,Belgium,Slovakia


In [None]:
df_em2024.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,winning_country,home_continent,away_continent,winning_continent
47075,2024-06-14,Germany,Scotland,,,UEFA Euro,Munich,Germany,False,Scotland,Europe,Europe,Europe
47076,2024-06-15,Hungary,Switzerland,,,UEFA Euro,Cologne,Germany,True,Switzerland,Europe,Europe,Europe
47077,2024-06-15,Spain,Croatia,,,UEFA Euro,Berlin,Germany,True,Croatia,Europe,Europe,Europe
47078,2024-06-15,Italy,Albania,,,UEFA Euro,Dortmund,Germany,True,Albania,Europe,Europe,Europe
47079,2024-06-16,Slovenia,Denmark,,,UEFA Euro,Stuttgart,Germany,True,Denmark,Europe,Europe,Europe


In [None]:
mse_home = mean_squared_error(y_test, y_pred)
mae_home = mean_absolute_error(y_test, y_pred)
mse_away = mean_squared_error(y_test[:, 1], y_pred[:, 1])
mae_away = mean_absolute_error(y_test[:, 1], y_pred[:, 1])

print(f"Home Score - MSE: {mse_home}, MAE: {mae_home}")
print(f"Away Score - MSE: {mse_away}, MAE: {mae_away}")

ValueError: y_true and y_pred have different number of output (1!=2)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from keras.layers import LSTM, Dense



df = df_results_new

# Entfernen aller Zeilen mit mindestens einem Nullwert
df.dropna(inplace=True)

# 1. Datenvorbereitung
# Datum in DateTime konvertieren und als numerische Features extrahieren
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Teams als numerische Werte kodieren
le_home = LabelEncoder()
le_away = LabelEncoder()
df['home_team_encoded'] = le_home.fit_transform(df['home_team'])
df['away_team_encoded'] = le_away.fit_transform(df['away_team'])

# Feature und Zielvariablen aufteilen
X = df[['neutral',
       'home_team_strength', 'away_team_strength', 'home_team_form_goals',
       'home_team_form_points', 'away_team_form_goals',
       'away_team_form_points', 'home_advantage',
       'home_team_goals_letzte_Begegnung',
       'home_team_goals_vorletzte_Begegnung',
       'home_team_goals_vorvorletzte_Begegnung',
       'away_team_goals_letzte_Begegnung',
       'away_team_goals_vorletzte_Begegnung',
       'away_team_goals_vorvorletzte_Begegnung', 'year', 'month', 'day',
       'home_team_encoded', 'away_team_encoded',
       'home_team_geschossen_letztes_Spiel',
       'home_team_kassiert_letztes_Spiel',
       'home_team_geschossen_vorletztes_Spiel',
       'home_team_kassiert_vorletztes_Spiel',
       'home_team_geschossen_vorvorletztes_Spiel',
       'home_team_kassiert_vorvorletztes_Spiel',
       'away_team_geschossen_letztes_Spiel',
       'away_team_kassiert_letztes_Spiel',
       'away_team_geschossen_vorletztes_Spiel',
       'away_team_kassiert_vorletztes_Spiel',
       'away_team_geschossen_vorvorletztes_Spiel',
       'away_team_kassiert_vorvorletztes_Spiel']]

y = df[['home_score', 'away_score']]

# Normalisierung der Features
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)

# LSTM Eingabeformat erstellen
def create_sequences(data, target, seq_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        x = data[i:i+seq_length]
        y = target[i+seq_length]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

seq_length = 3
X, y = create_sequences(features_scaled, targets.values, seq_length)

# Aufteilen in Trainings- und Testdaten
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# 2. Modellaufbau
model = Sequential()
model.add(LSTM(64, input_shape=(seq_length, X.shape[2]), return_sequences=True))
model.add(LSTM(32))
model.add(Dense(2))

model.compile(optimizer='adam', loss='mean_squared_error')

# 3. Training
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_split=0.2)

# 4. Vorhersage und Bewertung
y_pred = model.predict(X_test)

# Performance Metriken
mse_home = mean_squared_error(y_test[:, 0], y_pred[:, 0])
mae_home = mean_absolute_error(y_test[:, 0], y_pred[:, 0])
mse_away = mean_squared_error(y_test[:, 1], y_pred[:, 1])
mae_away = mean_absolute_error(y_test[:, 1], y_pred[:, 1])

print(f"Home Score - MSE: {mse_home}, MAE: {mae_home}")
print(f"Away Score - MSE: {mse_away}, MAE: {mae_away}")


ValueError: could not convert string to float: 'home_team_strength'

In [119]:
df_shootouts = pd.read_csv("files/input/kaggle/shootouts.csv")

In [None]:
continents = {
    "Afghanistan": "Asia",
    "Egypt": "Africa",
    "Albania": "Europe",
    "Algeria": "Africa",
    "American Samoa": "Oceania",
    "Andorra": "Europe",
    "Angola": "Africa",
    "Anguilla": "North America",
    "Antigua and Barbuda": "North America",
    "Equatorial Guinea": "Africa",
    "Argentina": "South America",
    "Armenia": "Asia",
    "Aruba": "North America",
    "Azerbaijan": "Asia",
    "Ethiopia": "Africa",
    "Australia": "Oceania",
    "Bahamas": "North America",
    "Bahrain": "Asia",
    "Bangladesh": "Asia",
    "Barbados": "North America",
    "Belarus": "Europe",
    "Belgium": "Europe",
    "Belize": "North America",
    "Benin": "Africa",
    "Bermuda": "North America",
    "Bhutan": "Asia",
    "Bolivia": "South America",
    "Bosnia and Herzegovina": "Europe",
    "Botswana": "Africa",
    "Brazil": "South America",
    "Brunei Darussalam": "Asia",
    "Bulgaria": "Europe",
    "Burkina Faso": "Africa",
    "Burundi": "Africa",
    "Cayman Islands": "North America",
    "Chile": "South America",
    "China": "Asia",
    "Cook Islands": "Oceania",
    "Costa Rica": "North America",
    "Curaçao": "North America",
    "Denmark": "Europe",
    "Germany": "Europe",
    "Dominica": "North America",
    "Dominican Republic": "North America",
    "Djibouti": "Africa",
    "Ecuador": "South America",
    "El Salvador": "North America",
    "Ivory Coast": "Africa",
    "England": "Europe",
    "Eritrea": "Africa",
    "Estonia": "Europe",
    "Eswatini": "Africa",
    "Faroe Islands": "Europe",
    "Fiji": "Oceania",
    "Finland": "Europe",
    "France": "Europe",
    "Gabon": "Africa",
    "Gambia": "Africa",
    "Georgia": "Asia",
    "Ghana": "Africa",
    "Gibraltar": "Europe",
    "Grenada": "North America",
    "Greece": "Europe",
    "Guam": "Oceania",
    "Guatemala": "North America",
    "Guyana": "South America",
    "Guinea": "Africa",
    "Guinea-Bissau": "Africa",
    "Haiti": "North America",
    "Honduras": "North America",
    "Hong Kong": "Asia",
    "India": "Asia",
    "Indonesia": "Asia",
    "Iraq": "Asia",
    "Iran": "Asia",
    "Ireland": "Europe",
    "Iceland": "Europe",
    "Israel": "Asia",
    "Italy": "Europe",
    "Jamaica": "North America",
    "Japan": "Asia",
    "Yemen": "Asia",
    "Jordan": "Asia",
    "U.S. Virgin Islands": "North America",
    "British Virgin Islands": "North America",
    "Cambodia": "Asia",
    "Cameroon": "Africa",
    "Canada": "North America",
    "Cape Verde": "Africa",
    "Kazakhstan": "Asia",
    "Qatar": "Asia",
    "Kenya": "Africa",
    "Kyrgyzstan": "Asia",
    "Colombia": "South America",
    "Comoros": "Africa",
    "Kosovo": "Europe",
    "Democratic Republic of the Congo": "Africa",
    "Republic of the Congo": "Africa",
    "North Korea": "Asia",
    "South Korea": "Asia",
    "Croatia": "Europe",
    "Cuba": "North America",
    "Kuwait": "Asia",
    "Laos": "Asia",
    "Lesotho": "Africa",
    "Latvia": "Europe",
    "Lebanon": "Asia",
    "Liberia": "Africa",
    "Libya": "Africa",
    "Liechtenstein": "Europe",
    "Lithuania": "Europe",
    "Luxembourg": "Europe",
    "Macau": "Asia",
    "Madagascar": "Africa",
    "Malawi": "Africa",
    "Malaysia": "Asia",
    "Maldives": "Asia",
    "Mali": "Africa",
    "Malta": "Europe",
    "Morocco": "Africa",
    "Mauritania": "Africa",
    "Mauritius": "Africa",
    "Mexico": "North America",
    "Moldova": "Europe",
    "Mongolia": "Asia",
    "Montenegro": "Europe",
    "Montserrat": "North America",
    "Mozambique": "Africa",
    "Myanmar": "Asia",
    "Namibia": "Africa",
    "Nepal": "Asia",
    "New Caledonia": "Oceania",
    "New Zealand": "Oceania",
    "Nicaragua": "North America",
    "Netherlands": "Europe",
    "Niger": "Africa",
    "Nigeria": "Africa",
    "Northern Ireland": "Europe",
    "North Macedonia": "Europe",
    "Norway": "Europe",
    "Oman": "Asia",
    "Austria": "Europe",
    "East Timor": "Asia",
    "Pakistan": "Asia",
    "Palestine": "Asia",
    "Panama": "North America",
    "Papua New Guinea": "Oceania",
    "Paraguay": "South America",
    "Peru": "South America",
    "Philippines": "Asia",
    "Poland": "Europe",
    "Portugal": "Europe",
    "Puerto Rico": "North America",
    "Rwanda": "Africa",
    "Romania": "Europe",
    "Russia": "Europe",
    "Saint Kitts and Nevis": "North America",
    "Saint Lucia": "North America",
    "Saint Vincent and the Grenadines": "North America",
    "Solomon Islands": "Oceania",
    "Zambia": "Africa",
    "Samoa": "Oceania",
    "San Marino": "Europe",
    "Sao Tome and Principe": "Africa",
    "Saudi Arabia": "Asia",
    "Scotland": "Europe",
    "Sweden": "Europe",
    "Switzerland": "Europe",
    "Senegal": "Africa",
    "Serbia": "Europe",
    "Seychelles": "Africa",
    "Sierra Leone": "Africa",
    "Zimbabwe": "Africa",
    "Singapore": "Asia",
    "Slovakia": "Europe",
    "Slovenia": "Europe",
    "Somalia": "Africa",
    "Spain": "Europe",
    "Sri Lanka": "Asia",
    "South Africa": "Africa",
    "Sudan": "Africa",
    "South Sudan": "Africa",
    "Suriname": "South America",
    "Syria": "Asia",
    "Tajikistan": "Asia",
    "Tahiti": "Oceania",
    "Taiwan": "Asia",
    "Tanzania": "Africa",
    "Thailand": "Asia",
    "Togo": "Africa",
    "Tonga": "Oceania",
    "Trinidad and Tobago": "North America",
    "Chad": "Africa",
    "Czech Republic": "Europe",
    "Tunisia": "Africa",
    "Turkey": "Asia",
    "Turkmenistan": "Asia",
    "Turks and Caicos Islands": "North America",
    "Uganda": "Africa",
    "Ukraine": "Europe",
    "Hungary": "Europe",
    "Uruguay": "South America",
    "Uzbekistan": "Asia",
    "Vanuatu": "Oceania",
    "Venezuela": "South America",
    "United Arab Emirates": "Asia",
    "United States": "North America",
    "Vietnam": "Asia",
    "Wales": "Europe",
    "Central African Republic": "Africa",
    "Cyprus": "Asia"
}

In [120]:
# Add home_continent and away_continent columns
df_shootouts['home_continent'] = df_shootouts['home_team'].map(continents)
df_shootouts['away_continent'] = df_shootouts['away_team'].map(continents)

In [121]:
#Fill na-values
df_shootouts["home_continent"] = df_shootouts["home_continent"].fillna("Country not in Fifa Ranking")
df_shootouts["away_continent"] = df_shootouts["away_continent"].fillna("Country not in Fifa Ranking")

In [122]:
#drop na-values
no_country_in_fifa_ranking_both = df_shootouts[(df_shootouts["home_continent"] == "Country not in Fifa Ranking") | (df_shootouts["away_continent"] == "Country not in Fifa Ranking")]

df_shootouts_fifa = df_shootouts.drop(no_country_in_fifa_ranking_both.index)

In [123]:
df_shootouts_fifa = df_shootouts_fifa.drop(columns="first_shooter")

In [102]:
df_fifa_ranking = pd.read_csv("files/input/fifa_ranking-2024-04-04.csv")

In [124]:
df_shootouts_fifa["date"] = pd.to_datetime(df_shootouts_fifa["date"])

In [104]:
df_shootouts_fifa.dtypes

date              datetime64[ns]
home_team                 object
away_team                 object
winner                    object
home_continent            object
away_continent            object
dtype: object

In [105]:
df_fifa_ranking["rank_date"] = pd.to_datetime(df_fifa_ranking["rank_date"])

In [106]:
df_fifa_ranking.dtypes

rank                      float64
country_full               object
country_abrv               object
total_points              float64
previous_points           float64
rank_change                 int64
confederation              object
rank_date          datetime64[ns]
dtype: object

In [125]:
df_shootouts_fifa = df_shootouts_fifa[df_shootouts_fifa["date"] >= pd.Timestamp('1991-01-01')]

In [108]:
df_shootouts_fifa.head()

Unnamed: 0,date,home_team,away_team,winner,home_continent,away_continent
149,1991-01-21,Cameroon,Algeria,Cameroon,Africa,Africa
150,1991-01-21,Senegal,Ivory Coast,Ivory Coast,Africa,Africa
151,1991-01-22,Senegal,Algeria,Senegal,Africa,Africa
152,1991-01-23,Ivory Coast,Cameroon,Ivory Coast,Africa,Africa
153,1991-06-14,South Korea,Australia,South Korea,Asia,Oceania


In [116]:
from joblib import Parallel, delayed
import multiprocessing

def apply_fifa_ranking_row(row, df_fifa_ranking):
    if row.name % 500 == 0:
        print(row.name)

    date = row['date']
    home_team = row['home_team']
    away_team = row['away_team']
    
    home_mask = (df_fifa_ranking['rank_date'] <= date) & (df_fifa_ranking['country_full'] == home_team)
    home_ranking = df_fifa_ranking[home_mask].sort_values("rank_date", ascending=False).head(1)

    away_mask = (df_fifa_ranking['rank_date'] <= date) & (df_fifa_ranking['country_full'] == away_team)
    away_ranking = df_fifa_ranking[away_mask].sort_values("rank_date", ascending=False).head(1)
    
    row["home_team_total_points"] = np.nan
    row["away_team_total_points"] = np.nan

    if home_ranking.empty or away_ranking.empty:
        return row    
    
    home_curr_points = home_ranking['total_points'].values[0]
    away_curr_points = away_ranking['total_points'].values[0]
    
    row["home_team_total_points"] = home_curr_points
    row["away_team_total_points"] = away_curr_points

    return row

def apply_fifa_ranking_chunk(df_chunk, df_fifa_ranking):
    df_chunk = df_chunk.apply(lambda row: apply_fifa_ranking_row(row, df_fifa_ranking), axis=1)
    return df_chunk

def parallel_apply(df, func, num_cores=multiprocessing.cpu_count()):
    df_split = np.array_split(df, num_cores)
    pool = Parallel(n_jobs=num_cores)
    
    results_l = []
    for result in pool(delayed(func)(chunk) for chunk in df_split):
        results_l.append(result)
            
    return pd.concat(results_l, axis=0)

def add_fifa_ranking(df_fifa_ranking, df):
    df = parallel_apply(df, lambda chunk: apply_fifa_ranking_chunk(chunk, df_fifa_ranking))
    return df

In [126]:
df_shootouts_fifa=add_fifa_ranking(df_fifa_ranking, df_shootouts_fifa)

  return bound(*args, **kwds)


In [127]:
df_shootouts_fifa.to_csv("penis.csv")