In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path below the Kaggle input directory, then list them one per line
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2020-2021_GOOD.csv
/kaggle/input/2019-2020_GOOD.csv
/kaggle/input/2022-2023_GOOD.csv
/kaggle/input/2018-2019_GOOD.csv
/kaggle/input/2021-2022_GOOD.csv
/kaggle/input/2023-2024_GOOD.csv


LOADING THE NEEDED PACKAGES

In [3]:
!pip install torch
!pip install pytorch-tabnet


Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [6]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

import optuna
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score, accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from collections import Counter

import xgboost
from xgboost import XGBClassifier

import torch
from pytorch_tabnet.tab_model import TabNetClassifier
import optuna.visualization.matplotlib as optuna_vis


**LOADING IN THE DATA**



In [7]:
data_dir = '/kaggle/input/'

# Show what is actually present in the input directory before reading from it
print(os.listdir(data_dir))

# One CSV per season; the variable names mark 2018-2022 as training seasons and 2023-2024 as the test set
training2018, training2019, training2020, training2021, training2022, testset = (
    os.path.join(data_dir, f'{season}_GOOD.csv')
    for season in ('2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024')
)

# The files are read as semicolon-delimited CSV
df_2018, df_2019, df_2020, df_2021, df_2022, df_Tes = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (training2018, training2019, training2020, training2021, training2022, testset)
)

['2020-2021_GOOD.csv', '2019-2020_GOOD.csv', '2022-2023_GOOD.csv', '2018-2019_GOOD.csv', '2021-2022_GOOD.csv', '2023-2024_GOOD.csv']


**PREPROCESSING - FEATURE TRANSFORMATION**


In [8]:
# Function to process each dataframe (parse 'Date'/'Time', derive 'Day_of_Week', 'Time_Category' and points columns)
def process_dataframe(df):
    """Add calendar, kick-off-period and points-gained columns to one frame of match data.

    The frame is modified in place and the same object is returned.

    Columns read:
        'Date' -- parsed with ``dayfirst=True``.
        'Time' -- parsed with format ``'%H:%M'``; unparseable values become missing.
        'FTR', 'HTR' -- result codes 'H' (home win), 'A' (away win), 'D' (draw).

    Columns written:
        'Date'          -- datetime values.
        'Day_of_Week'   -- ordered categorical, Monday..Sunday.
        'Time'          -- ``datetime.time`` objects (missing where parsing failed).
        'Time_Category' -- 'Early Afternoon' [12:00, 15:00), 'Late Afternoon' [15:00, 18:00),
                           'Evening' [18:00, 23:59], 'Morning' otherwise, 'Unknown' if missing.
        'HFPG', 'AFPG'  -- points for the home/away team derived from 'FTR' (3/1/0).
        'HHPG', 'AHPG'  -- the same points derived from 'HTR'.
                           Any other result code yields NaN in the points columns.
    """
    # Convert 'Date' to datetime and derive the weekday name as an ordered categorical
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
    df['Day_of_Week'] = pd.Categorical(
        df['Date'].dt.strftime('%A'),
        categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        ordered=True,
    )

    # Convert 'Time' to time-of-day objects (no seconds); invalid entries are coerced to missing
    df['Time'] = pd.to_datetime(df['Time'], format='%H:%M', errors='coerce').dt.time

    # Build the period boundaries once instead of re-parsing them for every row
    noon = pd.to_datetime('12:00').time()
    mid_afternoon = pd.to_datetime('15:00').time()
    evening_start = pd.to_datetime('18:00').time()
    last_minute = pd.to_datetime('23:59').time()

    def categorize_time(kickoff):
        """Map one kick-off time to its part-of-day label."""
        if pd.isnull(kickoff):
            return 'Unknown'
        if noon <= kickoff < mid_afternoon:
            return 'Early Afternoon'
        if mid_afternoon <= kickoff < evening_start:
            return 'Late Afternoon'
        if evening_start <= kickoff <= last_minute:
            return 'Evening'
        return 'Morning'

    df['Time_Category'] = df['Time'].apply(categorize_time)

    # Points gained per result code; a dictionary lookup replaces building one Series per row.
    # Codes outside H/D/A (including missing values) map to NaN.
    home_points = {'H': 3, 'D': 1, 'A': 0}
    away_points = {'H': 0, 'D': 1, 'A': 3}
    for result_col, home_col, away_col in (('FTR', 'HFPG', 'AFPG'), ('HTR', 'HHPG', 'AHPG')):
        df[home_col] = df[result_col].map(home_points)
        df[away_col] = df[result_col].map(away_points)

    return df

# Run the preprocessing over every season frame (the five training seasons plus the test season)
dfs = [
    process_dataframe(season_df)
    for season_df in (df_2018, df_2019, df_2020, df_2021, df_2022, df_Tes)
]

# Rebind the original names to the processed frames
df_2018, df_2019, df_2020, df_2021, df_2022, df_Tes = dfs


In [10]:
def calculate_previous_match_stats(df, features_home, features_away, window=5):
    """Replace per-match statistics with each team's average over its previous matches.

    For every match and every (home feature, away feature) pair, the home team's average
    over its last ``window`` earlier matches is stored in ``<home feature>_HomeAvg`` and the
    away team's average in ``<away feature>_AwayAvg``. A team's history mixes both venues:
    the home-feature value is taken from matches it played at home and the away-feature
    value from matches it played away. Averages are rounded to 2 decimals.

    Only matches with a strictly earlier 'Date' count as history, so the match itself never
    contributes to its own features. Cells for which the team has no earlier match stay NaN.

    Parameters
    ----------
    df : DataFrame with 'Date', 'HomeTeam', 'AwayTeam' and all listed feature columns.
        Results are written by index label, so the index is expected to be unique.
    features_home, features_away : sequences of column names, paired position by position.
    window : maximum number of previous matches to average over.

    Returns
    -------
    A new DataFrame sorted by 'Date', with the ``*_HomeAvg`` / ``*_AwayAvg`` columns added
    and the original feature columns removed. The input frame is not modified.
    """
    df = df.sort_values(by='Date')  # chronological order; sort_values returns a new frame
    feature_pairs = list(zip(features_home, features_away))

    # Initialise the new columns with NaN
    for feature_home, feature_away in feature_pairs:
        df[f'{feature_home}_HomeAvg'] = np.nan
        df[f'{feature_away}_AwayAvg'] = np.nan

    teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()

    for team in teams:
        # All matches of this team (home or away); boolean indexing keeps the date order
        team_matches = df[(df['HomeTeam'] == team) | (df['AwayTeam'] == team)]

        for idx, match in team_matches.iterrows():
            # The team's last `window` matches played strictly before this one
            past_matches = team_matches[team_matches['Date'] < match['Date']].tail(window)

            # No history: leave this team's cells at their NaN initial value. The opponent's
            # cells in the same row must not be touched here -- they are filled (or left NaN)
            # when the opponent itself is processed.
            if past_matches.empty:
                continue

            played_home = past_matches['HomeTeam'] == team
            played_away = past_matches['AwayTeam'] == team
            is_home_side = match['HomeTeam'] == team
            is_away_side = match['AwayTeam'] == team

            for feature_home, feature_away in feature_pairs:
                # The team's own values across both venues
                team_avg = round(
                    pd.concat([
                        past_matches.loc[played_home, feature_home],
                        past_matches.loc[played_away, feature_away],
                    ]).mean(),
                    2,
                )
                if is_home_side:
                    df.loc[idx, f'{feature_home}_HomeAvg'] = team_avg
                if is_away_side:
                    df.loc[idx, f'{feature_away}_AwayAvg'] = team_avg

    # Drop the raw per-match columns, which have been replaced by the averages
    all_features = list(features_home) + list(features_away)
    return df.drop(columns=all_features, errors='ignore')

# Per-match statistic columns, paired position by position: home-team column / away-team column
features_HOME = ['HS', 'HST', 'HF', 'HC', 'HY', 'HR', 'FTHG', 'HTHG', 'HFPG', 'HHPG']  # home-team statistics
features_AWAY = ['AS', 'AST', 'AF', 'AC', 'AY', 'AR', 'FTAG', 'HTAG', 'AFPG', 'AHPG']  # away-team statistics

# Compute the rolling averages separately for each season frame, so a team's history
# never crosses from one frame into the next
season_frames = [df_2018, df_2019, df_2020, df_2021, df_2022, df_Tes]
df_2018, df_2019, df_2020, df_2021, df_2022, df_Tes = [
    calculate_previous_match_stats(season_df, features_HOME, features_AWAY, window=5)
    for season_df in season_frames
]

In [11]:
# Home-side input columns for the difference features: the rolling averages produced by
# calculate_previous_match_stats, followed by four 'FRH*' columns.
# NOTE(review): the 'FRH*'/'FRA*' columns are not created anywhere in this notebook, so they
# presumably come straight from the CSV files (team rating columns?) -- confirm.
features_HOME_AVG = ['HS_HomeAvg', 'HST_HomeAvg', 'HF_HomeAvg', 'HC_HomeAvg', 'HY_HomeAvg', 'HR_HomeAvg', 'FTHG_HomeAvg', 'HTHG_HomeAvg', 'HFPG_HomeAvg', 'HHPG_HomeAvg', 'FRHDEF', 'FRHMID', 'FRHATT', 'FRHAVG']
# Away-side counterparts, in the same order as features_HOME_AVG
features_AWAY_AVG = ['AS_AwayAvg', 'AST_AwayAvg', 'AF_AwayAvg', 'AC_AwayAvg', 'AY_AwayAvg', 'AR_AwayAvg', 'FTAG_AwayAvg', 'HTAG_AwayAvg', 'AFPG_AwayAvg', 'AHPG_AwayAvg', 'FRADEF', 'FRAMID', 'FRAATT', 'FRAAVG']

# Suffixes of the resulting 'Diff_<suffix>' columns, in the same order as the two lists above
features_Home_Processed = ['S', 'ST', 'F', 'C', 'Y', 'R', 'FTG', 'HTG', 'FPG', 'HPG', 'FRDEF', 'FRMID', 'FRATT', 'FRAVG']

def add_difference_columns(df, features_Home_Processed, features_AWAY_AVG, features_HOME_AVG):
    """Return a copy of ``df`` with one home-minus-away difference column per feature.

    The three lists are matched by position: for position ``k`` the new column
    ``Diff_<features_Home_Processed[k]>`` holds
    ``df[features_HOME_AVG[k]] - df[features_AWAY_AVG[k]]``.
    The input frame is left untouched.
    """
    result = df.copy()

    for position, short_name in enumerate(features_Home_Processed):
        home_values = result[features_HOME_AVG[position]]
        away_values = result[features_AWAY_AVG[position]]
        result[f'Diff_{short_name}'] = home_values - away_values

    return result

# Add the home-minus-away difference columns to every season frame
df_2018, df_2019, df_2020, df_2021, df_2022, df_Tes = [
    add_difference_columns(season_df, features_Home_Processed, features_AWAY_AVG, features_HOME_AVG)
    for season_df in (df_2018, df_2019, df_2020, df_2021, df_2022, df_Tes)
]


In [12]:
# Stack the five training-season frames row-wise into a single training frame
training_seasons = [df_2018, df_2019, df_2020, df_2021, df_2022]
final_training = pd.concat(training_seasons, axis=0)

In [13]:
# Drop 'Time', 'Date', 'Div', 'HTR' and the home/away input columns of the difference
# features; columns that are absent are silently ignored
columns_to_drop = ['Time', 'Date', 'Div', 'HTR', *features_HOME_AVG, *features_AWAY_AVG]
data, df_Tes = (
    frame.drop(columns=columns_to_drop, errors='ignore')
    for frame in (final_training, df_Tes)
)

In [14]:
# Preview the first rows of the training frame, then of the test frame
for preview_frame in (data, df_Tes):
    print(preview_frame.head())

       HomeTeam        AwayTeam FTR     Referee  B365H  B365D  B365A  \
0    Man United       Leicester   H  A Marriner   1.57    3.9   7.50   
1   Bournemouth         Cardiff   H    K Friend   1.90    3.6   4.50   
2        Fulham  Crystal Palace   A      M Dean   2.50    3.4   3.00   
3  Huddersfield         Chelsea   A  C Kavanagh   6.50    4.0   1.61   
4     Newcastle       Tottenham   A  M Atkinson   3.90    3.5   2.04   

  Day_of_Week    Time_Category  Diff_S  ...  Diff_Y  Diff_R  Diff_FTG  \
0      Friday          Evening     NaN  ...     NaN     NaN       NaN   
1    Saturday   Late Afternoon     NaN  ...     NaN     NaN       NaN   
2    Saturday   Late Afternoon     NaN  ...     NaN     NaN       NaN   
3    Saturday   Late Afternoon     NaN  ...     NaN     NaN       NaN   
4    Saturday  Early Afternoon     NaN  ...     NaN     NaN       NaN   

   Diff_HTG  Diff_FPG  Diff_HPG  Diff_FRDEF  Diff_FRMID  Diff_FRATT  \
0       NaN       NaN       NaN           5           7  

**DATA PREPARATION - HANDLING MISSING VALUES**

In [15]:
# Checking for missing values: total number of NaN cells across all columns of each frame
total_missing_data, total_missing_df_Tes = (
    frame.isna().sum().sum() for frame in (data, df_Tes)
)

# Report the totals for the training and the test frame
print(f'Total missing values in df_training: {total_missing_data}')
print(f'Total missing values in df_test: {total_missing_df_Tes}')

Total missing values in df_training: 520
Total missing values in df_test: 100


In [16]:
# Remove every row that contains at least one missing value
data_cleaned, df_Tes_cleaned = (frame.dropna() for frame in (data, df_Tes))

# Check the new shape (rows, columns) of the cleaned frames
for cleaned_frame in (data_cleaned, df_Tes_cleaned):
    print(cleaned_frame.shape)



(1848, 23)
(370, 23)


**DATA PREPARATION - LABEL ENCODING**

In [17]:
# Work on copies of the cleaned frames and tag each row with its origin so the two sets
# can be separated again after encoding (1 = training data, 2 = test data)
data_original = data_cleaned.assign(Set=1)
df_test = df_Tes_cleaned.assign(Set=2)

# Combine both sets so every categorical column is encoded with one shared vocabulary
combined_data = pd.concat([data_original, df_test], ignore_index=True)

# LABEL ENCODING
# Identify the categorical (object-dtype) columns
categorical_columns = combined_data.select_dtypes(include=['object']).columns.tolist()
print("Categorische kolommen:", categorical_columns)

# 'Day_of_Week' is added explicitly when the object-dtype selection above did not pick it up
if 'Day_of_Week' not in categorical_columns:
    categorical_columns.append('Day_of_Week')

# Fitted encoders are kept per column so the integer codes can be mapped back later
label_encoders = {}

# NOTE(review): LabelEncoder assigns codes in sorted label order, so 'Day_of_Week' is encoded
# alphabetically rather than Monday..Sunday, and the target 'FTR' becomes A=0, D=1, H=2.
for col in categorical_columns:
    if col not in combined_data.columns:  # skip names that do not exist in the frame
        continue
    le = LabelEncoder()
    combined_data[col] = le.fit_transform(combined_data[col])
    label_encoders[col] = le

print(combined_data.head())


Categorische kolommen: ['HomeTeam', 'AwayTeam', 'FTR', 'Referee', 'Time_Category']
   HomeTeam  AwayTeam  FTR  Referee  B365H  B365D  B365A  Day_of_Week  \
0        26         2    0       32   2.10    3.6   3.70            2   
1        23        10    2        3   1.28    6.0  12.00            2   
2        13        27    2       20   2.04    3.5   3.90            2   
3         7         0    2       19   1.80    4.0   4.50            2   
4         6        18    1        5   3.25    3.1   2.54            2   

   Time_Category  Diff_S  ...  Diff_R  Diff_FTG  Diff_HTG  Diff_FPG  Diff_HPG  \
0              2    -7.0  ...     0.0      -2.0      -1.0      -3.0      -3.0   
1              2     0.0  ...     0.0       2.0       2.0       3.0       3.0   
2              2     2.0  ...     0.0      -1.0      -1.0      -1.0      -1.0   
3              2     4.0  ...     0.0       3.0       2.0       3.0       3.0   
4              0    -5.0  ...     0.0      -1.0      -1.0       0.0      

In [18]:
# Separate the encoded rows again using the 'Set' marker (1 = train, 2 = test), then drop the marker
set_marker = combined_data['Set']
train_data = combined_data.loc[set_marker == 1].drop(columns=['Set'])
test_data = combined_data.loc[set_marker == 2].drop(columns=['Set'])

# train_data and test_data are now ready for use
print("Train Data:")
print(train_data.head())
print(train_data.shape)
print("\nTest Data:")
print(test_data.head())

Train Data:
   HomeTeam  AwayTeam  FTR  Referee  B365H  B365D  B365A  Day_of_Week  \
0        26         2    0       32   2.10    3.6   3.70            2   
1        23        10    2        3   1.28    6.0  12.00            2   
2        13        27    2       20   2.04    3.5   3.90            2   
3         7         0    2       19   1.80    4.0   4.50            2   
4         6        18    1        5   3.25    3.1   2.54            2   

   Time_Category  Diff_S  ...  Diff_Y  Diff_R  Diff_FTG  Diff_HTG  Diff_FPG  \
0              2    -7.0  ...     1.0     0.0      -2.0      -1.0      -3.0   
1              2     0.0  ...     1.0     0.0       2.0       2.0       3.0   
2              2     2.0  ...     1.0     0.0      -1.0      -1.0      -1.0   
3              2     4.0  ...    -1.0     0.0       3.0       2.0       3.0   
4              0    -5.0  ...    -1.0     0.0      -1.0      -1.0       0.0   

   Diff_HPG  Diff_FRDEF  Diff_FRMID  Diff_FRATT  Diff_FRAVG  
0      -3.0 

**EDA**

In [None]:
# Pairwise correlations between all columns of the training set
correlation_matrix = train_data.corr()

# Draw the full matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='PuOr', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix', fontsize=14, weight='bold')
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Features', fontsize=12)

fig.tight_layout()
fig.savefig('Correlation_Matrix.png', format='png')  # written to the working directory
plt.show()


In [None]:
# Define the mapping for FTR labels (integer code -> result letter)
ftr_mapping = {0: 'A', 1: 'D', 2: 'H'}

# Decode the target on a plotting copy only. train_data itself must keep its integer codes:
# it is the modelling input, and overwriting it here would also make this cell non-repeatable.
# Values that are already letters are passed through unchanged.
boxplot_data = train_data.assign(
    FTR=train_data['FTR'].map(lambda code: ftr_mapping.get(code, code))
)

# Select all columns except 'FTR' (the target column)
all_features = [col for col in train_data.columns if col != 'FTR']

# Create the figure for the boxplots
plt.figure(figsize=(18, 20))  # Adjusting the figure size to fit all plots

# Total number of features to plot
n_features = len(all_features)

# Loop through the features and create a boxplot for each one
for i, feature in enumerate(all_features):
    plt.subplot((n_features // 3) + 1, 3, i + 1)  # Organizing into rows of three plots
    sns.boxplot(x='FTR', y=feature, data=boxplot_data, palette='PuOr', showfliers=False)  # Hiding the outliers
    plt.title(f'Boxplot of {feature} by FTR')
    plt.xlabel('FTR Outcome')
    plt.ylabel(feature)

# Adjust the layout to ensure everything fits well
plt.tight_layout()
plt.savefig('Boxplots.png', format='png')
plt.show()



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

# Define the mapping for FTR labels (integer code -> result letter)
ftr_mapping = {0: 'A', 1: 'D', 2: 'H'}

order = ['H', 'D', 'A']

# Decode the target on a plotting copy only, leaving train_data untouched. Mapping the column
# in place turns every value into NaN as soon as it has already been decoded once; here,
# values that are already letters are passed through unchanged.
selected_boxplot_data = train_data.assign(
    FTR=train_data['FTR'].map(lambda code: ftr_mapping.get(code, code))
)

# Specify the features and group them for the desired layout
top_row_features = ['B365H', 'B365A']
bottom_row_features = ['Diff_HPG', 'Diff_HTG']

# Create the figure for the boxplots
plt.figure(figsize=(12, 6))  # Adjusting the figure size to fit the selected plots

# Fill the 2x2 grid row by row: top-row features first, then bottom-row features
for position, feature in enumerate(top_row_features + bottom_row_features, start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(x='FTR', y=feature, data=selected_boxplot_data, palette='PuOr', showfliers=False, order=order)  # Hide outliers
    plt.title(f'Boxplot of {feature} by FTR')
    plt.xlabel('FTR Outcome')
    plt.ylabel(feature)

# Adjust the layout to ensure everything fits well
plt.tight_layout()
plt.savefig('boxplot_by_FTR.png', format='png', dpi=300)  # Save as high-quality PNG
plt.show()



**Data preparation**

In [None]:
# Function that builds one variant of the training data
def create_data_variants(X, y, add_odds, balance_data):
    """Return ``(X, y)`` optionally stripped of the odds columns and/or class-balanced.

    add_odds     -- when False, the columns 'B365H', 'B365D', 'B365A' are dropped
                    (columns that are not present are ignored).
    balance_data -- when True, the data is resampled with ``SMOTE(random_state=42)``.

    With ``add_odds=True`` and ``balance_data=False`` the inputs are returned unchanged.
    """
    features, target = X, y

    if not add_odds:
        features = features.drop(columns=['B365H', 'B365D', 'B365A'], errors='ignore')

    if not balance_data:
        return features, target

    # Oversample the minority classes
    resampled_features, resampled_target = SMOTE(random_state=42).fit_resample(features, target)
    return resampled_features, resampled_target

# Main function for dataset preparation
def prepare_datasets(train_data, test_data, target_column='FTR', add_odds=True):
    """Build the four training variants and their matching test sets.

    Keys follow the pattern ``<train|test><met|zonder>_odds_<met|zonder>_balancing``
    ("met" = with, "zonder" = without). Each value is an ``(X, y)`` tuple.

    Training variants are produced by ``create_data_variants``. Test sets are never
    balanced: the "met"/"zonder" balancing test entries hold identical copies and differ
    only in whether the odds columns are present.

    ``add_odds`` is accepted but not used by this function; all four combinations are
    always generated.
    """
    # Separate the features from the target
    X = train_data.drop(columns=[target_column])
    y = train_data[target_column]

    # (key fragment, flag) pairs used for both the odds and the balancing dimension
    settings = [('met', True), ('zonder', False)]

    # Training variants, in the order: with odds (balanced, unbalanced), without odds (balanced, unbalanced)
    datasets = {
        f'train{odds_tag}_odds_{balance_tag}_balancing': create_data_variants(
            X, y, add_odds=with_odds, balance_data=balanced
        )
        for odds_tag, with_odds in settings
        for balance_tag, balanced in settings
    }

    # Split the test set into X and y
    X_test = test_data.drop(columns=[target_column])
    y_test = test_data[target_column]

    # Debug: show the column names of the test set
    print("Testset kolomnamen:", X_test.columns.tolist())

    # Test features without the odds columns (dropped only if present)
    X_test_without_odds = X_test.drop(columns=['B365H', 'B365D', 'B365A'], errors='ignore')

    # Every test entry gets its own copy of the features and target
    for odds_tag, test_features in (('met', X_test), ('zonder', X_test_without_odds)):
        for balance_tag, _ in settings:
            datasets[f'test{odds_tag}_odds_{balance_tag}_balancing'] = (test_features.copy(), y_test.copy())

    return datasets


datasets = prepare_datasets(train_data, test_data)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Set the style for a consistent look
sns.set(style="whitegrid")

# Keep only the training variants (every entry whose name does not start with "test")
train_datasets = {name: variant for name, variant in datasets.items() if not name.startswith("test")}
num_train_datasets = len(train_datasets)

plt.figure(figsize=(16, 4))  # Adjust the size to the number of subplots

# Category order on the x-axis and the colour palette
order = ['H', 'D', 'A']
palette = 'PuOr'

# The target is stored as integer codes (A=0, D=1, H=2) while the x-axis uses letters.
# Decode the codes for plotting so the bars do not depend on whether an earlier cell
# happened to overwrite the target; values that are already letters pass through unchanged.
ftr_labels = {0: 'A', 1: 'D', 2: 'H'}

# Plot each training variant in its own subplot
for i, (name, (X, y)) in enumerate(train_datasets.items()):
    y_labels = pd.Series(y).map(lambda code: ftr_labels.get(code, code))
    plt.subplot((num_train_datasets + 1) // 2, 2, i + 1)  # Arrange in 2 columns
    sns.countplot(x=y_labels, palette=palette, order=order)
    plt.title(f'Distribution of FTR - {name}', fontsize=14, weight='bold')
    plt.xlabel('Full Time Result', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

# Adjust the layout and show the plot
plt.tight_layout()
plt.savefig('countplot_by_FTR.png', format='png', dpi=300)  # Save as high-quality PNG
plt.show()







**FEATURE IMPORTANCE**

In [None]:
# Function that selects features with Boruta
def analyze_feature_importance(X, y):
    """Return the names of the columns of ``X`` that Boruta ranks 1.

    Boruta is run around a ``RandomForestClassifier(random_state=42)`` with
    ``n_estimators='auto'``, ``verbose=1`` and ``random_state=42``; it is fitted on the
    underlying NumPy arrays of ``X`` and ``y``.
    """
    selector = BorutaPy(
        estimator=RandomForestClassifier(random_state=42),
        n_estimators='auto',
        verbose=1,
        random_state=42
    )
    selector.fit(X.values, y.values)

    # ranking_ is aligned with the columns of X; keep the columns with rank 1
    return [
        column_name
        for column_name, rank in zip(X.columns, selector.ranking_)
        if rank == 1
    ]

# Run the Boruta selection on every training variant (test sets are skipped)
good_features_dict = {}  # selected features per training variant
for variant_name, (X_train, y_train) in datasets.items():
    if "test" in variant_name:
        continue
    print(f"\nAnalyzing Feature Importance - {variant_name}")
    good_features = analyze_feature_importance(X_train, y_train)
    good_features_dict[variant_name] = good_features

# Print the selected features per training variant
for variant_name, features in good_features_dict.items():
    print(f"\nGoede Features voor {variant_name}: {features}")


In [None]:
# Selected features per training variant, hard-coded here (this replaces whatever
# good_features_dict held before this cell)
good_features_dict = {
    'trainmet_odds_met_balancing': ['B365H', 'B365D', 'B365A', 'Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FPG', 'Diff_HPG'],
    'trainmet_odds_zonder_balancing': ['B365H', 'B365A', 'Diff_S'],
    'trainzonder_odds_met_balancing': ['Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FTG', 'Diff_FPG', 'Diff_HPG', 'Diff_FRDEF', 'Diff_FRATT'],
    'trainzonder_odds_zonder_balancing': ['Diff_S', 'Diff_ST', 'Diff_C', 'Diff_FRATT']
}

# Training variants: keep only the selected feature columns
for variant_name, good_features in good_features_dict.items():
    if variant_name not in datasets:
        continue
    X, y = datasets[variant_name]
    datasets[variant_name] = (X[good_features], y)

# Test variants: keep the features selected for the matching training variant,
# restricted to the columns the test set actually has
for variant_name, (X, y) in list(datasets.items()):
    if "test" not in variant_name:
        continue

    train_variant_name = variant_name.replace("test", "train")
    selected_features = good_features_dict.get(train_variant_name)
    if selected_features is None:
        continue

    common_features = [feature for feature in selected_features if feature in X.columns]
    if common_features:  # leave the test set untouched when nothing overlaps
        datasets[variant_name] = (X[common_features], y)

# Print the updated datasets
for variant_name, (X_updated, y_updated) in datasets.items():
    print(f"\nBijgewerkte Dataset - {variant_name}:")
    print(X_updated.head())  # first rows of the updated dataset





**NEEDED PYTORCH**

In [None]:
pip install xgboost optuna matplotlib scikit-learn pandas


In [None]:
!pip install torch
!pip install pytorch-tabnet


In [None]:
import os

# Delete every file with the .png extension from the current working directory.
# Note: this also removes the figures that earlier cells saved there.
png_files = [filename for filename in os.listdir() if filename.endswith(".png")]
for filename in png_files:
    os.remove(filename)
    print(f"Deleted: {filename}")


**Acknowledgement:** the following notebook was used for Experiment three: Gibbons, N. (2021). *Tuning TabNet with Optuna* [Notebook]. Kaggle. Retrieved from https://www.kaggle.com/code/neilgibbons/tuning-tabnet-with-optuna

**MODELS WITH OPTUNA**

In [None]:
# Function to calculate profit based on predictions
def calculate_profit(y_pred, y_test, odds, stake=1):
    """Return the total profit of betting ``stake`` on every predicted outcome.

    ``y_pred``, ``y_test`` and ``odds`` are iterated in parallel. Each ``odds`` entry is
    indexed as [B365H, B365D, B365A]; class 2 pays the first value, class 1 the second and
    class 0 the third. A correct prediction adds ``stake * odd - stake``, a wrong one
    subtracts ``stake``. A correct prediction of any other class leaves the profit unchanged.
    """
    odds_position = {2: 0, 1: 1, 0: 2}  # predicted class -> index into the odds triple
    profit = 0
    for prediction, actual, odd in zip(y_pred, y_test, odds):
        if prediction != actual:
            profit -= stake
            continue
        position = odds_position.get(prediction)
        if position is not None:
            profit += stake * odd[position] - stake
    return profit

# Dictionary to store results for each dataset variant
results_dict = {}

# Function to tune and evaluate RandomForest using Optuna
def tune_and_evaluate_rf_optuna(X_train, y_train, X_test, y_test, odds_test, n_trials=30, cv=5):
    """Tune a RandomForest with Optuna, refit it on the training data and report test metrics.

    Each trial is scored by the mean cross-validated accuracy on the training data. The best
    parameters are then used to fit a final model on all of ``X_train``/``y_train``, which is
    evaluated on ``X_test``/``y_test`` (accuracy, classification report, confusion matrix plot).

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test   : test features and labels.
    odds_test        : accepted for call compatibility; not used here (the caller computes
                       profits from the returned predictions).
    n_trials         : number of Optuna trials (default 30).
    cv               : number of cross-validation folds per trial (default 5).

    Returns
    -------
    (fitted RandomForestClassifier, predictions for X_test)

    NOTE(review): when the training data passed in has already been oversampled, synthetic
    rows end up in the validation folds and the cross-validated accuracy is presumably
    optimistic -- confirm whether resampling should move inside the folds.
    """
    def objective_rf(trial):
        rf_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),  # Number of trees
            'max_depth': trial.suggest_int('max_depth', 3, 10),  # Maximum depth of trees
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 11),  # Minimum samples to split
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),  # Minimum samples at leaf node
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),  # Number of features to consider
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False])  # Whether to bootstrap samples
        }

        rf = RandomForestClassifier(random_state=42, **rf_params)

        # Mean cross-validated accuracy is the value Optuna maximises
        scores = cross_val_score(rf, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
        return np.mean(scores)

    study_rf = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
    study_rf.optimize(objective_rf, n_trials=n_trials)

    # Best hyperparameters
    best_params = study_rf.best_params
    print("Best hyperparameters:", best_params)

    # Train model with best parameters
    rf_tuned = RandomForestClassifier(random_state=42, **best_params)
    rf_tuned.fit(X_train, y_train)
    y_pred = rf_tuned.predict(X_test)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test set accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title("Confusion Matrix: RandomForest")
    plt.show()

    return rf_tuned, y_pred

# Loop through the dataset variants
odds_columns = ['B365H', 'B365D', 'B365A']

for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:  # Skip test variants
        continue

    print(f"\nRunning RF on dataset variant: {variant_name}")

    # Class distribution in training data
    print("Class distribution in training data:", Counter(y_train))

    # The matching test set carries the same name with 'train' replaced by 'test'
    test_variant_name = variant_name.replace('train', 'test', 1)
    if test_variant_name not in datasets:
        print(f"Variant name {variant_name} does not match any known test set, skipping.")
        continue
    X_test, y_test = datasets[test_variant_name]

    # Odds for the profit calculation always come from the 'testmet_odds_met_balancing'
    # test set, because the other test sets may no longer contain all three odds columns
    odds_test = datasets['testmet_odds_met_balancing'][0][odds_columns].values

    # Convert DataFrames/Series to NumPy arrays
    X_train = X_train.values
    y_train = y_train.values
    X_test = X_test.values
    y_test = y_test.values

    # Train the model with the correct datasets
    best_model, y_pred = tune_and_evaluate_rf_optuna(X_train, y_train, X_test, y_test, odds_test)

    # One record per test match: prediction, outcome, odds and the profit of a unit bet
    variant_results = []
    total_profit = 0
    for predicted, actual, match_odds in zip(y_pred, y_test, odds_test):
        match_profit = calculate_profit([predicted], [actual], [match_odds])
        variant_results.append({
            'Predicted': predicted,
            'Actual': actual,
            'B365H': match_odds[0],
            'B365D': match_odds[1],
            'B365A': match_odds[2],
            'Profit': match_profit
        })
        total_profit += match_profit

    print(f"Total profit for {variant_name}: {total_profit}")

    # Create DataFrame for the current variant and store it in the dictionary
    results_df = pd.DataFrame(variant_results)
    results_dict[f"resultsdf_RF_{variant_name}"] = results_df

    print(f"Data for {variant_name} stored as resultsdf_RF_{variant_name}\n")

# Display the first rows of every stored results frame
for name, df in results_dict.items():
    print(f"\n{name}:\n", df.head())

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, make_scorer, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from collections import Counter

# Function to calculate profit based on predictions
def calculate_profit(y_pred, y_test, odds, stake=1):
    """Return the net profit of staking `stake` on every prediction.

    A correct prediction pays ``stake * odd[k] - stake`` where ``k`` is the
    odds column of the predicted label (2 -> index 0 / B365H, 1 -> index 1 /
    B365D, 0 -> index 2 / B365A). A wrong prediction loses the stake. A
    correct prediction with any other label leaves the profit unchanged.
    """
    # (label, index into the odds row) pairs, checked in this order
    payout_columns = ((2, 0), (1, 1), (0, 2))
    profit = 0
    for prediction, actual, odd in zip(y_pred, y_test, odds):
        if not prediction == actual:
            profit -= stake
            continue
        column = next((idx for label, idx in payout_columns if prediction == label), None)
        if column is not None:
            profit += stake * odd[column] - stake
    return profit

# Per-variant result DataFrames; the loop at the end of this cell fills it
# with one entry per training variant, keyed "resultsdf_RF_<variant_name>".
results_dict = {}

# Function to tune and evaluate RandomForest using Optuna
def tune_and_evaluate_rf_optuna(X_train, y_train, X_test, y_test, odds_test):
    """Tune a RandomForestClassifier with Optuna, refit it and report test results.

    A 30-trial TPE study (seed 42) maximizes the mean 5-fold cross-validated
    accuracy on the training data. A forest with the best parameters is then
    fitted on all of `X_train`; test accuracy, the classification report, a
    confusion-matrix plot and the total betting profit are printed/shown.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels for the final evaluation.
        odds_test: per-match odds rows passed to `calculate_profit`.

    Returns:
        Tuple of (fitted RandomForestClassifier, predictions for `X_test`).
    """
    def objective_rf(trial):
        # The suggestion order is kept fixed so the seeded sampler draws the
        # same values for the same trial number.
        search_space = dict(
            n_estimators=trial.suggest_int('n_estimators', 50, 200),            # number of trees
            max_depth=trial.suggest_int('max_depth', 3, 10),                    # maximum tree depth
            min_samples_split=trial.suggest_int('min_samples_split', 2, 11),    # minimum samples to split
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),      # minimum samples per leaf
            max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2']),
            bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
        )
        candidate = Pipeline([('rf', RandomForestClassifier(random_state=42, **search_space))])

        # Mean accuracy over the five folds is the value Optuna maximizes
        fold_scores = cross_val_score(
            candidate,
            X_train,
            y_train,
            cv=5,
            scoring=make_scorer(accuracy_score),
            n_jobs=-1,
        )
        return np.mean(fold_scores)

    tpe_sampler = optuna.samplers.TPESampler(seed=42)
    study_rf = optuna.create_study(direction="maximize", sampler=tpe_sampler)
    study_rf.optimize(objective_rf, n_trials=30)

    best_params = study_rf.best_params
    print("Best hyperparameters:", best_params)

    # Refit on the complete training set with the winning configuration
    rf_tuned = RandomForestClassifier(random_state=42, **best_params)
    rf_tuned.fit(X_train, y_train)
    y_pred = rf_tuned.predict(X_test)

    # Test-set metrics
    print(f"Test set accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

    # Confusion matrix plot
    ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred)).plot(cmap='Blues')
    plt.title("Confusion Matrix: Tuned RandomForest")
    plt.show()

    # Betting profit over the whole test set
    print(f"Total profit: {calculate_profit(y_pred, y_test, odds_test)}")

    return rf_tuned, y_pred

# Loop through the dataset variants
# Name of the test split each training variant is evaluated on
test_key_by_variant = {
    'trainmet_odds_met_balancing': 'testmet_odds_met_balancing',
    'trainmet_odds_zonder_balancing': 'testmet_odds_zonder_balancing',
    'trainzonder_odds_met_balancing': 'testzonder_odds_met_balancing',
    'trainzonder_odds_zonder_balancing': 'testzonder_odds_zonder_balancing',
}

for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:  # Skip test variants
        continue

    print(f"\nRunning RF on dataset variant: {variant_name}")

    # Class distribution in training data
    print("Class distribution in training data:", Counter(y_train))

    # Determine the appropriate test set based on the training variant
    test_key = test_key_by_variant.get(variant_name)
    if test_key is None:
        print(f"Variant name {variant_name} does not match any known test set, skipping.")
        continue

    X_test, y_test = datasets[test_key]
    # The odds always come from the 'testmet_odds_met_balancing' features,
    # regardless of which test split supplies the model inputs.
    odds_test = datasets['testmet_odds_met_balancing'][0][['B365H', 'B365D', 'B365A']].values

    # Convert DataFrames to NumPy arrays
    X_train = X_train.values
    y_train = y_train.values
    X_test = X_test.values
    y_test = y_test.values

    # Odds are matched to predictions by row position, so a length mismatch
    # would silently pair matches with the wrong odds.
    if len(odds_test) != len(y_test):
        raise ValueError(
            f"Odds rows ({len(odds_test)}) do not match test rows ({len(y_test)}) for {variant_name}"
        )

    # Train the model with the correct datasets
    best_model, y_pred = tune_and_evaluate_rf_optuna(X_train, y_train, X_test, y_test, odds_test)

    # One record per match: prediction, outcome, odds and the resulting profit
    variant_results = []
    total_profit = 0

    for predicted, actual, match_odds in zip(y_pred, y_test, odds_test):
        match_profit = calculate_profit([predicted], [actual], [match_odds])
        variant_results.append({
            'Predicted': predicted,
            'Actual': actual,
            'B365H': match_odds[0],
            'B365D': match_odds[1],
            'B365A': match_odds[2],
            'Profit': match_profit
        })
        total_profit += match_profit

    print(f"Total profit for {variant_name}: {total_profit}")

    # Create DataFrame for the current variant and store it in the dictionary
    results_df = pd.DataFrame(variant_results)
    results_dict[f"resultsdf_RF_{variant_name}"] = results_df

    print(f"Data for {variant_name} stored as resultsdf_RF_{variant_name}\n")

# Display all DataFrames in results_dict if needed
for name, df in results_dict.items():
    print(f"\n{name}:\n", df.head())


In [None]:
# Profit calculation function
def calculate_profit(y_pred, y_test, odds, stake=1):
    """Return the net profit of staking `stake` on every prediction.

    A correct prediction pays ``stake * odd[k] - stake`` where ``k`` is the
    odds column of the predicted label (2 -> index 0 / B365H, 1 -> index 1 /
    B365D, 0 -> index 2 / B365A). A wrong prediction loses the stake. A
    correct prediction with any other label leaves the profit unchanged.
    """
    # (label, index into the odds row) pairs, checked in this order
    payout_columns = ((2, 0), (1, 1), (0, 2))
    profit = 0
    for prediction, actual, odd in zip(y_pred, y_test, odds):
        if not prediction == actual:
            profit -= stake
            continue
        column = next((idx for label, idx in payout_columns if prediction == label), None)
        if column is not None:
            profit += stake * odd[column] - stake
    return profit

# Function to tune and evaluate XGBClassifier with Optuna
def tune_and_evaluate_xgb_optuna(X_train, y_train, X_test, y_test, odds_test):
    """Tune an XGBClassifier with Optuna, refit it and report test results.

    A 30-trial TPE study (seed 42) maximizes the mean 5-fold cross-validated
    accuracy on the training data. A classifier with the best parameters is
    then fitted on all of `X_train`; test accuracy, the classification
    report, a confusion-matrix plot and the total betting profit are
    printed/shown.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels for the final evaluation.
        odds_test: per-match odds rows passed to `calculate_profit`.

    Returns:
        Tuple of (fitted XGBClassifier, predictions for `X_test`).
    """
    # NOTE(review): XGBClassifier must already be in scope when this cell runs;
    # the only import visible nearby sits in a later cell — confirm.
    def objective_xgb(trial):
        # Each hyperparameter is suggested exactly once; the suggestion order
        # is kept fixed so the seeded sampler stays reproducible.
        xgb_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),  # Number of trees
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05),  # Learning rate
            'max_depth': trial.suggest_int('max_depth', 3, 10),  # Maximum depth of trees
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Fraction of samples to use in each tree
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),  # Minimum child weight
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)  # Fraction of features for each tree
        }

        xgb = XGBClassifier(random_state=42, **xgb_params)
        pipeline = Pipeline([('xgb', xgb)])
        accuracy_scorer = make_scorer(accuracy_score)
        scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=accuracy_scorer, n_jobs=-1)
        return np.mean(scores)

    # Optuna study for hyperparameter optimization
    study_xgb = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
    study_xgb.optimize(objective_xgb, n_trials=30)

    # Best hyperparameters
    best_params = study_xgb.best_params
    print("Best hyperparameters:", best_params)

    # Train model with best parameters on the full training set
    xgb_tuned = XGBClassifier(random_state=42, **best_params)
    xgb_tuned.fit(X_train, y_train)
    y_pred = xgb_tuned.predict(X_test)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test set accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title("Confusion Matrix: XGBoost")
    plt.show()

    # Report the profit, as the other tune_and_evaluate_* helpers do
    # (it used to be computed here and then discarded).
    total_profit = calculate_profit(y_pred, y_test, odds_test)
    print(f"Total profit: {total_profit}")

    return xgb_tuned, y_pred

# Per-variant XGBoost results, keyed "resultsdf_XGB_<variant_name>"
results_dict = {}

# Name of the test split each training variant is evaluated on
test_key_by_variant = {
    'trainmet_odds_met_balancing': 'testmet_odds_met_balancing',
    'trainmet_odds_zonder_balancing': 'testmet_odds_zonder_balancing',
    'trainzonder_odds_met_balancing': 'testzonder_odds_met_balancing',
    'trainzonder_odds_zonder_balancing': 'testzonder_odds_zonder_balancing',
}

# Run the tuning and evaluation for each dataset variant
for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:
        continue

    print(f"\nRunning XGBoost on dataset variant: {variant_name}")

    # Select appropriate test set
    test_key = test_key_by_variant.get(variant_name)
    if test_key is None:
        print(f"Unknown variant name {variant_name}, skipping.")
        continue

    X_test, y_test = datasets[test_key]
    # The odds always come from the 'testmet_odds_met_balancing' features,
    # regardless of which test split supplies the model inputs.
    odds_test = datasets['testmet_odds_met_balancing'][0][['B365H', 'B365D', 'B365A']].values

    # Convert data to arrays
    X_train = X_train.values
    y_train = y_train.values
    X_test = X_test.values
    y_test = y_test.values

    # Odds are matched to predictions by row position, so a length mismatch
    # would silently pair matches with the wrong odds.
    if len(odds_test) != len(y_test):
        raise ValueError(
            f"Odds rows ({len(odds_test)}) do not match test rows ({len(y_test)}) for {variant_name}"
        )

    best_model, y_pred = tune_and_evaluate_xgb_optuna(X_train, y_train, X_test, y_test, odds_test)

    # One record per match: prediction, outcome, odds and the resulting profit
    variant_results = []
    total_profit = 0

    for predicted, actual, match_odds in zip(y_pred, y_test, odds_test):
        match_profit = calculate_profit([predicted], [actual], [match_odds])
        variant_results.append({
            'Predicted': predicted,
            'Actual': actual,
            'B365H': match_odds[0],
            'B365D': match_odds[1],
            'B365A': match_odds[2],
            'Profit': match_profit
        })
        total_profit += match_profit

    print(f"Total profit for {variant_name}: {total_profit}")

    # Create DataFrame for the current variant and store it in the dictionary.
    # The key names the model that produced the results (XGBoost, not RF).
    results_df = pd.DataFrame(variant_results)
    results_dict[f"resultsdf_XGB_{variant_name}"] = results_df

    print(f"Data for {variant_name} stored as resultsdf_XGB_{variant_name}\n")

# Display all DataFrames in results_dict if needed
for name, df in results_dict.items():
    print(f"\n{name}:\n", df.head())




In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, make_scorer, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Profit calculation function
def calculate_profit(y_pred, y_test, odds, stake=1):
    """Return the net profit of staking `stake` on every prediction.

    A correct prediction pays ``stake * odd[k] - stake`` where ``k`` is the
    odds column of the predicted label (2 -> index 0 / B365H, 1 -> index 1 /
    B365D, 0 -> index 2 / B365A). A wrong prediction loses the stake. A
    correct prediction with any other label leaves the profit unchanged.
    """
    # (label, index into the odds row) pairs, checked in this order
    payout_columns = ((2, 0), (1, 1), (0, 2))
    profit = 0
    for prediction, actual, odd in zip(y_pred, y_test, odds):
        if not prediction == actual:
            profit -= stake
            continue
        column = next((idx for label, idx in payout_columns if prediction == label), None)
        if column is not None:
            profit += stake * odd[column] - stake
    return profit

# Function to tune and evaluate XGBClassifier with Optuna
def tune_and_evaluate_xgb_optuna(X_train, y_train, X_test, y_test, odds_test):
    """Tune an XGBClassifier with Optuna, refit it and report test results.

    A 30-trial TPE study (seed 42) maximizes the mean 5-fold cross-validated
    accuracy on the training data. A classifier with the best parameters is
    then fitted on all of `X_train`; test accuracy, the classification
    report, a confusion-matrix plot and the total betting profit are
    printed/shown.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels for the final evaluation.
        odds_test: per-match odds rows passed to `calculate_profit`.

    Returns:
        Tuple of (fitted XGBClassifier, predictions for `X_test`).
    """
    def objective_xgb(trial):
        # Each hyperparameter is suggested exactly once ('subsample' used to
        # be listed twice); the suggestion order is kept fixed so the seeded
        # sampler stays reproducible.
        xgb_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),  # Number of trees
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05),  # Learning rate
            'max_depth': trial.suggest_int('max_depth', 3, 10),  # Maximum depth of trees
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Fraction of samples to use in each tree
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),  # Minimum child weight
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)  # Fraction of features for each tree
        }

        xgb = XGBClassifier(random_state=42, **xgb_params)
        pipeline = Pipeline([('xgb', xgb)])
        accuracy_scorer = make_scorer(accuracy_score)
        scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=accuracy_scorer, n_jobs=-1)
        return np.mean(scores)

    # Optuna study for hyperparameter optimization
    study_xgb = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
    study_xgb.optimize(objective_xgb, n_trials=30)

    # Best hyperparameters
    best_params = study_xgb.best_params
    print("Best hyperparameters:", best_params)

    # Train model with best parameters on the full training set
    xgb_tuned = XGBClassifier(random_state=42, **best_params)
    xgb_tuned.fit(X_train, y_train)
    y_pred = xgb_tuned.predict(X_test)

    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test set accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title("Confusion Matrix: Tuned XGBoost")
    plt.show()

    # Calculate profit over the whole test set
    total_profit = calculate_profit(y_pred, y_test, odds_test)
    print(f"Total profit: {total_profit}")

    return xgb_tuned, y_pred

# Per-variant XGBoost results, keyed "results_XGB_<variant_name>"
results_dict = {}

# Training variant -> name of the test split it is evaluated on
test_variant_for = {
    'trainmet_odds_met_balancing': 'testmet_odds_met_balancing',
    'trainmet_odds_zonder_balancing': 'testmet_odds_zonder_balancing',
    'trainzonder_odds_met_balancing': 'testzonder_odds_met_balancing',
    'trainzonder_odds_zonder_balancing': 'testzonder_odds_zonder_balancing',
}

# Tune and evaluate once per training variant
for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:
        continue

    print(f"\nRunning XGBoost on dataset variant: {variant_name}")

    # Pick the matching test split; unknown variants are reported and skipped
    if variant_name not in test_variant_for:
        print(f"Unknown variant name {variant_name}, skipping.")
        continue
    matched_split = datasets[test_variant_for[variant_name]]
    X_test = matched_split[0]
    y_test = matched_split[1]
    # Odds are always read from the 'testmet_odds_met_balancing' features
    odds_test = datasets['testmet_odds_met_balancing'][0][['B365H', 'B365D', 'B365A']].values

    # Work on plain NumPy arrays from here on
    X_train, y_train = X_train.values, y_train.values
    X_test, y_test = X_test.values, y_test.values

    best_model, y_pred = tune_and_evaluate_xgb_optuna(X_train, y_train, X_test, y_test, odds_test)

    # Profit of each individual bet, in test-row order
    per_match_profit = [
        calculate_profit([pred], [act], [odds])
        for pred, act, odds in zip(y_pred, y_test, odds_test)
    ]

    # Store results
    results_dict[f"results_XGB_{variant_name}"] = pd.DataFrame({
        'Predicted': y_pred,
        'Actual': y_test,
        'Profit': per_match_profit
    })

# Display results
for name, df in results_dict.items():
    print(f"\n{name}:\n", df.head())



In [None]:
# Set the random seeds for reproducibility
SEED = 42  # Fixed seed; also passed to TabNet and the Optuna sampler later in this cell
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed the CUDA generator as well, request deterministic cuDNN behaviour
    # and switch off its benchmark mode.
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def evaluate(y_true, y_pred, model_name):
    """Print accuracy and macro-averaged F1 of `y_pred` against `y_true`, labelled with `model_name`."""
    accuracy, f1 = (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    )
    print(f"{model_name} - Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")

def calculate_profit(y_pred, y_test, odds, stake=1):
    """Return the net profit of staking `stake` on every prediction.

    A correct prediction pays ``stake * odd[k] - stake`` where ``k`` is the
    odds column of the predicted label (2 -> index 0 / B365H, 1 -> index 1 /
    B365D, 0 -> index 2 / B365A). A wrong prediction loses the stake. A
    correct prediction with any other label leaves the profit unchanged.
    """
    # (label, index into the odds row) pairs, checked in this order
    payout_columns = ((2, 0), (1, 1), (0, 2))
    profit = 0
    for prediction, actual, odd in zip(y_pred, y_test, odds):
        if not prediction == actual:
            profit -= stake
            continue
        column = next((idx for label, idx in payout_columns if prediction == label), None)
        if column is not None:
            profit += stake * odd[column] - stake
    return profit

def tune_and_evaluate_tabnet(X_train, y_train, X_test, y_test, odds_test):
    """Tune a TabNetClassifier with Optuna, refit it and report test-set metrics.

    A 30-trial TPE study (seeded with SEED) maximizes the mean 5-fold
    cross-validated accuracy on the training data. The best configuration is
    refit on all of `X_train` and evaluated on `X_test`: accuracy and macro F1
    are printed, a confusion matrix is plotted and a classification report is
    printed. The fitted model is saved to 'tabnet_model.pth'; the path is
    fixed, so every call overwrites the previous file.

    Args:
        X_train, y_train: training features and labels (the caller in this
            cell passes NumPy arrays).
        X_test, y_test: held-out features and labels, used only for the final
            evaluation.
        odds_test: accepted for signature parity with the other
            tune_and_evaluate_* helpers; not used inside this function.

    Returns:
        Tuple of (fitted TabNetClassifier, predictions for `X_test`).
    """

    def Objective_tabnet(trial):
        # The order of the suggest_* calls is kept fixed so the seeded sampler
        # stays reproducible.
        mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
        n_da = trial.suggest_int("n_da", 8, 32, step=8)
        n_steps = trial.suggest_int("n_steps", 3, 10, step=1)
        gamma = trial.suggest_float("gamma", 0.01, 0.2, step=0.01)
        n_shared = trial.suggest_int("n_shared", 1, 3)
        lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)

        optimizer_params = dict(lr=2e-2, weight_decay=1e-5)
        tabnet_params = dict(
            n_d=n_da,  # n_d and n_a share one tuned value
            n_a=n_da,
            n_steps=n_steps,
            gamma=gamma,
            lambda_sparse=lambda_sparse,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=optimizer_params,
            mask_type=mask_type,
            n_shared=n_shared,
            scheduler_params=dict(
                patience=trial.suggest_int("patienceScheduler", low=5, high=10),
                min_lr=1e-5,
                factor=0.5,
            ),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=0,
            seed=SEED  # Same seed for every trial
        )

        clf_tab_opt = TabNetClassifier(**tabnet_params, device_name='cuda' if torch.cuda.is_available() else 'cpu')

        # Cross-validation with accuracy as the scoring function.
        # NOTE(review): the eval set is the full training data, which contains
        # each fold's held-out rows, so early stopping inside CV sees them —
        # confirm this is intended.
        # NOTE(review): newer scikit-learn releases rename `fit_params` to
        # `params` — confirm against the installed version.
        accuracy_scorer = make_scorer(accuracy_score)
        scores = cross_val_score(
            clf_tab_opt,
            X_train,
            y_train,
            cv=5,
            scoring=accuracy_scorer,
            fit_params={
                'eval_set': [(X_train, y_train)],
                'max_epochs': trial.suggest_int("max_epochs", 10, 50),
                'patience': trial.suggest_int("patience", low=5, high=10),
                'batch_size': 64
            }
        )

        # Mean accuracy over the folds is the value Optuna maximizes
        accuracy = np.mean(scores)
        return accuracy

    # Configure Optuna logging and create the study with a fixed sampler seed
    optuna.logging.set_verbosity(optuna.logging.DEBUG)
    study_tabnet = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
    study_tabnet.optimize(Objective_tabnet, n_trials=30)

    # Keep the best parameters
    TabNet_params = study_tabnet.best_params
    print("Best parameters:", TabNet_params)

    # Final TabNet model parameters, rebuilt from the best trial
    final_params_tab = dict(
        n_d=TabNet_params['n_da'],
        n_a=TabNet_params['n_da'],
        n_steps=TabNet_params['n_steps'],
        gamma=TabNet_params['gamma'],
        lambda_sparse=TabNet_params['lambda_sparse'],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type=TabNet_params['mask_type'],
        n_shared=TabNet_params['n_shared'],
        scheduler_params=dict(
            patience=TabNet_params['patienceScheduler'],
            min_lr=1e-5,
            factor=0.5,
        ),
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        verbose=0,
        seed=SEED  # Same seed for the final model
    )

    # Train the best model
    epochs = TabNet_params['max_epochs']
    clf_tuned_tab = TabNetClassifier(**final_params_tab, device_name='cuda' if torch.cuda.is_available() else 'cpu')
    clf_tuned_tab.fit(
        X_train, y_train,
        # Early stopping must not monitor the test set: that would pick the
        # checkpoint on the very data used for the final evaluation. Use the
        # same eval set as the cross-validated trials instead.
        eval_set=[(X_train, y_train)],
        max_epochs=epochs,
        patience=TabNet_params['patience'],
        batch_size=64
    )

    # Predict on the held-out test set
    y_pred_tabnet = clf_tuned_tab.predict(X_test)

    # Accuracy and macro F1
    evaluate(y_test, y_pred_tabnet, "TabNet")

    # Confusion matrix and classification report
    cm = confusion_matrix(y_test, y_pred_tabnet)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title("Confusion Matrix: TabNet")
    plt.show()

    class_names = ["class_1", "class_2", "class_3"]
    print(classification_report(y_test, y_pred_tabnet, target_names=class_names))

    # Save the model (fixed file name, overwritten on every call)
    torch.save(clf_tuned_tab, 'tabnet_model.pth')
    print("Model saved as 'tabnet_model.pth'")

    return clf_tuned_tab, y_pred_tabnet  # Fitted model and its test predictions

# Per-variant TabNet results, keyed "resultsdf_TabNet_<variant_name>"
results_dict = {}

# Name of the test split each training variant is evaluated on
test_key_by_variant = {
    'trainmet_odds_met_balancing': 'testmet_odds_met_balancing',
    'trainmet_odds_zonder_balancing': 'testmet_odds_zonder_balancing',
    'trainzonder_odds_met_balancing': 'testzonder_odds_met_balancing',
    'trainzonder_odds_zonder_balancing': 'testzonder_odds_zonder_balancing',
}

# Loop through the dataset variants
for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:  # Skip test variants
        continue

    print(f"\nRunning TabNet on dataset variant: {variant_name}")

    # Class distribution in training data
    print("Class distribution in training data:", Counter(y_train))

    # Determine the appropriate test set based on the training variant
    test_key = test_key_by_variant.get(variant_name)
    if test_key is None:
        print(f"Variant name {variant_name} does not match any known test set, skipping.")
        continue

    X_test, y_test = datasets[test_key]
    # The odds always come from the 'testmet_odds_met_balancing' features,
    # regardless of which test split supplies the model inputs.
    odds_test = datasets['testmet_odds_met_balancing'][0][['B365H', 'B365D', 'B365A']].values

    # Convert DataFrames to NumPy arrays
    X_train = X_train.values
    y_train = y_train.values
    X_test = X_test.values
    y_test = y_test.values

    # Odds are matched to predictions by row position, so a length mismatch
    # would silently pair matches with the wrong odds.
    if len(odds_test) != len(y_test):
        raise ValueError(
            f"Odds rows ({len(odds_test)}) do not match test rows ({len(y_test)}) for {variant_name}"
        )

    # Train the model with the correct datasets
    best_model, y_pred = tune_and_evaluate_tabnet(X_train, y_train, X_test, y_test, odds_test)

    # One record per match: prediction, outcome, odds and the resulting profit
    variant_results = []
    total_profit = 0

    for predicted, actual, match_odds in zip(y_pred, y_test, odds_test):
        match_profit = calculate_profit([predicted], [actual], [match_odds])
        variant_results.append({
            'Predicted': predicted,
            'Actual': actual,
            'B365H': match_odds[0],
            'B365D': match_odds[1],
            'B365A': match_odds[2],
            'Profit': match_profit
        })
        total_profit += match_profit

    print(f"Total profit for {variant_name}: {total_profit}")

    # Create DataFrame for the current variant and store it in the dictionary.
    # The key names the model that produced the results (TabNet, not RF).
    results_df = pd.DataFrame(variant_results)
    results_dict[f"resultsdf_TabNet_{variant_name}"] = results_df

    print(f"Data for {variant_name} stored as resultsdf_TabNet_{variant_name}\n")

# Display all DataFrames in results_dict if needed
for name, df in results_dict.items():
    print(f"\n{name}:\n", df.head())

In [None]:
# 5-fold cross-validation (cv=5) with n_trials = 30 Optuna trials
import numpy as np
import torch
from sklearn.decomposition import PCA
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import optuna
import optuna.visualization.matplotlib as optuna_vis
import matplotlib.pyplot as plt
from collections import Counter

# Set the random seeds for reproducibility
SEED = 42  # Fixed seed; also passed to TabNet and the Optuna sampler later in this cell
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed the CUDA generator as well, request deterministic cuDNN behaviour
    # and switch off its benchmark mode.
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def evaluate(y_true, y_pred, model_name):
    """Print accuracy and macro-averaged F1 of `y_pred` against `y_true`, labelled with `model_name`."""
    accuracy, f1 = (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    )
    print(f"{model_name} - Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")

def calculate_profit(y_pred, y_test, odds, stake=1):
    """Return the net profit of staking `stake` on every prediction.

    A correct prediction pays ``stake * odd[k] - stake`` where ``k`` is the
    odds column of the predicted label (2 -> index 0 / B365H, 1 -> index 1 /
    B365D, 0 -> index 2 / B365A). A wrong prediction loses the stake. A
    correct prediction with any other label leaves the profit unchanged.
    """
    # (label, index into the odds row) pairs, checked in this order
    payout_columns = ((2, 0), (1, 1), (0, 2))
    profit = 0
    for prediction, actual, odd in zip(y_pred, y_test, odds):
        if not prediction == actual:
            profit -= stake
            continue
        column = next((idx for label, idx in payout_columns if prediction == label), None)
        if column is not None:
            profit += stake * odd[column] - stake
    return profit

def tune_and_evaluate_tabnet(X_train, y_train, X_test, y_test, odds_test):
    """Train and evaluate the TabNet model without PCA.

    A 30-trial TPE study (seeded with SEED) maximizes the mean 5-fold
    cross-validated accuracy on the training data. The best configuration is
    refit on all of `X_train` and evaluated on `X_test`: accuracy and macro F1
    are printed, a confusion matrix is plotted and a classification report is
    printed. The fitted model is saved to
    'tuned_tabnet_model_without_pca.pth'; the path is fixed, so every call
    overwrites the previous file.

    Args:
        X_train, y_train: training features and labels (the caller in this
            cell passes NumPy arrays).
        X_test, y_test: held-out features and labels, used only for the final
            evaluation.
        odds_test: accepted for signature parity with the other
            tune_and_evaluate_* helpers; not used inside this function.

    Returns:
        Tuple of (fitted TabNetClassifier, predictions for `X_test`).
    """

    def Objective_tabnet(trial):
        # The order of the suggest_* calls is kept fixed so the seeded sampler
        # stays reproducible.
        mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
        n_da = trial.suggest_int("n_da", 8, 32, step=8)
        n_steps = trial.suggest_int("n_steps", 3, 10, step=1)
        gamma = trial.suggest_float("gamma", 0.01, 0.2, step=0.01)
        n_shared = trial.suggest_int("n_shared", 1, 3)
        lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)

        optimizer_params = dict(lr=2e-2, weight_decay=1e-5)
        tabnet_params = dict(
            n_d=n_da,  # n_d and n_a share one tuned value
            n_a=n_da,
            n_steps=n_steps,
            gamma=gamma,
            lambda_sparse=lambda_sparse,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=optimizer_params,
            mask_type=mask_type,
            n_shared=n_shared,
            scheduler_params=dict(
                patience=trial.suggest_int("patienceScheduler", low=5, high=10),
                min_lr=1e-5,
                factor=0.5,
            ),
            scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
            verbose=0,
            seed=SEED  # Same seed for every trial
        )

        clf_tab_opt = TabNetClassifier(**tabnet_params, device_name='cuda' if torch.cuda.is_available() else 'cpu')

        # Cross-validation with accuracy as the scoring function.
        # NOTE(review): the eval set is the full training data, which contains
        # each fold's held-out rows, so early stopping inside CV sees them —
        # confirm this is intended.
        # NOTE(review): newer scikit-learn releases rename `fit_params` to
        # `params` — confirm against the installed version.
        accuracy_scorer = make_scorer(accuracy_score)
        scores = cross_val_score(
            clf_tab_opt,
            X_train,
            y_train,
            cv=5,
            scoring=accuracy_scorer,
            fit_params={
                'eval_set': [(X_train, y_train)],
                'max_epochs': trial.suggest_int("max_epochs", 10, 50),
                'patience': trial.suggest_int("patience", low=5, high=10),
                'batch_size': 64
            }
        )

        # Mean accuracy over the folds is the value Optuna maximizes
        accuracy = np.mean(scores)
        return accuracy

    # Configure Optuna logging and create the study with a fixed sampler seed
    optuna.logging.set_verbosity(optuna.logging.DEBUG)
    study_tabnet = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
    study_tabnet.optimize(Objective_tabnet, n_trials=30)

    # Keep the best parameters
    TabNet_params = study_tabnet.best_params
    print("Best parameters:", TabNet_params)

    # Final TabNet model parameters, rebuilt from the best trial
    final_params_tab = dict(
        n_d=TabNet_params['n_da'],
        n_a=TabNet_params['n_da'],
        n_steps=TabNet_params['n_steps'],
        gamma=TabNet_params['gamma'],
        lambda_sparse=TabNet_params['lambda_sparse'],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type=TabNet_params['mask_type'],
        n_shared=TabNet_params['n_shared'],
        scheduler_params=dict(
            patience=TabNet_params['patienceScheduler'],
            min_lr=1e-5,
            factor=0.5,
        ),
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        verbose=0,
        seed=SEED  # Same seed for the final model
    )

    # Train the best model
    epochs = TabNet_params['max_epochs']
    clf_tuned_tab = TabNetClassifier(**final_params_tab, device_name='cuda' if torch.cuda.is_available() else 'cpu')
    clf_tuned_tab.fit(
        X_train, y_train,
        # Early stopping must not monitor the test set: that would pick the
        # checkpoint on the very data used for the final evaluation. Use the
        # same eval set as the cross-validated trials instead.
        eval_set=[(X_train, y_train)],
        max_epochs=epochs,
        patience=TabNet_params['patience'],
        batch_size=64
    )

    # Predict on the held-out test set
    y_pred_tabnet = clf_tuned_tab.predict(X_test)

    # Accuracy and macro F1
    evaluate(y_test, y_pred_tabnet, "Tuned TabNet without PCA")

    # Confusion matrix and classification report
    cm = confusion_matrix(y_test, y_pred_tabnet)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title("Confusion Matrix: Tuned TabNet without PCA")
    plt.show()

    class_names = ["class_1", "class_2", "class_3"]
    print(classification_report(y_test, y_pred_tabnet, target_names=class_names))

    # Save the model (fixed file name, overwritten on every call)
    torch.save(clf_tuned_tab, 'tuned_tabnet_model_without_pca.pth')
    print("Model saved as 'tuned_tabnet_model_without_pca.pth'")

    return clf_tuned_tab, y_pred_tabnet  # Fitted model and its test predictions

# Per-variant TabNet results, keyed "resultsdf_TabNet_<variant_name>"
results_dict = {}

# Name of the test split each training variant is evaluated on
test_key_by_variant = {
    'trainmet_odds_met_balancing': 'testmet_odds_met_balancing',
    'trainmet_odds_zonder_balancing': 'testmet_odds_zonder_balancing',
    'trainzonder_odds_met_balancing': 'testzonder_odds_met_balancing',
    'trainzonder_odds_zonder_balancing': 'testzonder_odds_zonder_balancing',
}

# Loop through the dataset variants
for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:  # Skip test variants
        continue

    print(f"\nRunning TabNet on dataset variant: {variant_name}")

    # Class distribution in training data
    print("Class distribution in training data:", Counter(y_train))

    # Determine the appropriate test set based on the training variant
    test_key = test_key_by_variant.get(variant_name)
    if test_key is None:
        print(f"Variant name {variant_name} does not match any known test set, skipping.")
        continue

    X_test, y_test = datasets[test_key]
    # The odds always come from the 'testmet_odds_met_balancing' features,
    # regardless of which test split supplies the model inputs.
    odds_test = datasets['testmet_odds_met_balancing'][0][['B365H', 'B365D', 'B365A']].values

    # Convert DataFrames to NumPy arrays
    X_train = X_train.values
    y_train = y_train.values
    X_test = X_test.values
    y_test = y_test.values

    # Odds are matched to predictions by row position, so a length mismatch
    # would silently pair matches with the wrong odds.
    if len(odds_test) != len(y_test):
        raise ValueError(
            f"Odds rows ({len(odds_test)}) do not match test rows ({len(y_test)}) for {variant_name}"
        )

    # Train the model with the correct datasets
    best_model, y_pred = tune_and_evaluate_tabnet(X_train, y_train, X_test, y_test, odds_test)

    # One record per match: prediction, outcome, odds and the resulting profit
    variant_results = []
    total_profit = 0

    for predicted, actual, match_odds in zip(y_pred, y_test, odds_test):
        match_profit = calculate_profit([predicted], [actual], [match_odds])
        variant_results.append({
            'Predicted': predicted,
            'Actual': actual,
            'B365H': match_odds[0],
            'B365D': match_odds[1],
            'B365A': match_odds[2],
            'Profit': match_profit
        })
        total_profit += match_profit

    print(f"Total profit for {variant_name}: {total_profit}")

    # Create DataFrame for the current variant and store it in the dictionary.
    # The key names the model that produced the results (TabNet, not RF).
    # NOTE(review): any later cell that still reads "resultsdf_RF_<variant>"
    # from this dictionary must be updated — confirm.
    results_df = pd.DataFrame(variant_results)
    results_dict[f"resultsdf_TabNet_{variant_name}"] = results_df

    print(f"Data for {variant_name} stored as resultsdf_TabNet_{variant_name}\n")

# Display all DataFrames in results_dict if needed
for name, df in results_dict.items():
    print(f"\n{name}:\n", df.head())

In [None]:
import os

# Delete every entry of the current working directory whose name ends in ".png"
for filename in os.listdir():
    if not filename.endswith(".png"):
        continue  # leave everything that is not a PNG untouched
    os.remove(filename)
    print(f"Deleted: {filename}")


In [None]:
import shap
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np

# Feature names per training variant; used below as DataFrame column labels
# and as axis labels in the SHAP plots.
feature_names_dict = dict(
    trainmet_odds_met_balancing=[
        'B365H', 'B365D', 'B365A', 'Diff_S', 'Diff_ST',
        'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FPG', 'Diff_HPG',
    ],
    trainmet_odds_zonder_balancing=['B365H', 'B365A', 'Diff_S'],
    trainzonder_odds_met_balancing=[
        'Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y',
        'Diff_FTG', 'Diff_FPG', 'Diff_HPG', 'Diff_FRDEF', 'Diff_FRATT',
    ],
    trainzonder_odds_zonder_balancing=['Diff_S', 'Diff_ST', 'Diff_C', 'Diff_FRATT'],
)

# Fixed RandomForestClassifier keyword arguments for each training variant
predefined_hyperparameters = dict(
    trainmet_odds_met_balancing=dict(
        n_estimators=166, max_depth=5, min_samples_split=7,
        min_samples_leaf=3, max_features='log2', bootstrap=True,
    ),
    trainmet_odds_zonder_balancing=dict(
        n_estimators=166, max_depth=5, min_samples_split=7,
        min_samples_leaf=3, max_features='log2', bootstrap=True,
    ),
    trainzonder_odds_met_balancing=dict(
        n_estimators=196, max_depth=10, min_samples_split=6,
        min_samples_leaf=1, max_features='log2', bootstrap=False,
    ),
    trainzonder_odds_zonder_balancing=dict(
        n_estimators=187, max_depth=3, min_samples_split=5,
        min_samples_leaf=3, max_features='log2', bootstrap=True,
    ),
)

# Bar chart of the mean absolute SHAP value per feature for class 1
def generate_class1_summary_plot(model, X_test, feature_names, variant_name):
    """Plot and save the mean |SHAP value| of every feature for class index 1.

    Args:
        model: Fitted tree-based model that ``shap.TreeExplainer`` can explain.
        X_test: Samples for which the SHAP values are computed.
        feature_names: Bar labels, one per feature column of ``X_test``.
        variant_name: Dataset variant name, used in the title and the file name.

    The chart is written to ``RFshap_summary_bar_class1_{variant_name}.png``
    in the current working directory, then shown and closed.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Multi-class SHAP values come either as a list with one
    # (n_samples, n_features) array per class or, in newer SHAP releases, as a
    # single (n_samples, n_features, n_classes) array; accept both layouts.
    if isinstance(shap_values, list):
        class1_values = np.asarray(shap_values[1])
    else:
        shap_array = np.asarray(shap_values)
        class1_values = shap_array[:, :, 1] if shap_array.ndim == 3 else shap_array[1]

    # Mean absolute SHAP value per feature for class 1
    mean_shap_values_class1 = np.abs(class1_values).mean(axis=0)

    plt.figure(figsize=(9, 6))
    plt.barh(feature_names, mean_shap_values_class1, color='purple')

    # Bars keep the order of feature_names (they are not sorted by magnitude);
    # inverting the axis draws the first listed feature at the top.
    plt.gca().invert_yaxis()

    plt.xlabel("Mean(|SHAP value|) (Impact on Model Output Magnitude)", fontsize=12)
    plt.title(f"SHAP Summary (Class 1) - {variant_name}", fontsize=14)

    # No grid; keep only the left and bottom axis lines
    plt.grid(False)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['bottom'].set_visible(True)

    # Save before showing: once plt.show() returns, the current figure may
    # already have been released, and savefig would then write an empty image.
    plt.savefig(f"RFshap_summary_bar_class1_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

# Create and save the SHAP summary plots for a tree-based model
def generate_and_save_shap_plots(model, X_test, feature_names, variant_name):
    """Draw, save and show two SHAP summary plots for ``model``.

    1. A summary over all classes, saved as ``RF_shap_summary - {variant_name}.png``.
    2. A summary restricted to class index 1, saved as
       ``RFshap_summary_class1_{variant_name}.png``.

    Args:
        model: Fitted tree-based model that ``shap.TreeExplainer`` can explain.
        X_test: Samples to explain; also passed to ``shap.summary_plot`` as features.
        feature_names: Labels for the feature columns of ``X_test``.
        variant_name: Dataset variant name, used in titles and file names.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Newer SHAP releases return multi-class values as one
    # (n_samples, n_features, n_classes) array instead of a list with one
    # array per class; convert to the list form so both plots below behave
    # the same regardless of the installed version.
    if not isinstance(shap_values, list) and np.ndim(shap_values) == 3:
        shap_values = [shap_values[:, :, k] for k in range(np.shape(shap_values)[2])]

    # 1. Summary over all classes
    plt.figure(figsize=(12, 6))
    shap.summary_plot(shap_values, X_test, feature_names=feature_names, show=False)
    plt.title(f"SHAP Summary Plot - {variant_name}")
    # Save before showing; saving after plt.show() can produce an empty file.
    plt.savefig(f"RF_shap_summary - {variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

    # 2. Summary for class index 1 only
    plt.figure(figsize=(12, 6))
    shap.summary_plot(shap_values[1], X_test, feature_names=feature_names, show=False)
    plt.title(f"SHAP Summary (Class 1)_{variant_name}")
    plt.savefig(f"RFshap_summary_class1_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

# Fit a Random Forest for one variant, explain it with SHAP and evaluate it
def train_and_generate_plots(X_train, y_train, X_test, y_test, variant_name):
    """Train, explain and evaluate the Random Forest of ``variant_name``.

    The model uses ``predefined_hyperparameters[variant_name]`` with
    ``random_state=42``. After fitting, both SHAP plot helpers are called,
    accuracy and the classification report are printed, and the confusion
    matrix is saved as ``RFconfusion_matrix_{variant_name}.png`` and shown.
    """
    forest = RandomForestClassifier(random_state=42, **predefined_hyperparameters[variant_name])
    forest.fit(X_train, y_train)

    # Feature labels that belong to this variant
    labels = feature_names_dict[variant_name]

    # SHAP summaries first, then the class-1 bar chart
    generate_and_save_shap_plots(forest, X_test, labels, variant_name)
    generate_class1_summary_plot(forest, X_test, labels, variant_name)

    # Evaluate on the test data
    predictions = forest.predict(X_test)
    print(f"Results for {variant_name}:")
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    print(classification_report(y_test, predictions))

    # Confusion matrix figure
    matrix = confusion_matrix(y_test, predictions)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(cmap='Blues')
    plt.title(f"Confusion Matrix RF {variant_name}")
    plt.savefig(f"RFconfusion_matrix_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

# Train one Random Forest per training variant and create its SHAP plots
for variant_name, (X_train, y_train) in datasets.items():
    # Keys containing 'test' hold evaluation data and are never trained on
    if 'test' in variant_name:
        continue

    print(f"\nRunning RF on dataset variant: {variant_name}")

    # Each known training variant has a test set stored under the same key
    # with the leading 'train' replaced by 'test'
    if variant_name not in (
        'trainmet_odds_met_balancing',
        'trainmet_odds_zonder_balancing',
        'trainzonder_odds_met_balancing',
        'trainzonder_odds_zonder_balancing',
    ):
        print(f"Variant {variant_name} heeft geen bijbehorende testset. Skipping.")
        continue
    X_test, y_test = datasets[variant_name.replace('train', 'test', 1)]

    # Make sure both feature sets are DataFrames; anything else receives the
    # variant's feature names as column labels
    if isinstance(X_train, pd.DataFrame):
        X_train = pd.DataFrame(X_train)
    else:
        X_train = pd.DataFrame(X_train, columns=feature_names_dict[variant_name])
    if isinstance(X_test, pd.DataFrame):
        X_test = pd.DataFrame(X_test)
    else:
        X_test = pd.DataFrame(X_test, columns=feature_names_dict[variant_name])

    # Fit the model and produce the SHAP visualisations
    train_and_generate_plots(X_train, y_train, X_test, y_test, variant_name)



In [None]:
# Feature names per training variant; used below as DataFrame column labels
# and as tick labels of the feature-importance chart.
feature_names_dict = dict(
    trainmet_odds_met_balancing=[
        'B365H', 'B365D', 'B365A', 'Diff_S', 'Diff_ST',
        'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FPG', 'Diff_HPG',
    ],
    trainmet_odds_zonder_balancing=['B365H', 'B365A', 'Diff_S'],
    trainzonder_odds_met_balancing=[
        'Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y',
        'Diff_FTG', 'Diff_FPG', 'Diff_HPG', 'Diff_FRDEF', 'Diff_FRATT',
    ],
    trainzonder_odds_zonder_balancing=['Diff_S', 'Diff_ST', 'Diff_C', 'Diff_FRATT'],
)

# Fixed RandomForestClassifier keyword arguments for each training variant
predefined_hyperparameters = dict(
    trainmet_odds_met_balancing=dict(
        n_estimators=166, max_depth=5, min_samples_split=7,
        min_samples_leaf=3, max_features='log2', bootstrap=True,
    ),
    trainmet_odds_zonder_balancing=dict(
        n_estimators=166, max_depth=5, min_samples_split=7,
        min_samples_leaf=3, max_features='log2', bootstrap=True,
    ),
    trainzonder_odds_met_balancing=dict(
        n_estimators=196, max_depth=10, min_samples_split=6,
        min_samples_leaf=1, max_features='log2', bootstrap=False,
    ),
    trainzonder_odds_zonder_balancing=dict(
        n_estimators=187, max_depth=3, min_samples_split=5,
        min_samples_leaf=3, max_features='log2', bootstrap=True,
    ),
)

# Short labels (C1-C4) shown in plot titles instead of the long variant names
variant_name_mapping = dict(
    trainmet_odds_met_balancing='C1',
    trainmet_odds_zonder_balancing='C2',
    trainzonder_odds_met_balancing='C3',
    trainzonder_odds_zonder_balancing='C4',
)


# Horizontal bar chart of the Random Forest feature importances
def generate_feature_importance_plot_rf(model, feature_names, variant_name):
    """Plot ``model.feature_importances_`` as a horizontal bar chart and save it.

    Args:
        model: Fitted estimator exposing ``feature_importances_`` as a NumPy array.
        feature_names: Names indexed in the same order as ``feature_importances_``.
        variant_name: Dataset variant name, used in the output file name.

    The chart is saved as ``RF_Feature_Importance_{variant_name}.png``, shown
    and closed. The x-axis spans 0 to 0.5 in steps of 0.1; it is only widened
    (to the next multiple of 0.1) when an importance exceeds 0.5, so that no
    bar is cut off.
    """
    plt.figure(figsize=(8, 6))

    importances = model.feature_importances_

    # Ascending order: barh draws position 0 at the bottom, so the most
    # important feature ends up at the top of the chart.
    indices = importances.argsort()
    ordered_values = importances[indices]
    positions = range(len(indices))

    plt.yticks(positions, [feature_names[i] for i in indices], rotation=0, ha="right", fontsize=10)
    plt.barh(positions, ordered_values, align="center", color='teal')

    # Default axis range is 0-0.5; extend it only when a bar would not fit
    largest = float(ordered_values[-1]) if len(ordered_values) else 0.0
    upper_limit = max(0.50, float(np.ceil(largest * 10)) / 10)
    plt.xlim(0, upper_limit)
    plt.xticks([i * 0.10 for i in range(int(round(upper_limit * 10)) + 1)], fontsize=10)

    plt.grid(False)

    # Write each importance, rounded to two decimals, right of its bar
    for i, value in enumerate(ordered_values):
        plt.text(value + 0.01, i, f"{value:.2f}", va='center', fontsize=9, color='black')

    # Hide the top and right axis lines
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    plt.title(f"Feature Importance - Random Forest", fontsize=14)
    plt.xlabel("Importance", fontsize=12)
    plt.ylabel("Feature", fontsize=12)

    plt.tight_layout()
    plt.savefig(f"RF_Feature_Importance_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()
    
# Fit a Random Forest for one variant, plot its feature importances and evaluate it
def train_and_generate_plots_rf(X_train, y_train, X_test, y_test, variant_name):
    """Train and evaluate the Random Forest of ``variant_name``.

    The model uses ``predefined_hyperparameters[variant_name]`` with
    ``random_state=42``. After fitting, the feature-importance chart is
    generated, accuracy and the classification report are printed, and the
    confusion matrix is saved as ``RF_confusion_matrix_{variant_name}.png``.
    Its title uses the short label from ``variant_name_mapping`` when there
    is one, otherwise the variant name itself.
    """
    forest = RandomForestClassifier(random_state=42, **predefined_hyperparameters[variant_name])
    forest.fit(X_train, y_train)

    labels = feature_names_dict[variant_name]
    short_label = variant_name_mapping.get(variant_name, variant_name)

    # Feature-importance chart
    generate_feature_importance_plot_rf(forest, labels, variant_name)

    # Evaluate on the test data
    predictions = forest.predict(X_test)
    print(f"Results for {variant_name}:")
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    print(classification_report(y_test, predictions))

    # Confusion matrix figure
    matrix = confusion_matrix(y_test, predictions)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(cmap='Blues')
    plt.title(f"Confusion Matrix Random Forest - {short_label}")
    plt.savefig(f"RF_confusion_matrix_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

# Train one Random Forest per training variant and plot its feature importances
for variant_name, (X_train, y_train) in datasets.items():
    # Keys containing 'test' hold evaluation data and are never trained on
    if 'test' in variant_name:
        continue

    print(f"\nRunning Random Forest on dataset variant: {variant_name}")

    # Each known training variant has a test set stored under the same key
    # with the leading 'train' replaced by 'test'
    if variant_name not in (
        'trainmet_odds_met_balancing',
        'trainmet_odds_zonder_balancing',
        'trainzonder_odds_met_balancing',
        'trainzonder_odds_zonder_balancing',
    ):
        print(f"Variant {variant_name} heeft geen bijbehorende testset. Skipping.")
        continue
    X_test, y_test = datasets[variant_name.replace('train', 'test', 1)]

    # Make sure both feature sets are DataFrames; anything else receives the
    # variant's feature names as column labels
    if isinstance(X_train, pd.DataFrame):
        X_train = pd.DataFrame(X_train)
    else:
        X_train = pd.DataFrame(X_train, columns=feature_names_dict[variant_name])
    if isinstance(X_test, pd.DataFrame):
        X_test = pd.DataFrame(X_test)
    else:
        X_test = pd.DataFrame(X_test, columns=feature_names_dict[variant_name])

    # Fit the model and produce the plots
    train_and_generate_plots_rf(X_train, y_train, X_test, y_test, variant_name)




In [None]:
# Short labels (C1-C4) shown in plot titles instead of the long variant names
variant_name_mapping = dict(
    trainmet_odds_met_balancing='C1',
    trainmet_odds_zonder_balancing='C2',
    trainzonder_odds_met_balancing='C3',
    trainzonder_odds_zonder_balancing='C4',
)

# Manually maintained feature names for each training variant
feature_names_dict = dict(
    trainmet_odds_met_balancing=[
        'B365H', 'B365D', 'B365A', 'Diff_S', 'Diff_ST',
        'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FPG', 'Diff_HPG',
    ],
    trainmet_odds_zonder_balancing=['B365H', 'B365A', 'Diff_S'],
    trainzonder_odds_met_balancing=[
        'Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y',
        'Diff_FTG', 'Diff_FPG', 'Diff_HPG', 'Diff_FRDEF', 'Diff_FRATT',
    ],
    trainzonder_odds_zonder_balancing=['Diff_S', 'Diff_ST', 'Diff_C', 'Diff_FRATT'],
)

# Fixed XGBClassifier keyword arguments for each training variant
hyperparameters_dict = dict(
    trainmet_odds_met_balancing=dict(
        n_estimators=129,
        learning_rate=0.049876021595567685,
        max_depth=10,
        subsample=0.7849527378148702,
        min_child_weight=2,
        colsample_bytree=0.6120216069637563,
    ),
    trainmet_odds_zonder_balancing=dict(
        n_estimators=142,
        learning_rate=0.014241138185489662,
        max_depth=5,
        subsample=0.6020914931801977,
        min_child_weight=7,
        colsample_bytree=0.5505854790099324,
    ),
    trainzonder_odds_met_balancing=dict(
        n_estimators=144,
        learning_rate=0.018046202637800254,
        max_depth=10,
        subsample=0.7572792965555748,
        min_child_weight=1,
        colsample_bytree=0.90717604919467,
    ),
    trainzonder_odds_zonder_balancing=dict(
        n_estimators=176,
        learning_rate=0.00516504117880041,
        max_depth=6,
        subsample=0.5260425851574494,
        min_child_weight=7,
        colsample_bytree=0.6163624968902534,
    ),
)

# Horizontal bar chart of the normalised XGBoost feature importances
def generate_feature_importance_plot(model, feature_names, variant_name):
    """Plot the booster's 'weight' importances, normalised to sum to 1, and save the chart.

    Args:
        model: Fitted XGBoost model exposing ``get_booster().get_score(...)``.
        feature_names: Readable names; booster keys of the form ``f<index>``
            are translated through this list, any other key is shown as is.
        variant_name: Dataset variant name, used in the output file name.

    Only features returned by ``get_score`` appear in the chart. Bars, tick
    labels and value annotations are all placed from the same ascending
    ordering, so every label sits next to its own bar and the most important
    feature is drawn at the top. The x-axis spans 0 to 0.5 in steps of 0.1
    and is only widened (to the next multiple of 0.1) when a bar would not
    fit. The chart is saved as ``XGFeature_Importance_{variant_name}.png``.
    """
    importances = model.get_booster().get_score(importance_type='weight')

    # Ascending order: barh draws position 0 at the bottom, so the largest
    # importance ends up at the top.
    ranked = sorted(importances.items(), key=lambda item: item[1])
    total_importance = sum(score for _, score in ranked)
    normalized_values = [score / total_importance for _, score in ranked]

    def display_name(booster_name):
        # 'f0', 'f1', ... are positional keys into feature_names
        if booster_name[:1] == 'f' and booster_name[1:].isdigit():
            return feature_names[int(booster_name[1:])]
        return booster_name

    labels = [display_name(booster_name) for booster_name, _ in ranked]
    positions = range(len(ranked))

    plt.figure(figsize=(8, 6))
    plt.barh(positions, normalized_values, align="center", color='teal')
    plt.yticks(positions, labels, rotation=0, ha="right", fontsize=10)

    # Default axis range is 0-0.5; extend it only when a bar would not fit
    largest = normalized_values[-1] if normalized_values else 0.0
    upper_limit = max(0.50, float(np.ceil(largest * 10)) / 10)
    plt.xlim(0, upper_limit)
    plt.xticks([i * 0.10 for i in range(int(round(upper_limit * 10)) + 1)], fontsize=10)

    # Write each value, rounded to two decimals, right of its bar
    for position, value in enumerate(normalized_values):
        plt.text(value + 0.01, position, f"{value:.2f}", va='center', fontsize=9, color='black')

    # No grid, no top/right axis lines
    plt.grid(False)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    plt.title(f"XGBoost Feature Importance", fontsize=14)
    plt.xlabel("Importance", fontsize=12)
    plt.ylabel("Feature", fontsize=12)

    plt.tight_layout()
    plt.savefig(f"XGFeature_Importance_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()



# Fit an XGBoost model for one variant, plot its feature importances and evaluate it
def train_and_generate_plots(X_train, y_train, X_test, y_test, variant_name):
    """Train and evaluate the XGBoost classifier of ``variant_name``.

    The model uses ``hyperparameters_dict[variant_name]`` with
    ``random_state=42``. After fitting, the feature-importance chart is
    generated, accuracy and the classification report are printed, and the
    confusion matrix is saved as ``XG confusion_matrix_{variant_name}.png``.
    Its title uses the short label from ``variant_name_mapping`` when there
    is one, otherwise the variant name itself.
    """
    booster_model = XGBClassifier(**hyperparameters_dict[variant_name], random_state=42)
    booster_model.fit(X_train, y_train)

    labels = feature_names_dict[variant_name]
    short_label = variant_name_mapping.get(variant_name, variant_name)

    # Feature-importance chart
    generate_feature_importance_plot(booster_model, labels, variant_name)

    # Evaluate on the test data
    predictions = booster_model.predict(X_test)
    print(f"Results for {variant_name}:")
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    print(classification_report(y_test, predictions))

    # Confusion matrix figure
    matrix = confusion_matrix(y_test, predictions)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(cmap='Blues')
    plt.title(f"Confusion Matrix XG-Boost - {short_label}")
    plt.savefig(f"XG confusion_matrix_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

# Train one XGBoost model per training variant and plot its feature importances
for variant_name, (X_train, y_train) in datasets.items():
    # Keys containing 'test' hold evaluation data and are never trained on
    if 'test' in variant_name:
        continue

    print(f"\nRunning XGBoost for {variant_name}")

    # Each known training variant has a test set stored under the same key
    # with the leading 'train' replaced by 'test'
    if variant_name not in (
        'trainmet_odds_met_balancing',
        'trainmet_odds_zonder_balancing',
        'trainzonder_odds_met_balancing',
        'trainzonder_odds_zonder_balancing',
    ):
        print(f"Unknown variant name {variant_name}, skipping.")
        continue
    X_test, y_test = datasets[variant_name.replace('train', 'test', 1)]

    # Hand plain arrays (the .values of each object) to the model
    X_train, y_train = X_train.values, y_train.values
    X_test, y_test = X_test.values, y_test.values

    # Fit the model and produce the plots
    train_and_generate_plots(X_train, y_train, X_test, y_test, variant_name)


In [None]:
import shap
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np

# Manually maintained feature names for each training variant
feature_names_dict = dict(
    trainmet_odds_met_balancing=[
        'B365H', 'B365D', 'B365A', 'Diff_S', 'Diff_ST',
        'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FPG', 'Diff_HPG',
    ],
    trainmet_odds_zonder_balancing=['B365H', 'B365A', 'Diff_S'],
    trainzonder_odds_met_balancing=[
        'Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y',
        'Diff_FTG', 'Diff_FPG', 'Diff_HPG', 'Diff_FRDEF', 'Diff_FRATT',
    ],
    trainzonder_odds_zonder_balancing=['Diff_S', 'Diff_ST', 'Diff_C', 'Diff_FRATT'],
)

# Fixed XGBClassifier keyword arguments for each training variant
hyperparameters_dict = dict(
    trainmet_odds_met_balancing=dict(
        n_estimators=129,
        learning_rate=0.049876021595567685,
        max_depth=10,
        subsample=0.7849527378148702,
        min_child_weight=2,
        colsample_bytree=0.6120216069637563,
    ),
    trainmet_odds_zonder_balancing=dict(
        n_estimators=142,
        learning_rate=0.014241138185489662,
        max_depth=5,
        subsample=0.6020914931801977,
        min_child_weight=7,
        colsample_bytree=0.5505854790099324,
    ),
    trainzonder_odds_met_balancing=dict(
        n_estimators=144,
        learning_rate=0.018046202637800254,
        max_depth=10,
        subsample=0.7572792965555748,
        min_child_weight=1,
        colsample_bytree=0.90717604919467,
    ),
    trainzonder_odds_zonder_balancing=dict(
        n_estimators=176,
        learning_rate=0.00516504117880041,
        max_depth=6,
        subsample=0.5260425851574494,
        min_child_weight=7,
        colsample_bytree=0.6163624968902534,
    ),
)

# Bar chart of the mean absolute SHAP value per feature for class 1
def generate_class1_summary_plot(model, X_test, feature_names, variant_name):
    """Plot and save the mean |SHAP value| of every feature for class index 1.

    Args:
        model: Fitted tree-based model that ``shap.TreeExplainer`` can explain.
        X_test: Samples for which the SHAP values are computed.
        feature_names: Bar labels, one per feature column of ``X_test``.
        variant_name: Dataset variant name, used in the title and the file name.

    The chart is written to ``XGshap_summary_bar_class1_{variant_name}.png``
    in the current working directory, then shown and closed.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Multi-class SHAP values come either as a list with one
    # (n_samples, n_features) array per class or, in newer SHAP releases, as a
    # single (n_samples, n_features, n_classes) array; accept both layouts.
    if isinstance(shap_values, list):
        class1_values = np.asarray(shap_values[1])
    else:
        shap_array = np.asarray(shap_values)
        class1_values = shap_array[:, :, 1] if shap_array.ndim == 3 else shap_array[1]

    # Mean absolute SHAP value per feature for class 1
    mean_shap_values_class1 = np.abs(class1_values).mean(axis=0)

    plt.figure(figsize=(9, 6))
    plt.barh(feature_names, mean_shap_values_class1, color='purple')

    # Bars keep the order of feature_names (they are not sorted by magnitude);
    # inverting the axis draws the first listed feature at the top.
    plt.gca().invert_yaxis()

    plt.xlabel("Mean(|SHAP value|) (Impact on Model Output Magnitude)", fontsize=12)
    plt.title(f"SHAP Summary (Class 1) - {variant_name}", fontsize=14)

    # No grid; keep only the left and bottom axis lines
    plt.grid(False)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['bottom'].set_visible(True)

    # Save before showing: once plt.show() returns, the current figure may
    # already have been released, and savefig would then write an empty image.
    plt.savefig(f"XGshap_summary_bar_class1_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()



# Create and save the SHAP summary plots for a tree-based model
def generate_and_save_shap_plots(model, X_test, feature_names, variant_name):
    """Draw, save and show two SHAP summary plots for ``model``.

    1. A summary over all classes, saved as ``XGshap_summary_{variant_name}.png``.
    2. A summary restricted to class index 1, saved as
       ``XGshap_summary_class1_{variant_name}.png``.

    Args:
        model: Fitted tree-based model that ``shap.TreeExplainer`` can explain.
        X_test: Samples to explain; also passed to ``shap.summary_plot`` as features.
        feature_names: Labels for the feature columns of ``X_test``.
        variant_name: Dataset variant name, used in titles and file names.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Newer SHAP releases return multi-class values as one
    # (n_samples, n_features, n_classes) array instead of a list with one
    # array per class; convert to the list form so both plots below behave
    # the same regardless of the installed version.
    if not isinstance(shap_values, list) and np.ndim(shap_values) == 3:
        shap_values = [shap_values[:, :, k] for k in range(np.shape(shap_values)[2])]

    # 1. Summary over all classes
    plt.figure(figsize=(12, 6))
    shap.summary_plot(shap_values, X_test, feature_names=feature_names, show=False)
    plt.title(f"SHAP Summary Plot - {variant_name}")
    # Save before showing; saving after plt.show() can produce an empty file.
    plt.savefig(f"XGshap_summary_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

    # 2. Summary for class index 1 only
    plt.figure(figsize=(12, 6))
    shap.summary_plot(shap_values[1], X_test, feature_names=feature_names, show=False)
    plt.title(f"SHAP Summary (Class 1) - {variant_name}")
    plt.savefig(f"XGshap_summary_class1_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

# Fit an XGBoost model for one variant, explain it with SHAP and evaluate it
def train_and_generate_plots(X_train, y_train, X_test, y_test, variant_name):
    """Train, explain and evaluate the XGBoost classifier of ``variant_name``.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels used for the SHAP plots and
            the evaluation.
        variant_name: Key into ``hyperparameters_dict`` and
            ``feature_names_dict``; also used in titles and file names.

    Raises:
        KeyError: If ``variant_name`` is missing from either dictionary.

    The model uses ``hyperparameters_dict[variant_name]`` with
    ``random_state=42``. After fitting, both SHAP plot helpers are called,
    accuracy and the classification report are printed, and the confusion
    matrix is saved as ``confusion_matrix_{variant_name}.png`` and shown.
    """
    params = hyperparameters_dict[variant_name]

    model = XGBClassifier(**params, random_state=42)
    model.fit(X_train, y_train)

    feature_names = feature_names_dict[variant_name]

    # SHAP summaries first, then the class-1 bar chart
    generate_and_save_shap_plots(model, X_test, feature_names, variant_name)
    generate_class1_summary_plot(model, X_test, feature_names, variant_name)

    # Evaluate on the test data
    y_pred = model.predict(X_test)
    print(f"Results for {variant_name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

    # Confusion matrix figure
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f"XGConfusion Matrix for {variant_name}")
    # Save before showing; saving after plt.show() can produce an empty file.
    plt.savefig(f"confusion_matrix_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

# Train one XGBoost model per training variant and create its SHAP plots
for variant_name, (X_train, y_train) in datasets.items():
    # Keys containing 'test' hold evaluation data and are never trained on
    if 'test' in variant_name:
        continue

    print(f"\nRunning XGBoost for {variant_name}")

    # Each known training variant has a test set stored under the same key
    # with the leading 'train' replaced by 'test'
    if variant_name not in (
        'trainmet_odds_met_balancing',
        'trainmet_odds_zonder_balancing',
        'trainzonder_odds_met_balancing',
        'trainzonder_odds_zonder_balancing',
    ):
        print(f"Unknown variant name {variant_name}, skipping.")
        continue
    X_test, y_test = datasets[variant_name.replace('train', 'test', 1)]

    # Hand plain arrays (the .values of each object) to the model
    X_train, y_train = X_train.values, y_train.values
    X_test, y_test = X_test.values, y_test.values

    # Fit the model and produce the plots
    train_and_generate_plots(X_train, y_train, X_test, y_test, variant_name)


In [None]:
# Fix the random seeds so that repeated runs give consistent results
SEED = 42  # one fixed seed, shared by NumPy, PyTorch and the TabNet model below
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Also seed the current CUDA device and request deterministic cuDNN
    # behaviour (benchmark mode off)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Short labels (C1-C4) shown in plot titles instead of the long variant names
variant_name_mapping = dict(
    trainmet_odds_met_balancing='C1',
    trainmet_odds_zonder_balancing='C2',
    trainzonder_odds_met_balancing='C3',
    trainzonder_odds_zonder_balancing='C4',
)

# Feature names for each training variant, used as chart tick labels
feature_names_dict = dict(
    trainmet_odds_met_balancing=[
        'B365H', 'B365D', 'B365A', 'Diff_S', 'Diff_ST',
        'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FPG', 'Diff_HPG',
    ],
    trainmet_odds_zonder_balancing=['B365H', 'B365A', 'Diff_S'],
    trainzonder_odds_met_balancing=[
        'Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y',
        'Diff_FTG', 'Diff_FPG', 'Diff_HPG', 'Diff_FRDEF', 'Diff_FRATT',
    ],
    trainzonder_odds_zonder_balancing=['Diff_S', 'Diff_ST', 'Diff_C', 'Diff_FRATT'],
)

# Fixed TabNet settings per training variant. 'n_da' is used for both n_d and
# n_a of the classifier; 'patienceScheduler' is the LR-scheduler patience and
# 'patience' the patience passed to fit().
tabnet_hyperparameters = dict(
    trainmet_odds_met_balancing=dict(
        mask_type='sparsemax', n_da=24, n_steps=8, gamma=0.02, n_shared=2,
        lambda_sparse=7.891392918016076e-05,
        patienceScheduler=9, max_epochs=50, patience=7,
    ),
    trainmet_odds_zonder_balancing=dict(
        mask_type='entmax', n_da=16, n_steps=5, gamma=0.1, n_shared=3,
        lambda_sparse=0.00035632918778922634,
        patienceScheduler=8, max_epochs=29, patience=5,
    ),
    trainzonder_odds_met_balancing=dict(
        mask_type='sparsemax', n_da=24, n_steps=8, gamma=0.01, n_shared=1,
        lambda_sparse=5.097139359389593e-05,
        patienceScheduler=10, max_epochs=50, patience=10,
    ),
    trainzonder_odds_zonder_balancing=dict(
        mask_type='sparsemax', n_da=16, n_steps=3, gamma=0.060000000000000005, n_shared=1,
        lambda_sparse=0.0002059204000149221,
        patienceScheduler=10, max_epochs=16, patience=9,
    ),
)

# Train a TabNet model for one variant and plot its feature importances and confusion matrix
def load_and_evaluate_tabnet_confusion_matrix(X_train, y_train, X_test, y_test, variant_name):
    """Train the TabNet classifier of ``variant_name`` and save two figures.

    Despite its name, this function does not load a stored model: it builds a
    new ``TabNetClassifier`` from ``tabnet_hyperparameters[variant_name]`` and
    fits it on the training data.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels; passed to ``fit`` as
            ``eval_set`` and used for the predictions.
        variant_name: Key into ``tabnet_hyperparameters`` and
            ``feature_names_dict``; also used in file names.

    Raises:
        KeyError: If ``variant_name`` is missing from either dictionary.

    Outputs:
        ``Tabnet_Feature_Importance_{variant_name}.png``: horizontal bar chart
        of ``feature_importances_``, most important feature at the top. The
        x-axis spans 0 to 0.5 and is only widened when a bar would not fit.
        ``Tabnet_ConfusionMatrix{variant_name}.png``: confusion matrix on the
        test data; the matrix is also printed.
    """
    params = tabnet_hyperparameters[variant_name]

    tabnet_model = TabNetClassifier(
        n_d=params['n_da'],
        n_a=params['n_da'],
        n_steps=params['n_steps'],
        gamma=params['gamma'],
        lambda_sparse=params['lambda_sparse'],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type=params['mask_type'],
        n_shared=params['n_shared'],
        scheduler_params=dict(
            patience=params['patienceScheduler'],
            min_lr=1e-5,
            factor=0.5,
        ),
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        verbose=0,
        seed=SEED,
    )

    # NOTE(review): the test split is passed as eval_set, so it presumably also
    # drives early stopping (patience); the scores below may therefore be
    # optimistic. Consider a separate validation split - confirm before changing.
    tabnet_model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        max_epochs=params['max_epochs'],
        patience=params['patience'],
        batch_size=64
    )

    feature_names = feature_names_dict[variant_name]
    y_pred_tabnet = tabnet_model.predict(X_test)

    # --- Feature-importance chart -------------------------------------------
    importances = tabnet_model.feature_importances_

    # Ascending order: barh draws position 0 at the bottom, so the most
    # important feature ends up at the top.
    indices = importances.argsort()
    ordered_values = importances[indices]
    positions = range(len(indices))

    plt.figure(figsize=(8, 6))

    # Default axis range is 0-0.5; extend it only when a bar would not fit
    largest = float(ordered_values[-1]) if len(ordered_values) else 0.0
    upper_limit = max(0.50, float(np.ceil(largest * 10)) / 10)
    plt.xlim(0, upper_limit)
    plt.xticks([i * 0.10 for i in range(int(round(upper_limit * 10)) + 1)], fontsize=10)

    plt.yticks(positions, [feature_names[i] for i in indices], rotation=0, ha="right", fontsize=10)
    # Write each importance, rounded to two decimals, right of its bar
    for i, value in enumerate(ordered_values):
        plt.text(value + 0.01, i, f"{value:.2f}", va='center', fontsize=9, color='black')
    plt.barh(positions, ordered_values, align="center", color='teal')

    # No grid, no top/right axis lines
    plt.grid(False)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)

    plt.title(f"Feature Importance - TabNet", fontsize=14)
    plt.xlabel("Importance", fontsize=12)
    plt.ylabel("Feature", fontsize=12)

    plt.tight_layout()
    plt.savefig(f"Tabnet_Feature_Importance_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

    # --- Confusion matrix ----------------------------------------------------
    readable_name = variant_name_mapping.get(variant_name, variant_name)

    cm = confusion_matrix(y_test, y_pred_tabnet)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues')
    plt.title(f"Confusion Matrix TabNet - {readable_name}")
    plt.savefig(f"Tabnet_ConfusionMatrix{variant_name}.png", bbox_inches='tight')
    plt.show()
    # Close this figure as well so figures do not pile up across variants
    plt.close()

    print(f"Confusion Matrix for {variant_name}:")
    print(cm)

# Train one TabNet model per training variant (feature importances and confusion matrix, no SHAP plots)
for variant_name, (X_train, y_train) in datasets.items():
    # Keys containing 'test' hold evaluation data and are never trained on
    if 'test' in variant_name:
        continue

    print(f"\nRunning TabNet on dataset variant: {variant_name}")

    # Each known training variant has a test set stored under the same key
    # with the leading 'train' replaced by 'test'
    if variant_name not in (
        'trainmet_odds_met_balancing',
        'trainmet_odds_zonder_balancing',
        'trainzonder_odds_met_balancing',
        'trainzonder_odds_zonder_balancing',
    ):
        print(f"Unknown variant name {variant_name}, skipping.")
        continue
    X_test, y_test = datasets[variant_name.replace('train', 'test', 1)]

    # Hand plain arrays (the .values of each object) to the model
    X_train, y_train = X_train.values, y_train.values
    X_test, y_test = X_test.values, y_test.values

    # Fit the model and produce the figures
    load_and_evaluate_tabnet_confusion_matrix(X_train, y_train, X_test, y_test, variant_name)



In [None]:
# Fix the random seeds so that repeated runs give consistent results
SEED = 42  # one fixed seed for NumPy and PyTorch
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Also seed the current CUDA device and request deterministic cuDNN
    # behaviour (benchmark mode off)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Feature names for each training variant
feature_names_dict = dict(
    trainmet_odds_met_balancing=[
        'B365H', 'B365D', 'B365A', 'Diff_S', 'Diff_ST',
        'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FPG', 'Diff_HPG',
    ],
    trainmet_odds_zonder_balancing=['B365H', 'B365A', 'Diff_S'],
    trainzonder_odds_met_balancing=[
        'Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y',
        'Diff_FTG', 'Diff_FPG', 'Diff_HPG', 'Diff_FRDEF', 'Diff_FRATT',
    ],
    trainzonder_odds_zonder_balancing=['Diff_S', 'Diff_ST', 'Diff_C', 'Diff_FRATT'],
)

# Fixed TabNet settings per training variant. 'n_da' is used for both n_d and
# n_a of the classifier.
tabnet_hyperparameters = dict(
    trainmet_odds_met_balancing=dict(
        mask_type='sparsemax', n_da=24, n_steps=8, gamma=0.02, n_shared=2,
        lambda_sparse=7.891392918016076e-05,
        patienceScheduler=9, max_epochs=50, patience=7,
    ),
    trainmet_odds_zonder_balancing=dict(
        mask_type='entmax', n_da=16, n_steps=5, gamma=0.1, n_shared=3,
        lambda_sparse=0.00035632918778922634,
        patienceScheduler=8, max_epochs=29, patience=5,
    ),
    trainzonder_odds_met_balancing=dict(
        mask_type='sparsemax', n_da=24, n_steps=8, gamma=0.01, n_shared=1,
        lambda_sparse=5.097139359389593e-05,
        patienceScheduler=10, max_epochs=50, patience=10,
    ),
    trainzonder_odds_zonder_balancing=dict(
        mask_type='sparsemax', n_da=16, n_steps=3, gamma=0.060000000000000005, n_shared=1,
        lambda_sparse=0.0002059204000149221,
        patienceScheduler=10, max_epochs=16, patience=9,
    ),
)

# Create and save the SHAP summary plots for a TabNet model
def generate_shap_plot_tabnet(model, X_test, feature_names, variant_name):
    """Draw, save and show two SHAP summary plots for a TabNet classifier.

    1. A summary of the SHAP values of ``model.predict`` (the predicted
       label), saved as ``TabNet_shap_summary_{variant_name}.png``.
    2. A summary for class index 1, computed from ``model.predict_proba`` so
       that a per-class slice exists, saved as
       ``TabNet_shap_summary_class1_{variant_name}.png``.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: 2-D array of samples; used as background data and as the
            samples to explain.
        feature_names: Labels for the feature columns of ``X_test``.
        variant_name: Dataset variant name, used in titles and file names.

    Both files use a TabNet prefix so they do not overwrite the XGBoost SHAP
    images, and each figure is saved before it is shown.
    """
    # SHAP values of the predicted label: one value per sample and feature
    explainer = shap.Explainer(model.predict, X_test)
    shap_values = explainer(X_test)

    # Shapes are printed to ease debugging
    print("SHAP values shape:", shap_values.values.shape)
    print("X_test shape:", X_test.shape)

    # 1. Summary plot of the label-level SHAP values
    plt.figure(figsize=(12, 6))
    shap.summary_plot(shap_values.values, X_test, feature_names=feature_names, show=False)
    plt.title(f"SHAP Summary Plot - {variant_name}")
    # Save before showing; saving after plt.show() can produce an empty file.
    plt.savefig(f"TabNet_shap_summary_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

    # 2. Summary plot for class 1. The label-level values above have no class
    # axis (indexing them with [1] selects sample 1, not class 1), so the
    # class probabilities are explained instead and the class-1 slice of the
    # resulting (n_samples, n_features, n_classes) array is plotted.
    proba_explainer = shap.Explainer(model.predict_proba, X_test)
    class1_values = np.asarray(proba_explainer(X_test).values)[:, :, 1]

    plt.figure(figsize=(12, 6))
    shap.summary_plot(class1_values, X_test, feature_names=feature_names, show=False)
    plt.title(f"SHAP Summary (Class 1) - {variant_name}")
    plt.savefig(f"TabNet_shap_summary_class1_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

        
def load_and_evaluate_tabnet(X_train, y_train, X_test, y_test, feature_names, variant_name):
    """Train a TabNet classifier for one variant, plot SHAP values and report test metrics.

    Args:
        X_train, y_train: training features and labels (numpy arrays).
        X_test, y_test: test features and labels (numpy arrays); also passed to
            ``fit`` as the evaluation set.
        feature_names: column labels forwarded to the SHAP plots.
        variant_name: key into ``tabnet_hyperparameters``; also used in titles.

    Side effects:
        Produces the SHAP plots, shows a confusion matrix and prints a
        classification report.
    """
    params = tabnet_hyperparameters[variant_name]

    tabnet_model = TabNetClassifier(
        n_d=params['n_da'],
        n_a=params['n_da'],
        n_steps=params['n_steps'],
        gamma=params['gamma'],
        lambda_sparse=params['lambda_sparse'],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type=params['mask_type'],
        n_shared=params['n_shared'],
        scheduler_params=dict(
            patience=params['patienceScheduler'],
            min_lr=1e-5,
            factor=0.5,
        ),
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        verbose=0,
        seed=SEED,
    )

    # NOTE(review): the test split doubles as the eval set that drives
    # `patience`, so the reported test metrics are optimistic -- consider a
    # separate validation split.
    tabnet_model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        max_epochs=params['max_epochs'],
        patience=params['patience'],
        batch_size=64
    )

    y_pred_tabnet = tabnet_model.predict(X_test)

    generate_shap_plot_tabnet(tabnet_model, X_test, feature_names, variant_name)

    # Report against the label set seen during training. A small test sample
    # can miss a class, and classification_report then rejects the three
    # target_names because the label count no longer matches.
    labels = tabnet_model.classes_
    cm = confusion_matrix(y_test, y_pred_tabnet, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap='Blues')
    plt.title(f"Confusion Matrix: Tuned TabNet for {variant_name}")
    plt.show()

    class_names = ["class_1", "class_2", "class_3"]
    print(classification_report(y_test, y_pred_tabnet, labels=labels, target_names=class_names))

# Train and explain one TabNet model per training variant
for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:  # test splits are looked up below, not iterated
        continue

    print(f"\nRunning TabNet on dataset variant: {variant_name}")

    # Without this guard an unrecognised variant would silently reuse the
    # X_test / y_test left over from the previous iteration.
    if variant_name not in feature_names_dict:
        print(f"Unknown variant name {variant_name}, skipping.")
        continue

    # Every 'train...' key is paired with the 'test...' key of the same suffix
    test_key = variant_name.replace('train', 'test', 1)
    X_test = datasets[test_key][0].head(5)  # only the first 5 rows are used here
    y_test = datasets[test_key][1].head(5)

    # TabNet works on numpy arrays, not on pandas objects
    X_train = X_train.values
    y_train = y_train.values
    X_test = X_test.values
    y_test = y_test.values

    load_and_evaluate_tabnet(X_train, y_train, X_test, y_test, feature_names_dict[variant_name], variant_name)


In [None]:
# Define the hyperparameters for the TabNet models
import shap
import matplotlib.pyplot as plt
import torch
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay

# Fix the random seeds so results are consistent between runs
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Column names per training variant, used to label the SHAP plots
feature_names_dict = {
    'trainmet_odds_met_balancing': ['B365H', 'B365D', 'B365A', 'Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FPG', 'Diff_HPG'],
    'trainmet_odds_zonder_balancing': ['B365H', 'B365A', 'Diff_S'],
    'trainzonder_odds_met_balancing': ['Diff_S', 'Diff_ST', 'Diff_F', 'Diff_C', 'Diff_Y', 'Diff_FTG', 'Diff_FPG', 'Diff_HPG', 'Diff_FRDEF', 'Diff_FRATT'],
    'trainzonder_odds_zonder_balancing': ['Diff_S', 'Diff_ST', 'Diff_C', 'Diff_FRATT']
}

# `variant_name` is not defined in this cell: it can only be a leftover from a
# loop in an earlier cell. Use it when it is present and valid, but do not
# crash on a fresh kernel (NameError) or when the leftover is a test key
# (KeyError).
if 'variant_name' in globals() and variant_name in feature_names_dict:
    feature_names = feature_names_dict[variant_name]
    print(f"Feature names for {variant_name}: {feature_names}")

# Preferred hyperparameters for the TabNet models, one entry per training variant
tabnet_hyperparameters = {
    'trainmet_odds_met_balancing': {
        'mask_type': 'sparsemax', 'n_da': 24, 'n_steps': 8, 'gamma': 0.02, 'n_shared': 2,
        'lambda_sparse': 7.891392918016076e-05, 'patienceScheduler': 9, 'max_epochs': 50, 'patience': 7
    },
    'trainmet_odds_zonder_balancing': {
        'mask_type': 'entmax', 'n_da': 16, 'n_steps': 5, 'gamma': 0.1, 'n_shared': 3,
        'lambda_sparse': 0.00035632918778922634, 'patienceScheduler': 8, 'max_epochs': 29, 'patience': 5
    },
    'trainzonder_odds_met_balancing': {
        'mask_type': 'sparsemax', 'n_da': 24, 'n_steps': 8, 'gamma': 0.01, 'n_shared': 1,
        'lambda_sparse': 5.097139359389593e-05, 'patienceScheduler': 10, 'max_epochs': 50, 'patience': 10
    },
    'trainzonder_odds_zonder_balancing': {
        'mask_type': 'sparsemax', 'n_da': 16, 'n_steps': 3, 'gamma': 0.060000000000000005, 'n_shared': 1,
        'lambda_sparse': 0.0002059204000149221, 'patienceScheduler': 10, 'max_epochs': 16, 'patience': 9
    }
}


def generate_shap_plot_tabnet(model, X_test, feature_names, variant_name, class_index=1):
    """Explain a fitted TabNet classifier with SHAP and save three plots.

    SHAP values are computed on ``model.predict_proba`` so the result has a
    class axis. Indexing the explanation of ``model.predict`` with ``[1]``, as
    was done before, selects *sample* 1 and not class 1.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        X_test: feature matrix used as SHAP background data and as the rows
            to explain.
        feature_names: labels for the columns of ``X_test``.
        variant_name: dataset variant, used in plot titles and file names.
        class_index: ``predict_proba`` column used for the per-class plots
            (defaults to 1, matching the original "Class 1" plots).

    Side effects:
        Shows each figure and writes three PNG files prefixed with
        ``TabNet_shap_summary`` to the working directory.
    """
    explainer = shap.Explainer(model.predict_proba, X_test)
    shap_values = explainer(X_test)

    # Expected layout: (samples, features, classes), one slice per probability column
    values = np.asarray(shap_values.values)
    per_class_values = [values[:, :, k] for k in range(values.shape[2])]
    class_values = per_class_values[class_index]

    # 1. Summary over all classes (a list of matrices is the multi-output form)
    plt.figure(figsize=(12, 6))
    shap.summary_plot(per_class_values, X_test, feature_names=feature_names, show=False)
    plt.title(f"SHAP Summary Plot - {variant_name}")
    # Save before show(): once the figure has been shown, savefig writes an empty canvas
    plt.savefig(f"TabNet_shap_summary_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

    # 2. Summary restricted to the requested class
    plt.figure(figsize=(12, 6))
    shap.summary_plot(class_values, X_test, feature_names=feature_names, show=False)
    plt.title(f"SHAP Summary (Class {class_index}) - {variant_name}")
    plt.savefig(f"TabNet_shap_summary_class{class_index}_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

    # 3. Bar chart of the mean absolute SHAP value per feature for that class.
    #    Average over samples (axis 0) and sort so the strongest feature really
    #    ends up on top after the y-axis is inverted.
    mean_abs_shap = np.abs(class_values).mean(axis=0)
    order = np.argsort(mean_abs_shap)[::-1]
    plt.figure(figsize=(9, 6))
    plt.barh([feature_names[i] for i in order], mean_abs_shap[order], color='purple')
    plt.gca().invert_yaxis()
    plt.xlabel("Mean(|SHAP value|) (Impact on Model Output Magnitude)", fontsize=12)
    plt.title(f"SHAP Summary (Class {class_index}) - {variant_name}", fontsize=14)
    plt.grid(False)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_visible(True)
    plt.gca().spines['bottom'].set_visible(True)
    plt.savefig(f"TabNet_shap_summary_bar_class{class_index}_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

def load_and_evaluate_tabnet(X_train, y_train, X_test, y_test, feature_names, variant_name):
    """Fit the TabNet model configured for ``variant_name`` and report on the test data.

    The hyperparameters come from ``tabnet_hyperparameters[variant_name]``. The
    test data is also handed to ``fit`` as the evaluation set. After fitting,
    the function draws the SHAP plots, prints accuracy and macro F1, shows a
    confusion matrix and prints a classification report.
    """
    hp = tabnet_hyperparameters[variant_name]
    width = hp['n_da']  # the same value feeds both n_d and n_a
    lr_schedule = dict(patience=hp['patienceScheduler'], min_lr=1e-5, factor=0.5)

    classifier = TabNetClassifier(
        n_d=width,
        n_a=width,
        n_steps=hp['n_steps'],
        gamma=hp['gamma'],
        lambda_sparse=hp['lambda_sparse'],
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': 2e-2, 'weight_decay': 1e-5},
        mask_type=hp['mask_type'],
        n_shared=hp['n_shared'],
        scheduler_params=lr_schedule,
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        verbose=0,
        seed=SEED,
    )

    classifier.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        max_epochs=hp['max_epochs'],
        patience=hp['patience'],
        batch_size=64,
    )

    predictions = classifier.predict(X_test)

    # SHAP plots for the fitted model
    generate_shap_plot_tabnet(classifier, X_test, feature_names, variant_name)

    # Headline metrics
    acc = accuracy_score(y_test, predictions)
    macro_f1 = f1_score(y_test, predictions, average='macro')
    print(f"{variant_name} - Accuracy: {acc:.4f}, F1-score: {macro_f1:.4f}")

    # Confusion matrix figure
    ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, predictions)).plot(cmap='Blues')
    plt.title(f"Confusion Matrix: Tuned TabNet for {variant_name}")
    plt.show()

    # Per-class precision / recall / F1
    print(classification_report(y_test, predictions, target_names=["class_1", "class_2", "class_3"]))

# Train and explain one TabNet model per training variant
for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:  # test splits are looked up below, not iterated
        continue

    print(f"\nRunning TabNet on dataset variant: {variant_name}")

    if variant_name not in feature_names_dict:
        print(f"Unknown variant name {variant_name}, skipping.")
        continue

    # Every 'train...' key is paired with the 'test...' key of the same suffix
    test_key = variant_name.replace('train', 'test', 1)
    X_test = datasets[test_key][0]
    y_test = datasets[test_key][1]

    # Look the names up per variant. The module-level `feature_names` was
    # resolved once, from whatever `variant_name` held before this loop, so
    # every model was plotted with the same (possibly wrong-length) labels.
    feature_names = feature_names_dict[variant_name]

    # TabNet works on numpy arrays, not on pandas objects
    X_train = X_train.values
    y_train = y_train.values
    X_test = X_test.values
    y_test = y_test.values

    # Train the model and generate the SHAP plots
    load_and_evaluate_tabnet(X_train, y_train, X_test, y_test, feature_names, variant_name)


In [None]:
import shap
import matplotlib.pyplot as plt
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import numpy as np

# Seed numpy and torch (CPU and, when present, CUDA) for repeatable runs
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Preferred hyperparameters for the TabNet models, keyed by training variant
tabnet_hyperparameters = {
    'trainmet_odds_met_balancing': {
        'mask_type': 'sparsemax',
        'n_da': 24,
        'n_steps': 8,
        'gamma': 0.02,
        'n_shared': 2,
        'lambda_sparse': 7.891392918016076e-05,
        'patienceScheduler': 9,
        'max_epochs': 50,
        'patience': 7,
    },
    'trainmet_odds_zonder_balancing': {
        'mask_type': 'entmax',
        'n_da': 16,
        'n_steps': 5,
        'gamma': 0.1,
        'n_shared': 3,
        'lambda_sparse': 0.00035632918778922634,
        'patienceScheduler': 8,
        'max_epochs': 29,
        'patience': 5,
    },
    'trainzonder_odds_met_balancing': {
        'mask_type': 'sparsemax',
        'n_da': 24,
        'n_steps': 8,
        'gamma': 0.01,
        'n_shared': 1,
        'lambda_sparse': 5.097139359389593e-05,
        'patienceScheduler': 10,
        'max_epochs': 50,
        'patience': 10,
    },
    'trainzonder_odds_zonder_balancing': {
        'mask_type': 'sparsemax',
        'n_da': 16,
        'n_steps': 3,
        'gamma': 0.060000000000000005,
        'n_shared': 1,
        'lambda_sparse': 0.0002059204000149221,
        'patienceScheduler': 10,
        'max_epochs': 16,
        'patience': 9,
    },
}

def generate_shap_plot_tabnet(model, X_test):
    """Draw a SHAP summary plot for ``model.predict`` evaluated on ``X_test``.

    ``X_test`` serves both as the data handed to ``shap.Explainer`` and as the
    rows that are explained.
    """
    explanation = shap.Explainer(model.predict, X_test)(X_test)
    shap.summary_plot(explanation, X_test)

def evaluate(y_true, y_pred, model_name):
    """Print accuracy and macro-averaged F1 for ``y_pred`` against ``y_true``.

    ``model_name`` is only used as the prefix of the printed line.
    """
    acc, macro_f1 = (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    )
    print(f"{model_name} - Accuracy: {acc:.4f}, F1-score: {macro_f1:.4f}")

def load_and_evaluate_tabnet(X_train, y_train, X_test, y_test, variant_name):
    """Fit the TabNet model configured for ``variant_name`` and evaluate it on the test data.

    Reads its settings from ``tabnet_hyperparameters[variant_name]``, passes the
    test data to ``fit`` as the evaluation set, then draws the SHAP summary,
    prints the metrics via ``evaluate``, shows a confusion matrix and prints a
    classification report.
    """
    cfg = tabnet_hyperparameters[variant_name]

    # Constructor arguments, collected first and unpacked in one go
    model_kwargs = {
        'n_d': cfg['n_da'],
        'n_a': cfg['n_da'],
        'n_steps': cfg['n_steps'],
        'gamma': cfg['gamma'],
        'lambda_sparse': cfg['lambda_sparse'],
        'optimizer_fn': torch.optim.Adam,
        'optimizer_params': {'lr': 2e-2, 'weight_decay': 1e-5},
        'mask_type': cfg['mask_type'],
        'n_shared': cfg['n_shared'],
        'scheduler_params': {
            'patience': cfg['patienceScheduler'],
            'min_lr': 1e-5,
            'factor': 0.5,
        },
        'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
        'verbose': 0,
        'seed': SEED,
    }
    tabnet_model = TabNetClassifier(**model_kwargs)

    fit_kwargs = {
        'eval_set': [(X_test, y_test)],
        'max_epochs': cfg['max_epochs'],
        'patience': cfg['patience'],
        'batch_size': 64,
    }
    tabnet_model.fit(X_train, y_train, **fit_kwargs)

    y_pred_tabnet = tabnet_model.predict(X_test)

    # SHAP summary for the fitted model
    generate_shap_plot_tabnet(tabnet_model, X_test)

    # Accuracy and macro F1
    evaluate(y_test, y_pred_tabnet, f"Tuned TabNet for {variant_name}")

    # Confusion matrix figure
    matrix_display = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred_tabnet))
    matrix_display.plot(cmap='Blues')
    plt.title(f"Confusion Matrix: Tuned TabNet for {variant_name}")
    plt.show()

    # Per-class report
    report = classification_report(y_test, y_pred_tabnet, target_names=["class_1", "class_2", "class_3"])
    print(report)

# Run TabNet once for every training variant
for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:  # test entries are fetched below, not iterated
        continue

    print(f"\nRunning TabNet on dataset variant: {variant_name}")

    # Pick the test set that belongs to this training variant
    test_key = {
        'trainmet_odds_met_balancing': 'testmet_odds_met_balancing',
        'trainmet_odds_zonder_balancing': 'testmet_odds_zonder_balancing',
        'trainzonder_odds_met_balancing': 'testzonder_odds_met_balancing',
        'trainzonder_odds_zonder_balancing': 'testzonder_odds_zonder_balancing',
    }.get(variant_name)
    if test_key is None:
        print(f"Unknown variant name {variant_name}, skipping.")
        continue

    X_test = datasets[test_key][0]
    y_test = datasets[test_key][1]

    # Hand plain arrays to TabNet
    X_train, y_train = X_train.values, y_train.values
    X_test, y_test = X_test.values, y_test.values

    # Train the model and generate the SHAP plot
    load_and_evaluate_tabnet(X_train, y_train, X_test, y_test, variant_name)



In [None]:
import shap
import matplotlib.pyplot as plt
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd

# Seed numpy and torch (CPU and, when present, CUDA) for repeatable runs
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def generate_shap_plot_tabnet(model, X_test, feature_names, variant_name):
    """Draw, save and show a SHAP summary plot for ``model.predict`` on ``X_test``.

    Args:
        model: fitted model exposing ``predict``.
        X_test: data handed to ``shap.Explainer`` and the rows to explain.
        feature_names: labels for the columns of ``X_test``.
        variant_name: dataset variant, used in the title and the file name.

    Side effects:
        Writes ``SHAP_TabNet_Summary_<variant_name>.png`` to the working
        directory and shows the figure.
    """
    explainer = shap.Explainer(model.predict, X_test)
    shap_values = explainer(X_test)

    # show=False keeps the figure open so the title and the file export apply
    # to the SHAP plot itself, not to a new empty figure.
    shap.summary_plot(shap_values, X_test, feature_names=feature_names, show=False)
    plt.title(f"SHAP Summary for {variant_name} (TabNet)", fontsize=14)
    # Save before show(): once the figure has been shown, savefig writes an empty canvas
    plt.savefig(f"SHAP_TabNet_Summary_{variant_name}.png", bbox_inches='tight')
    plt.show()
    plt.close()

def train_and_evaluate_tabnet(X_train, y_train, X_test, y_test, variant_name, feature_names):
    """Train a TabNet classifier for one variant, plot SHAP values and report test metrics.

    Args:
        X_train, y_train: training features and labels (pandas objects or arrays).
        X_test, y_test: test features and labels; also passed to ``fit`` as the
            evaluation set.
        variant_name: key into ``tabnet_hyperparameters``; also used in titles.
        feature_names: column labels forwarded to the SHAP plot.

    Side effects:
        Produces the SHAP plot, prints accuracy and a classification report,
        and shows a confusion matrix.
    """
    params = tabnet_hyperparameters[variant_name]

    # pytorch-tabnet rejects pandas objects, so hand it plain numpy arrays
    X_train, y_train = np.asarray(X_train), np.asarray(y_train)
    X_test, y_test = np.asarray(X_test), np.asarray(y_test)

    # The stored dict cannot be splatted into the constructor: 'n_da' and
    # 'patienceScheduler' are not TabNetClassifier arguments, and 'max_epochs'
    # / 'patience' belong to fit(). Map the keys explicitly, using the same
    # model set-up as the other TabNet cells of this notebook.
    tabnet_model = TabNetClassifier(
        n_d=params['n_da'],
        n_a=params['n_da'],
        n_steps=params['n_steps'],
        gamma=params['gamma'],
        lambda_sparse=params['lambda_sparse'],
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type=params['mask_type'],
        n_shared=params['n_shared'],
        scheduler_params=dict(
            patience=params['patienceScheduler'],
            min_lr=1e-5,
            factor=0.5,
        ),
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        seed=SEED,
    )

    tabnet_model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        max_epochs=params['max_epochs'],
        patience=params['patience'],
    )

    # SHAP plot for the fitted model
    generate_shap_plot_tabnet(tabnet_model, X_test, feature_names, variant_name)

    y_pred_tabnet = tabnet_model.predict(X_test)

    # Metrics
    print(f"Results for {variant_name} (TabNet):")
    print(f"Accuracy: {accuracy_score(y_test, y_pred_tabnet)}")
    print(classification_report(y_test, y_pred_tabnet))

    # Confusion matrix
    cm_tabnet = confusion_matrix(y_test, y_pred_tabnet)
    disp_tabnet = ConfusionMatrixDisplay(confusion_matrix=cm_tabnet)
    disp_tabnet.plot(cmap='Blues')
    plt.title(f"Confusion Matrix for {variant_name} (TabNet)")
    plt.show()

# Train and evaluate one TabNet model per training variant
for variant_name, (X_train, y_train) in datasets.items():
    if 'test' in variant_name:  # test splits are looked up below, not iterated
        continue

    print(f"\nRunning TabNet on dataset variant: {variant_name}")

    # Without this guard an unrecognised variant would reuse the X_test /
    # y_test of the previous iteration and then fail on the feature lookup.
    if variant_name not in feature_names_dict:
        print(f"Unknown variant name {variant_name}, skipping.")
        continue

    # Every 'train...' key is paired with the 'test...' key of the same suffix
    X_test, y_test = datasets[variant_name.replace('train', 'test', 1)]

    # Wrap the features in DataFrames labelled with this variant's column names.
    # NOTE(review): if the inputs are already DataFrames, `columns=` selects by
    # name and fills missing names with NaN -- confirm the stored column names
    # match feature_names_dict.
    feature_names = feature_names_dict[variant_name]
    X_train = pd.DataFrame(X_train, columns=feature_names)
    X_test = pd.DataFrame(X_test, columns=feature_names)

    # Train and evaluate the TabNet model
    train_and_evaluate_tabnet(X_train, y_train, X_test, y_test, variant_name, feature_names)

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import zscore

def remove_outliers_zscore(df, threshold=3):
    """Return the rows of ``df`` whose numeric values all lie within ``threshold`` standard deviations.

    Z-scores are computed per numeric column with the population standard
    deviation (``ddof=0``, the same convention as ``scipy.stats.zscore``).
    Non-numeric columns are ignored for the test but kept in the result.

    A z-score that cannot be computed (a missing value, or a constant column
    whose standard deviation is 0) is treated as "not an outlier". Previously a
    single NaN or a constant column turned every z-score of that column into
    NaN and all rows were dropped.

    Args:
        df: input DataFrame.
        threshold: rows with any ``|z| >= threshold`` are removed.

    Returns:
        A filtered DataFrame with the original columns and index labels.
    """
    numeric = df.select_dtypes(include=[np.number])

    # mean() and std() skip NaN by default, so one missing value no longer
    # poisons the statistics of its whole column
    z_scores = ((numeric - numeric.mean()) / numeric.std(ddof=0)).abs()

    # NaN >= threshold is False, so undefined z-scores never flag a row
    is_outlier = (z_scores >= threshold).any(axis=1)
    return df[~is_outlier]

# Remove the outlier rows from train_data using a z-score threshold of 3
final_training_no_outliers = remove_outliers_zscore(train_data, threshold=3)

# A single print with newline separators emits the same three outputs:
# the heading, the filtered frame and its shape
print(
    "Data zonder uitbijters:",
    final_training_no_outliers,
    final_training_no_outliers.shape,
    sep="\n",
)