In [118]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/elo-scores/ELO_Ratings/ManCity.csv
/kaggle/input/elo-scores/ELO_Ratings/Liverpool.csv
/kaggle/input/elo-scores/ELO_Ratings/werder.csv
/kaggle/input/elo-scores/ELO_Ratings/Milan.csv
/kaggle/input/elo-scores/ELO_Ratings/Olympiacos.csv
/kaggle/input/elo-scores/ELO_Ratings/Marseille.csv
/kaggle/input/elo-scores/ELO_Ratings/Panathinaikos.csv
/kaggle/input/elo-scores/ELO_Ratings/Lyon.csv
/kaggle/input/elo-scores/ELO_Ratings/Dortmund.csv
/kaggle/input/elo-scores/ELO_Ratings/Inter.csv
/kaggle/input/elo-scores/ELO_Ratings/Benfica.csv
/kaggle/input/elo-scores/ELO_Ratings/Wolfsburg.csv
/kaggle/input/elo-scores/ELO_Ratings/Lille.csv
/kaggle/input/elo-scores/ELO_Ratings/Celtic.csv
/kaggle/input/elo-scores/ELO_Ratings/Napoli.csv
/kaggle/input/elo-scores/ELO_Ratings/Monaco.csv
/kaggle/input/elo-scores/ELO_Ratings/Basel.csv
/kaggle/input/elo-scores/ELO_Ratings/ParisSG.csv
/kaggle/input/elo-scores/ELO_Ratings/Bayern.csv
/kaggle/input/elo-scores/ELO_Ratings/PSV.csv
/kaggle/input/elo-scores

In [119]:
import optuna
from sklearn.model_selection import cross_val_score, StratifiedKFold


In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime

# Load training data
# NOTE(review): the file name is spelled "ouput" and the path is rooted at "/" — confirm it matches the real file.
df = pd.read_csv("/best_ouput_training_data.csv")

# Label-encode the team names (team1 and team2 share one encoder so a club gets the same code on either side)
team_encoder = LabelEncoder()

# Fit on all unique values in both columns
all_teams = pd.concat([df["team1"], df["team2"]]).unique()
team_encoder.fit(all_teams)
df['team1'] = team_encoder.transform(df['team1'])
df['team2'] = team_encoder.transform(df['team2'])
# Highest code handed out by the encoder; predict_winner increments it for clubs the encoder has not seen
last_index = len(team_encoder.classes_)-1

# Parse match dates as day-first
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
# df['year'] = df['date'].dt.year
# df['month'] = df['date'].dt.month
# df['day'] = df['date'].dt.day
# Replace the two raw Elo columns with their ratio, then drop the raw/unused columns
df['elo_ratio'] = df['elo_team1']/df['elo_team2']
df.drop(columns=['elo_team1','elo_team2','team1_home','result','elo_diff'],inplace=True)

# Features are every remaining column except the target (team1_wins) and match_no
X = df.drop(columns=['team1_wins', 'match_no'])
y = df['team1_wins']

# Preprocessing pipeline: standard-scale the numeric columns
# NOTE(review): only the listed numeric columns are passed through this transformer; with
# ColumnTransformer's default remainder the non-numeric ones (e.g. the datetime 'date') are
# presumably dropped — confirm that is intended.
numeric_features = X.select_dtypes(include=[np.number]).columns
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features)
])

# XGBoost classifier; the variable and pipeline step are named "regressor", and the
# 'regressor__' parameter prefix used during tuning depends on that step name
regressor = XGBClassifier(n_estimators=1000, max_depth=4, learning_rate=0.1)
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("regressor", regressor)
])





In [97]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated negative log-loss of ``pipeline``.

    The trial proposes values for eight XGBoost hyperparameters, which are
    applied to the 'regressor' step of the module-level ``pipeline`` before it
    is scored on ``X`` and ``y``.

    Args:
        trial: Optuna trial used to suggest hyperparameter values.

    Returns:
        Mean of the per-fold ``neg_log_loss`` scores (higher is better).
    """
    # Suggestions are drawn in a fixed order; keys are the bare XGBoost names.
    suggested = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
    }

    # Route every value to the pipeline step named 'regressor'.
    step_params = {f'regressor__{name}': value for name, value in suggested.items()}
    pipeline.set_params(**step_params)

    # Shuffled, stratified 3-fold split with a fixed seed so trials are comparable.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    fold_scores = cross_val_score(
        pipeline,
        X, y,
        cv=folds,
        scoring='neg_log_loss',
        n_jobs=-1
    )
    return np.mean(fold_scores)

In [98]:
# Search for the hyperparameters that maximise the objective (negative log-loss) over 35 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=35)

# Keep the winning configuration and its score for the retraining step.
best_params, best_score = study.best_params, study.best_value

print("Best parameters:", best_params)
print("Best score (negative log_loss):", best_score)

[I 2025-06-09 14:27:44,128] A new study created in memory with name: no-name-d972ceb0-490f-4beb-8e5d-6d0ef597a212
[I 2025-06-09 14:27:45,330] Trial 0 finished with value: -0.6315959655533091 and parameters: {'n_estimators': 292, 'max_depth': 5, 'learning_rate': 0.00017581916556461827, 'subsample': 0.7382445713546564, 'colsample_bytree': 0.6561848048319536, 'gamma': 1.9620791303069702, 'reg_alpha': 0.7841643413574173, 'reg_lambda': 2.236918170956539}. Best is trial 0 with value: -0.6315959655533091.
[I 2025-06-09 14:27:46,069] Trial 1 finished with value: -0.4062428841708312 and parameters: {'n_estimators': 175, 'max_depth': 4, 'learning_rate': 0.020163264344407194, 'subsample': 0.6138443099241698, 'colsample_bytree': 0.9058695646712075, 'gamma': 2.282584596007439, 'reg_alpha': 3.7435196087175675, 'reg_lambda': 1.4675334191137095}. Best is trial 1 with value: -0.4062428841708312.
[I 2025-06-09 14:27:46,350] Trial 2 finished with value: -0.5262868166970639 and parameters: {'n_estimators'

Best parameters: {'n_estimators': 542, 'max_depth': 10, 'learning_rate': 0.06759519478597421, 'subsample': 0.9406759075167906, 'colsample_bytree': 0.8813104083201415, 'gamma': 0.15037034138784483, 'reg_alpha': 2.7506104941235288, 'reg_lambda': 3.959839882009687}
Best score (negative log_loss): -0.3147071488933835


In [99]:
# Optuna reports bare parameter names; the pipeline expects them prefixed with the step name.
best_params_corrected = {}
for param_name, param_value in best_params.items():
    best_params_corrected['regressor__' + param_name] = param_value

# Configure the shared pipeline with the best configuration found.
pipeline.set_params(**best_params_corrected)


In [121]:
# Fit the pipeline on all of X and y (defined in the training-setup cell).
pipeline.fit(X, y)

In [122]:
class WrappedModel(BaseEstimator):
    """Adapter that turns a probabilistic model into a hard 0/1 predictor.

    Attributes are limited to ``reg_model``, the wrapped estimator, which must
    provide ``predict_proba``.
    """

    def __init__(self, reg_model):
        self.reg_model = reg_model

    def predict(self, X):
        """Return 1 where the second ``predict_proba`` column exceeds 0.5, else 0."""
        second_column = self.reg_model.predict_proba(X)[:, 1]
        return (second_column > 0.5).astype(int)


# Hard-label view of the tuned pipeline, used by predict_winner.
clf_pipeline = WrappedModel(pipeline)

In [123]:
def calculate_team_stats_up_to_date(df, team, season, cutoff_date):
    """
    Aggregate a team's results over matches played strictly before a cutoff date.

    Only rows whose season lies in ``[season - 2, season]`` and whose ``Date``
    is earlier than ``cutoff_date`` are used. Team and opponent do not need
    separate handling, and home/away is not distinguished.

    The team is first looked up by exact (upper-cased) name. If that finds
    nothing, each word of the name is tried in turn as a literal,
    case-insensitive substring and the first word producing any rows wins.
    NOTE(review): that fallback can pick up other clubs sharing a word
    (e.g. "MANCHESTER") — confirm this is acceptable.

    The input frame is not modified.

    Args:
        df: Match rows with columns ``Team``, ``Date``, ``season``,
            ``Team_Score``, ``Opponent_Score`` and ``Team_Points``.
        team: Team name (any casing).
        season: Season identifier convertible to an integer.
        cutoff_date: Anything ``pd.to_datetime`` accepts.

    Returns:
        A dict of totals and per-game rates, or ``None`` when no matching rows exist.
    """
    cutoff_date = pd.to_datetime(cutoff_date)
    season = np.int64(season)

    # Normalised views of the lookup columns; the caller's frame is left untouched.
    match_dates = pd.to_datetime(df['Date'])
    team_names = df['Team'].str.strip()
    seasons = df['season'].astype(str).str.strip().astype(int)

    # Rows inside the three-season window and strictly before the cutoff.
    in_window = (match_dates < cutoff_date) & seasons.between(season - 2, season)

    team_key = team.upper()
    team_matches = df[(team_names == team_key) & in_window]
    if team_matches.empty:
        for word in team_key.split():
            # regex=False: a name containing "(", "." etc. must be matched literally.
            word_hit = team_names.str.contains(word, case=False, na=False, regex=False)
            team_matches = df[word_hit & in_window]
            if not team_matches.empty:
                break
    if team_matches.empty:
        return None

    matches_played = len(team_matches)
    goals_scored = team_matches['Team_Score'].sum()
    goals_conceded = team_matches['Opponent_Score'].sum()
    points = team_matches['Team_Points'].sum()

    goals_per_game = goals_scored / matches_played
    goals_conceded_per_game = goals_conceded / matches_played
    points_per_game = points / matches_played
    goal_difference = goals_scored - goals_conceded
    goal_diff_per_game = goal_difference / matches_played

    return {
        'matches_played': matches_played,
        'goals_scored': goals_scored,
        'goals_conceded': goals_conceded,
        'points': points,
        'goals_per_game': goals_per_game,
        'goals_conceded_per_game': goals_conceded_per_game,
        'points_per_game': points_per_game,
        'goal_difference': goal_difference,
        'goal_diff_per_game': goal_diff_per_game,
        # Attack/defense strength are the per-game scoring and conceding rates.
        'attack_strength': goals_per_game,
        'defense_strength': goals_conceded_per_game
    }

In [124]:
def _floored_ratio(numerator, denominator, floor=0.1):
    """Return numerator / denominator with the denominator clamped to at least ``floor``."""
    return numerator / max(denominator, floor)


def _defense_ratio(own_conceded, opponent_conceded, floor=0.1):
    """Return opponent/own conceding rate, both clamped to at least ``floor``."""
    return max(opponent_conceded, floor) / max(own_conceded, floor)


def _placeholder_stats():
    """Stats substituted for a team that has no matches in one competition."""
    return {'attack_strength': 0, 'defense_strength': 1, 'points_per_game': 0,
            'goal_diff_per_game': 0, 'matches_played': 0}


def create_match_features(team1, team2, season, match_date, original_match_data):
    """Build the relative-strength features for ``team1`` versus ``team2``.

    Stats are computed separately from Champions League rows
    (``Competition == 'uefa-champions-league'``) and from all other rows, using
    ``calculate_team_stats_up_to_date`` with ``match_date`` as the cutoff.

    * If neither team has Champions League stats, the league stats drive the
      combined features, the ``ucl_*`` ratios are fixed at 1.0 and
      ``ucl_experience_diff`` is 0. ``None`` is returned when either team also
      lacks league stats.
    * Otherwise missing stats are replaced by placeholders and the combined
      attack/defense figures weight Champions League 0.8 and league 0.2
      (0.9 / 0.1 for form).

    Every ratio denominator is floored at 0.1. Defense ratios are inverted
    (opponent over own) so that a larger value favours ``team1``.

    Args:
        team1, team2: Team names; lower-cased before lookup.
        season: Season passed through to the stats helper.
        match_date: Cutoff date passed through to the stats helper.
        original_match_data: Historical matches with a ``Competition`` column.

    Returns:
        Dict of the twelve feature values, or ``None`` if they cannot be computed.
    """
    team1 = team1.lower()
    team2 = team2.lower()
    ucl_matches = original_match_data[original_match_data['Competition'] == 'uefa-champions-league'].copy()
    league_matches = original_match_data[original_match_data['Competition'] != 'uefa-champions-league'].copy()

    team1_ucl = calculate_team_stats_up_to_date(ucl_matches, team1, season, match_date)
    team2_ucl = calculate_team_stats_up_to_date(ucl_matches, team2, season, match_date)
    team1_league = calculate_team_stats_up_to_date(league_matches, team1, season, match_date)
    team2_league = calculate_team_stats_up_to_date(league_matches, team2, season, match_date)

    league_only = not team1_ucl and not team2_ucl
    if league_only:
        if not team1_league or not team2_league:
            return None
    else:
        team1_ucl = team1_ucl or _placeholder_stats()
        team2_ucl = team2_ucl or _placeholder_stats()
        team1_league = team1_league or _placeholder_stats()
        team2_league = team2_league or _placeholder_stats()

    # League-only comparisons are identical in both branches.
    league_attack = _floored_ratio(team1_league['attack_strength'], team2_league['attack_strength'])
    league_defense = _defense_ratio(team1_league['defense_strength'], team2_league['defense_strength'])
    league_form = _floored_ratio(team1_league['points_per_game'], team2_league['points_per_game'])
    league_goal_diff = team1_league['goal_diff_per_game'] - team2_league['goal_diff_per_game']

    if league_only:
        return {
            'attack_strength_ratio': league_attack,
            'ucl_attack_ratio': 1.0,  # No UCL data
            'league_attack_ratio': league_attack,

            'defense_strength_ratio': league_defense,
            'ucl_defense_ratio': 1.0,  # No UCL data
            'league_defense_ratio': league_defense,

            'overall_form_ratio': league_form,
            'ucl_form_ratio': 1.0,  # No UCL data
            'league_form_ratio': league_form,

            'goal_diff_ratio': league_goal_diff,
            'league_goal_diff_ratio': league_goal_diff,
            'ucl_experience_diff': 0
        }

    # Weighted blend of Champions League and league figures.
    team1_total_attack = 0.8 * team1_ucl['attack_strength'] + 0.2 * team1_league['attack_strength']
    team2_total_attack = 0.8 * team2_ucl['attack_strength'] + 0.2 * team2_league['attack_strength']

    team1_total_defense = 0.8 * team1_ucl['defense_strength'] + 0.2 * team1_league['defense_strength']
    team2_total_defense = 0.8 * team2_ucl['defense_strength'] + 0.2 * team2_league['defense_strength']

    team1_overall_form = 0.9 * team1_ucl['points_per_game'] + 0.1 * team1_league['points_per_game']
    team2_overall_form = 0.9 * team2_ucl['points_per_game'] + 0.1 * team2_league['points_per_game']

    return {
        # Attack ratios
        'attack_strength_ratio': _floored_ratio(team1_total_attack, team2_total_attack),
        'ucl_attack_ratio': _floored_ratio(team1_ucl['attack_strength'], team2_ucl['attack_strength']),
        'league_attack_ratio': league_attack,

        # Defense ratios (conceding less is better, so the ratio is flipped)
        'defense_strength_ratio': _defense_ratio(team1_total_defense, team2_total_defense),
        'ucl_defense_ratio': _defense_ratio(team1_ucl['defense_strength'], team2_ucl['defense_strength']),
        'league_defense_ratio': league_defense,

        # Form ratios
        'overall_form_ratio': _floored_ratio(team1_overall_form, team2_overall_form),
        'ucl_form_ratio': _floored_ratio(team1_ucl['points_per_game'], team2_ucl['points_per_game']),
        'league_form_ratio': league_form,

        # Goal-difference features are differences, not ratios, despite the key names
        'goal_diff_ratio': team1_ucl['goal_diff_per_game'] - team2_ucl['goal_diff_per_game'],
        'league_goal_diff_ratio': league_goal_diff,

        # Experience features
        'ucl_experience_diff': team1_ucl['matches_played'] - team2_ucl['matches_played']
    }



In [125]:
def add_match_features(fd, original_df):
    """Attach engineered match features and the Elo ratio to each fixture row.

    For every row of ``fd`` the twelve features returned by
    ``create_match_features`` are written into same-named columns, and
    ``elo_ratio`` is set to team1's Elo divided by team2's Elo as returned by
    ``get_elo``. Rows whose features cannot be computed keep NaN and a warning
    is printed; ``elo_ratio`` is NaN when either rating is missing.

    Both arguments are modified in place: ``fd`` gains/overwrites the feature
    columns and has ``date`` parsed, and ``original_df['Date']`` is parsed as
    day-first dates. Any ``elo_team1`` / ``elo_team2`` columns are removed.

    Args:
        fd: Fixtures with ``team1``, ``team2``, ``season`` and ``date`` columns.
        original_df: Historical match data handed to ``create_match_features``.

    Returns:
        ``fd`` itself, with the feature columns filled in.
    """
    feature_columns = ['attack_strength_ratio', 'ucl_attack_ratio', 'league_attack_ratio',
                       'defense_strength_ratio', 'ucl_defense_ratio', 'league_defense_ratio',
                       'overall_form_ratio', 'ucl_form_ratio', 'league_form_ratio',
                       'goal_diff_ratio', 'league_goal_diff_ratio', 'ucl_experience_diff']

    # Start as float NaN (not None) so the columns stay numeric instead of object dtype.
    for col in feature_columns:
        fd[col] = np.nan

    fd['date'] = pd.to_datetime(fd['date'])
    original_df['Date'] = pd.to_datetime(original_df['Date'], dayfirst=True)

    for idx, row in fd.iterrows():
        features = create_match_features(
            row['team1'],
            row['team2'],
            row['season'],
            row['date'],  # cutoff: only earlier matches feed the stats
            original_df
        )

        if features:
            for feature_name, feature_value in features.items():
                fd.at[idx, feature_name] = feature_value
        else:
            print(f"Warning: Could not calculate features for {row['team1']} vs {row['team2']} in {row['season']}")

    def _elo_series(team_column):
        # get_elo may return None; coerce so missing ratings become NaN in a float Series.
        ratings = [get_elo(team, date) for team, date in zip(fd[team_column], fd['date'])]
        return pd.to_numeric(pd.Series(ratings, index=fd.index, dtype=object), errors='coerce')

    # NaN propagates through the division, so a missing rating yields a NaN ratio.
    fd['elo_ratio'] = _elo_series('team1') / _elo_series('team2')

    # Raw Elo columns are never part of the result, even if the caller supplied them.
    fd.drop(columns=['elo_team1', 'elo_team2'], inplace=True, errors='ignore')

    return fd

In [76]:
# Maps a lower-cased team name to its ClubElo identifier (adjust as needed, check on clubelo.com).
# get_elo lower-cases the identifier again before looking it up in elo_dict, so the
# casing of the values here does not affect that lookup.
elo_url_map = {
    'ajax': 'Ajax',
    'apoel': 'APOEL',
    'arsenal': 'Arsenal',
    'atletico madrid': 'Atletico',
    'barcelona': 'Barcelona',
    'basel': 'Basel',
    'bayer leverkusen': 'Leverkusen',
    'bayern munich': 'Bayern',
    'benfica': 'Benfica',
    'bordeaux': 'Bordeaux',
    'borussia dortmund': 'Dortmund',
    'celtic': 'Celtic',
    'chelsea': 'Chelsea',
    'copenhagen': 'Copenhagen',
    'cska moscow': 'CSKAMoskva',
    'dynamo kyiv': 'DynamoKyiv',
    'fenerbahçe': 'Fenerbahce',  # Note: the key keeps the cedilla; the ClubElo identifier has no accents
    'fiorentina': 'Fiorentina',
    'galatasaray': 'Galatasaray',
    'gent': 'Gent',
    'inter': 'Inter',
    'juventus': 'Juventus',
    'leicester city': 'Leicester',
    'lille': 'Lille',
    'liverpool': 'Liverpool',
    'lyon': 'Lyon',
    'malaga': 'Malaga',
    'manchester city': 'ManCity',
    'manchester united': 'ManUnited',
    'marseille': 'Marseille',
    'milan': 'Milan',
    'monaco': 'Monaco',
    'napoli': 'Napoli',
    'olympiacos': 'Olympiacos',
    'panathinaikos': 'Panathinaikos',
    'paris saint-germain': 'ParisSG',
    'porto': 'Porto',
    'psv eindhoven': 'PSV',
    'rangers': 'Rangers',
    'real madrid': 'RealMadrid',
    'roma': 'Roma',
    'schalke 04': 'Schalke',
    'sevilla': 'Sevilla',
    'shakhtar donetsk': 'Shakhtar',
    'sporting cp': 'sporting',
    'stuttgart': 'Stuttgart',
    'tottenham hotspur': 'Tottenham',
    'valencia': 'Valencia',
    'villarreal': 'Villarreal',
    'werder bremen': 'werder',
    'wolfsburg': 'Wolfsburg',
    'zenit saint petersburg': 'Zenit'
}

In [77]:
import glob
# Elo history per club, keyed by the lower-cased CSV file stem (e.g. "ManCity.csv" -> "mancity").
elo_dict = {}
elo_dir = '/kaggle/input/elo-scores/ELO_Ratings'

# Sorted so the load order (and which file wins if two stems collide) is deterministic.
for filepath in sorted(glob.glob(f"{elo_dir}/*.csv")):
    # Strip the directory and the extension with os.path rather than manual string surgery.
    team = os.path.splitext(os.path.basename(filepath))[0].lower()
    elo_df = pd.read_csv(filepath)
    # Parse the validity interval of each rating so get_elo can compare it with match dates.
    elo_df['From'] = pd.to_datetime(elo_df['From'])
    elo_df['To'] = pd.to_datetime(elo_df['To'])
    elo_dict[team] = elo_df

In [78]:
def get_elo(team, date):
    """Look up a club's Elo rating on a given date.

    The team name is lower-cased and translated through ``elo_url_map`` to the
    key of its ratings table in ``elo_dict``. The rating of the first interval
    with ``From <= date <= To`` is returned; if no interval covers the date,
    the last row whose ``From`` is on or before the date is used.
    NOTE(review): that fallback assumes the table is ordered by ``From`` — confirm.

    Args:
        team: Team name in any casing.
        date: Date comparable with the table's ``From`` / ``To`` columns.

    Returns:
        The ``Elo`` value, or ``None`` when the team is not mapped, its ratings
        table was not loaded, or no row starts on or before ``date``.
    """
    club_key = elo_url_map.get(team.lower())
    if club_key is None:
        return None

    # A club can be mapped without its CSV having been loaded; treat that as "no rating".
    elo_df = elo_dict.get(club_key.lower())
    if elo_df is None:
        return None

    covering = elo_df[(elo_df['From'] <= date) & (elo_df['To'] >= date)]
    if not covering.empty:
        return covering.iloc[0]['Elo']

    # No interval contains the date: fall back to the latest one that started before it.
    earlier = elo_df[elo_df['From'] <= date]
    if earlier.empty:
        return None
    return earlier.iloc[-1]['Elo']

In [126]:
# Historical match data; create_ucl_testing_dataframe passes a copy of it to add_match_features.
dfData = pd.read_csv("/kaggle/input/european-soccer-data/Full_Dataset.csv")

In [127]:
def create_ucl_testing_dataframe(fd):
    """Return ``fd`` enriched by ``add_match_features`` using a fresh copy of ``dfData``."""
    historical_matches = dfData.copy()
    return add_match_features(fd, historical_matches)

In [128]:
# Alternative spellings (lower-case) mapped to the club name used for feature and Elo lookups.
_TEAM_ALIASES = {'internazionale': 'inter'}

# Codes already handed out to clubs the encoder never saw, so each club keeps a single code.
_UNSEEN_TEAM_CODES = {}


def _canonical_team_name(name):
    """Return the alias target for ``name`` (matched case-insensitively) or ``name`` unchanged."""
    return _TEAM_ALIASES.get(name.lower(), name)


def _encode_team(name):
    """Return the integer code for a club, allocating a stable new code if it is unseen.

    The lower-cased name is tried first and its alias second. A club unknown to
    ``team_encoder`` gets ``last_index + 1`` the first time it appears (bumping
    the module-level ``last_index``) and the same code on every later call.
    """
    global last_index
    lowered = name.lower()
    for candidate in (lowered, _canonical_team_name(lowered)):
        if candidate in team_encoder.classes_:
            return team_encoder.transform([candidate])[0]
    if lowered not in _UNSEEN_TEAM_CODES:
        last_index += 1
        _UNSEEN_TEAM_CODES[lowered] = last_index
    return _UNSEEN_TEAM_CODES[lowered]


def predict_winner(match, season, _round):
    """Predict which side of a fixture advances.

    Builds a one-row frame with the training columns (``X.columns``), fills the
    engineered features via ``create_ucl_testing_dataframe``, replaces the team
    names with their integer codes and asks ``clf_pipeline`` for a 0/1 label.

    Args:
        match: Dict with ``team_1``, ``team_2`` and ``date``.
        season: Season value stored in the ``season`` column.
        _round: Round label stored in the ``round`` column.

    Returns:
        ``match['team_1']`` when the predicted label is 1, otherwise ``match['team_2']``.
    """
    row = {
        "season": season,
        "round": _round,
        # Aliases are resolved case-insensitively so e.g. "Internazionale" is looked up as "inter".
        "team1": _canonical_team_name(match["team_1"]),
        "team2": _canonical_team_name(match["team_2"]),
        "date": match["date"]
    }

    # Any training column not supplied above starts at 0 and is filled in later where applicable.
    for col in X.columns:
        row.setdefault(col, 0)
    row_df = pd.DataFrame([row])[X.columns].copy()

    row_df['date'] = match['date']
    if not row_df['date'].isna().all():
        try:
            row_df['date'] = pd.to_datetime(row_df['date'], dayfirst=False)
        except (ValueError, TypeError) as exc:
            # Best effort: keep the raw value and let the feature step parse it.
            print(f"Warning: could not parse match date {match['date']!r}: {exc}")
    row_df = create_ucl_testing_dataframe(row_df)

    # Feature lookup needed the names; the model needs the encoder's integer codes.
    row_df['team1'] = _encode_team(match["team_1"])
    row_df['team2'] = _encode_team(match["team_2"])

    result = clf_pipeline.predict(row_df)[0]
    return match['team_1'] if result == 1 else match['team_2']

In [129]:
def process_round_16(matches, season):
    """Predict every round-of-16 fixture.

    Each match dict gains a ``winner`` entry. The returned dict maps
    "Winner of <team_1> vs <team_2>" to the predicted winner.
    """
    winners = {}
    for fixture in matches:
        home, away = fixture["team_1"], fixture["team_2"]
        advancing = predict_winner(fixture, season, "Round Of 16")
        winners[f"Winner of {home} vs {away}"] = advancing
        fixture["winner"] = advancing
    return winners

In [130]:
def process_round_qf(matches, season):
    """Predict every quarter-final fixture.

    Each match dict gains a ``winner`` entry. The returned dict maps
    "Winner of QF<n>" (n counted from 1 in list order) to the predicted winner.
    """
    winners = {}
    for count, match in enumerate(matches, start=1):
        winner = predict_winner(match, season, "Quarter Finals")
        winners[f"Winner of QF{count}"] = winner
        match["winner"] = winner
    return winners

In [131]:
def process_round_sf(matches, season):
    """Predict every semi-final fixture.

    Each match dict gains a ``winner`` entry. The returned dict maps
    "Winner of SF<n>" (n counted from 1 in list order) to the predicted winner.
    """
    winners = {}
    for count, match in enumerate(matches, start=1):
        winner = predict_winner(match, season, "Semi Finals")
        winners[f"Winner of SF{count}"] = winner
        match["winner"] = winner
    return winners

In [132]:
def resolve_matches(matches, winners):
    """Replace placeholder team slots with actual winners, in place.

    For each match, a ``team_1`` / ``team_2`` value that is a key of
    ``winners`` is overwritten with the corresponding winner; other values
    are left alone. Nothing is returned.
    """
    slots = ("team_1", "team_2")
    for fixture in matches:
        for slot in slots:
            placeholder = fixture[slot]
            if placeholder in winners:
                fixture[slot] = winners[placeholder]

In [133]:
import json
import pandas as pd
# The bracket file is JSON, so decode it explicitly as UTF-8 instead of relying on the platform default.
with open('/kaggle/input/ucl-datathon/test_matchups (1).json', encoding='utf-8') as file:
    data = json.load(file)

def run_tournament(json_data, output_file="/kaggle/working/exported_data.json"):
    """Simulate every season's knockout bracket and export the filled-in bracket.

    For each season the round of 16, quarter-finals, semi-finals and final are
    predicted in order; after each round the placeholder slots of the next
    round are replaced by the predicted winners. ``json_data`` is updated in
    place (team slots resolved, ``winner`` added to each match) and then
    written to ``output_file`` as indented JSON.

    Args:
        json_data: Dict keyed by season label (first four characters are the
            starting year); each value holds ``round_of_16_matchups``,
            ``quarter_finals_matchups``, ``semi_finals_matchups`` and
            ``final_matchup``.
        output_file: Destination path of the exported JSON.
    """
    for season, rounds in json_data.items():
        print(f"Processing season {season}")
        season_year = int(season[:4])

        winners = process_round_16(rounds['round_of_16_matchups'], season_year)
        print(winners)
        resolve_matches(rounds['quarter_finals_matchups'], winners)

        winners = process_round_qf(rounds['quarter_finals_matchups'], season_year)
        print(winners)
        resolve_matches(rounds['semi_finals_matchups'], winners)

        winners = process_round_sf(rounds['semi_finals_matchups'], season_year)
        print(winners)

        # The final is a single match dict, so wrap it to reuse the shared resolver.
        final = rounds['final_matchup']
        resolve_matches([final], winners)
        winner = predict_winner(final, season_year, "Final")
        print(f"Final winner for {season}: {winner}\n")
        final["winner"] = winner

    # Export the completed bracket.
    with open(output_file, "w") as f:
        json.dump(json_data, f, indent=4)

In [134]:
run_tournament(data)

Processing season 2017-18
{'Winner of Juventus vs Tottenham Hotspur': 'Juventus', 'Winner of Basel vs Manchester City': 'Manchester City', 'Winner of Porto vs Liverpool': 'Porto', 'Winner of Sevilla vs Manchester United': 'Manchester United', 'Winner of Real Madrid vs Paris Saint-Germain': 'Real Madrid', 'Winner of Shakhtar Donetsk vs Roma': 'Roma', 'Winner of Chelsea vs Barcelona': 'Barcelona', 'Winner of Bayern Munich vs Besiktas': 'Bayern Munich'}
{'Winner of QF1': 'Real Madrid', 'Winner of QF2': 'Manchester City', 'Winner of QF3': 'Manchester United', 'Winner of QF4': 'Barcelona'}
{'Winner of SF1': 'Barcelona', 'Winner of SF2': 'Manchester United'}
Final winner for 2017-18: Manchester United

Processing season 2018-19
{'Winner of Roma vs Porto': 'Porto', 'Winner of Manchester United vs Paris Saint-Germain': 'Manchester United', 'Winner of Tottenham Hotspur vs Borussia Dortmund': 'Borussia Dortmund', 'Winner of Ajax vs Real Madrid': 'Real Madrid', 'Winner of Lyon vs Barcelona': 'Bar

In [135]:
# Load your JSON data
# Re-read the bracket that run_tournament exported.
with open('/kaggle/working/exported_data.json', 'r') as f:
    data = json.load(f)

# One record per season; the per-round matchups are serialised into a single JSON string.
submission = []

for season, rounds in data.items():
    predictions = dict(
        round_of_16=rounds['round_of_16_matchups'],
        quarter_finals=rounds['quarter_finals_matchups'],
        semi_finals=rounds['semi_finals_matchups'],
        final=[rounds['final_matchup']],  # wrapped so every round is a list of matches
    )
    start_year = int(season.split('-')[0])
    record = {
        'id': start_year - 2017,  # seasons are numbered from 2017 (or use any unique id strategy)
        'season': season,
        'predictions': json.dumps(predictions),
    }
    submission.append(record)



In [136]:
# One row per season with the columns id, season and predictions (a JSON string).
submission_df_optuna = pd.DataFrame(submission)
submission_df_optuna.head()

Unnamed: 0,id,season,predictions
0,0,2017-18,"{""round_of_16"": [{""team_1"": ""Juventus"", ""team_..."
1,1,2018-19,"{""round_of_16"": [{""team_1"": ""Roma"", ""team_2"": ..."
2,2,2019-20,"{""round_of_16"": [{""team_1"": ""Borussia Dortmund..."
3,3,2020-21,"{""round_of_16"": [{""team_1"": ""RB Leipzig"", ""tea..."
4,4,2021-22,"{""round_of_16"": [{""team_1"": ""Paris Saint-Germa..."


In [137]:
def remove_date_from_predictions(row):
    """Drop the ``date`` key from every match inside ``row['predictions']``.

    ``row['predictions']`` is a JSON string mapping round names to lists of
    match dicts. It is decoded, cleaned and re-encoded, and the same ``row``
    object is returned with the updated string.
    """
    rounds = json.loads(row['predictions'])
    for fixtures in rounds.values():
        for fixture in fixtures:
            if 'date' in fixture:
                del fixture['date']
    row['predictions'] = json.dumps(rounds)
    return row

In [138]:
# Strip the 'date' field from every match in each row's predictions JSON.
submission_df_optuna = submission_df_optuna.apply(remove_date_from_predictions, axis=1)

In [139]:
# NOTE(review): the file name says "no_optuna" although the frame is submission_df_optuna — confirm the intended name.
# NOTE(review): no index argument is passed, so the row index is presumably written as an unnamed first column — confirm the submission format expects it.
submission_df_optuna.to_csv('classifier_model_no_optuna.csv')