In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

df_events = pd.read_csv('ufc_events.csv')
df_fights = pd.read_csv('ufc_fights.csv')
df_fighters = pd.read_csv('ufc_fighters_basic_with_dob.csv')

In [2]:
def parse_dob(df, in_col="dob", out_col="dob_dt"):
    """
    Konvertiert eine Spalte mit DOB-Strings wie 'Jul 03, 1983' zu datetime.
    Ungültige Einträge -> NaT.
    """
    # vorher evtl. Whitespace säubern
    s = df[in_col].astype(str).str.strip()
    df[out_col] = pd.to_datetime(s, format="%b %d, %Y", errors="coerce")
    return df
df_fighters = parse_dob(df_fighters, in_col="dob", out_col="dob_dt")

In [3]:

df_fighters['full_name'] = (
    df_fighters['first_name'].str.strip() + ' ' + df_fighters['last_name'].str.strip())
stats = ['height', 'weight', 'reach', 'stance','dob_dt']
red_stats = df_fighters[['full_name'] + stats] \
    .rename(columns={c: f'red_{c}' for c in stats})
df_fights = df_fights.merge(
    red_stats,
    left_on='fighter_red',
    right_on='full_name',
    how='left'
).drop(columns='full_name')
blue_stats = df_fighters[['full_name'] + stats] \
    .rename(columns={c: f'blue_{c}' for c in stats})

df_fights = df_fights.merge(
    blue_stats,
    left_on='fighter_blue',
    right_on='full_name',
    how='left'
).drop(columns='full_name')
df_fights = df_fights.merge(df_events, on='event_name', how='inner')


In [4]:

# Elo-Parameter
BASE_ELO = 1000
K = 32

# Dictionary für aktuelle Elos
elos = {}
df_fights['event_date'] = pd.to_datetime(df_fights['event_date'])
# Neue Spalten anlegen
df_fights["fighter_red_elo_before"] = 0
df_fights["fighter_blue_elo_before"] = 0
df_fights["elo_diff"] = 0
df_fights["fighter_red_elo_after"] = 0
df_fights["fighter_blue_elo_after"] = 0
df_fights["fighter_red_elo_change"] = 0
df_fights["fighter_blue_elo_change"] = 0
df_fights = df_fights.sort_values('event_date',ascending=False).reset_index(drop=True)

In [5]:
def expected(elo_a, elo_b):
    return 1 / (1 + 10 ** ((elo_b - elo_a) / 400))

for idx, row in df_fights.iloc[::-1].iterrows():
    red = row['fighter_red']
    blue = row['fighter_blue']
    winner = row['winner']

    # Hole aktuelle Elo, falls neu: BASE_ELO
    elo_red = elos.get(red, BASE_ELO)
    elo_blue = elos.get(blue, BASE_ELO)

    # Speichere aktuelle Elo vor Kampf
    df_fights.at[idx, "fighter_red_elo_before"] = elo_red
    df_fights.at[idx, "fighter_blue_elo_before"] = elo_blue
    df_fights.at[idx, "elo_diff"] = elo_blue - elo_red

    # Berechne Erwartungswert
    exp_red = expected(elo_red, elo_blue)
    exp_blue = expected(elo_blue, elo_red)

    # Wer hat gewonnen?
    if winner == red:
        score_red, score_blue = 1, 0
    elif winner == blue:
        score_red, score_blue = 0, 1
    else:
        score_red, score_blue = 0.5, 0.5  # Unentschieden (falls nötig)

    # Neue Elo berechnen
    new_elo_red = round(elo_red + K * (score_red - exp_red))
    new_elo_blue = round(elo_blue + K * (score_blue - exp_blue))

    # Elo-Änderung speichern
    df_fights.at[idx, "fighter_red_elo_after"] = new_elo_red
    df_fights.at[idx, "fighter_blue_elo_after"] = new_elo_blue
    df_fights.at[idx, "fighter_red_elo_change"] = new_elo_red - elo_red
    df_fights.at[idx, "fighter_blue_elo_change"] = new_elo_blue - elo_blue

    # Update Elo
    elos[red] = new_elo_red
    elos[blue] = new_elo_blue

In [7]:
def time_to_seconds(t):
    """z.B. '1:34' → 94"""
    if pd.isnull(t):
        return np.nan
    try:
        mins, secs = map(int, str(t).split(':'))
        return mins * 60 + secs
    except Exception:
        return np.nan

df_fights['time_seconds'] = df_fights['time'].apply(time_to_seconds)
for color in ['red', 'blue']:
    # Durchschnittliche Stats vor jedem Kampf
    for stat in ['kd', 'str', 'td', 'sub']:
        df_fights[f'{stat}_{color}_avg_before'] = 0
    df_fights[f'{color}_losses_before'] = 0
    df_fights[f'{color}_avg_rounds_before'] = 0
    df_fights[f'{color}_avg_time_before'] = 0


In [8]:
df_fights.head()

Unnamed: 0,event_name,fighter_red,fighter_blue,winner,kd_red,kd_blue,str_red,str_blue,td_red,td_blue,...,red_losses_before,red_avg_rounds_before,red_avg_time_before,kd_blue_avg_before,str_blue_avg_before,td_blue_avg_before,sub_blue_avg_before,blue_losses_before,blue_avg_rounds_before,blue_avg_time_before
0,UFC Fight Night: Whittaker vs. De Ridder,Robert Whittaker,Reinier de Ridder,Robert Whittaker,,,,,View,Matchup,...,0,0,0,0,0,0,0,0,0,0
1,UFC Fight Night: Whittaker vs. De Ridder,Davey Grant,Da'Mon Blackshear,Davey Grant,,,,,View Matchup,,...,0,0,0,0,0,0,0,0,0,0
2,UFC Fight Night: Whittaker vs. De Ridder,Petr Yan,Marcus McGhee,Petr Yan,,,,,View Matchup,,...,0,0,0,0,0,0,0,0,0,0
3,UFC Fight Night: Whittaker vs. De Ridder,Martin Buday,Marcus Buchecha,Martin Buday,,,,,View Matchup,,...,0,0,0,0,0,0,0,0,0,0
4,UFC Fight Night: Whittaker vs. De Ridder,Mohammad Yahya,Steven Nguyen,Mohammad Yahya,,,,,View Matchup,,...,0,0,0,0,0,0,0,0,0,0


In [9]:
import pandas as pd

# 0) Kopie, Zeit in Sekunden & nach Datum sortieren
df = df_fights.copy()
df['time_seconds'] = df['time'].apply(time_to_seconds)
df = df.sort_values('event_date').reset_index(drop=True)


# ---- NEW: DOBs + event_date sicher in datetime umwandeln & Age berechnen ----
for c in ['event_date', 'red_dob_dt', 'blue_dob_dt']:
    df[c] = pd.to_datetime(df[c], errors='coerce')

# Alter in Jahren (float). Optional: zusätzlich als ganzzahlig abrunden.
df['red_age']  = (df['event_date'] - df['red_dob_dt']).dt.days / 365.25
df['blue_age'] = (df['event_date'] - df['blue_dob_dt']).dt.days / 365.25
# optional:
df['red_age_int']  = np.floor(df['red_age']).astype('Int64')
df['blue_age_int'] = np.floor(df['blue_age']).astype('Int64')

# — Neu: alle kd/str/td/sub und round/ time_seconds in numeric casten —
for color in ['red','blue']:
    for stat in ['kd','str','td','sub']:
        col = f'{stat}_{color}'
        df[col] = pd.to_numeric(df[col], errors='coerce')
df['round']        = pd.to_numeric(df['round'], errors='coerce')
df['time_seconds'] = pd.to_numeric(df['time_seconds'], errors='coerce')

# 1) Neue Spalten anlegen
for color in ['red','blue']:
    df[f'{color}_wins_before']       = 0
    df[f'{color}_losses_before']     = 0
    for stat in ['kd','str','td','sub']:
        df[f'{stat}_{color}_avg_before'] = 0.0
    df[f'{color}_avg_rounds_before']  = 0.0
    df[f'{color}_avg_time_before']    = 0.0

# 2) Dictionaries initialisieren
wins       = {}
losses     = {}
stat_sums  = {}
stat_counts= {}
rounds_sum   = {}
rounds_count = {}
time_sum     = {}
time_count   = {}

# 3) Iteration über alle Kämpfe
for idx, row in df.iterrows():
    red   = row['fighter_red']
    blue  = row['fighter_blue']
    win   = row['winner']
    rnd   = row['round']
    tm    = row['time_seconds']

    # Hilfsfunktion: Dicts initialisieren
    def ensure(f):
        if f not in stat_sums:
            stat_sums[f]   = {s:0.0 for s in ['kd','str','td','sub']}
            stat_counts[f] = {s:0   for s in ['kd','str','td','sub']}
            rounds_sum[f]   = 0.0
            rounds_count[f] = 0
            time_sum[f]     = 0.0
            time_count[f]   = 0

    ensure(red)
    ensure(blue)

    # 3a) wins/losses before
    df.at[idx, 'red_wins_before']    = wins.get(red, 0)
    df.at[idx, 'red_losses_before']  = losses.get(red, 0)
    df.at[idx, 'blue_wins_before']   = wins.get(blue, 0)
    df.at[idx, 'blue_losses_before'] = losses.get(blue, 0)

    # 3b) avg_before für Stats, Runden, Zeit
    for fighter, color in [(red,'red'), (blue,'blue')]:
        # kd/str/td/sub
        for stat in ['kd','str','td','sub']:
            cnt = stat_counts[fighter][stat]
            s   = stat_sums[fighter][stat]
            df.at[idx, f'{stat}_{color}_avg_before'] = (s/cnt) if cnt else 0.0
        # rounds
        rc = rounds_count[fighter]
        df.at[idx, f'{color}_avg_rounds_before'] = (rounds_sum[fighter]/rc) if rc else 0.0
        # time
        tc = time_count[fighter]
        df.at[idx, f'{color}_avg_time_before']   = (time_sum[fighter]/tc) if tc else 0.0

    # 3c) wins/losses update nach Kampf
    if win == red:
        wins[red]    = wins.get(red, 0) + 1
        losses[blue] = losses.get(blue, 0) + 1
    elif win == blue:
        wins[blue]   = wins.get(blue, 0) + 1
        losses[red]  = losses.get(red, 0) + 1

    # 3d) Stats‑Sums und Counts update
    for fighter, color in [(red,'red'), (blue,'blue')]:
        for stat in ['kd','str','td','sub']:
            val = row[f'{stat}_{color}']
            if pd.notnull(val):
                stat_sums[fighter][stat]   += val
                stat_counts[fighter][stat] += 1
        if pd.notnull(rnd):
            rounds_sum[fighter]   += rnd
            rounds_count[fighter] += 1
        if pd.notnull(tm):
            time_sum[fighter]   += tm
            time_count[fighter] += 1

# 4) Ergebnis
df_fights_enriched = df


In [15]:
#all fights where fighter_red or fighter_blue is Max Holloway
df_fights_enriched[ (df_fights_enriched['fighter_red'] == 'Max Holloway') |
                                         (df_fights_enriched['fighter_blue'] == 'Max Holloway') ][['fight_link', 'event_date', 'event_name', 'fighter_red', 'fighter_blue',"winner","red_wins_before",
                                                                                                    #"red_losses_before","blue_losses_before", "blue_wins_before","kd_red_avg_before",
                                                                                                    #"str_red_avg_before",
                                                                                                   # "str_blue_avg_before","str_red","str_blue","red_age", "blue_age", "red_age_int", "blue_age_int",
                                                                                                    "fighter_red_elo_after", "fighter_blue_elo_after","elo_diff",
                                                                                                    "fighter_red_elo_before", "fighter_blue_elo_before"]].to_csv("testdf1.csv", index=False)

In [16]:
df_fights_enriched.columns

Index(['event_name', 'fighter_red', 'fighter_blue', 'winner', 'kd_red',
       'kd_blue', 'str_red', 'str_blue', 'td_red', 'td_blue', 'sub_red',
       'sub_blue', 'weight_class', 'method', 'round', 'time', 'fight_link',
       'red_height', 'red_weight', 'red_reach', 'red_stance', 'red_dob_dt',
       'blue_height', 'blue_weight', 'blue_reach', 'blue_stance',
       'blue_dob_dt', 'event_date', 'location', 'link',
       'fighter_red_elo_before', 'fighter_blue_elo_before', 'elo_diff',
       'fighter_red_elo_after', 'fighter_blue_elo_after',
       'fighter_red_elo_change', 'fighter_blue_elo_change', 'time_seconds',
       'kd_red_avg_before', 'str_red_avg_before', 'td_red_avg_before',
       'sub_red_avg_before', 'red_losses_before', 'red_avg_rounds_before',
       'red_avg_time_before', 'kd_blue_avg_before', 'str_blue_avg_before',
       'td_blue_avg_before', 'sub_blue_avg_before', 'blue_losses_before',
       'blue_avg_rounds_before', 'blue_avg_time_before', 'red_age', 'blue_age',


In [31]:
import pandas as pd
import numpy as np
import re

# Ausgangs-DF (nicht verändern)
df_orig = df_fights_enriched.copy()

# 1) Flip-Maske für ~50% aller Zeilen (reproduzierbar)
rng = np.random.default_rng(42)
flip_mask = rng.random(len(df_orig)) < 0.5
print("Gesamtzeilen:", len(df_orig))
print("Flip-Zeilen:", flip_mask.sum())

# 2) Arbeitskopie
df_mixed = df_orig.copy()

# 3) Paare sammeln
pairs = []

# Fighter-Namen separat hinzufügen
if {'fighter_red','fighter_blue'}.issubset(df_mixed.columns):
    pairs.append(('fighter_red','fighter_blue'))

# (a) Präfix-basierte Paare: red_*  ↔ blue_*
for col in df_mixed.columns:
    if col.startswith('red_'):
        blue = 'blue_' + col[4:]
        if blue in df_mixed.columns:
            pairs.append((col, blue))

# (b) Muster _red_  ↔ _blue_  (z.B. kd_red_avg_before)
for col in df_mixed.columns:
    if '_red_' in col:
        blue = col.replace('_red_','_blue_')
        if blue in df_mixed.columns:
            pairs.append((col, blue))

# (c) Suffix _red  ↔ _blue   (falls vorhanden – selten)
for col in df_mixed.columns:
    if col.endswith('_red'):
        blue = col[:-4] + '_blue'
        if blue in df_mixed.columns:
            pairs.append((col, blue))

# Duplikate entfernen (falls gleiche Paare mehrfach erkannt)
seen = set()
unique_pairs = []
for a,b in pairs:
    key = tuple(sorted((a,b)))
    if key not in seen:
        # Wir wollen sicherstellen, dass 'a' die "red"-Variante ist
        if not a.startswith('red') and b.startswith('red'):
            a,b = b,a
        unique_pairs.append((a,b))
        seen.add(key)

print("Anzahl eindeutiger Paare:", len(unique_pairs))
print("Beispiele:", unique_pairs[:5])

# 4) Einmaliges Swapping für alle flip-Zeilen
for red_col, blue_col in unique_pairs:
    red_vals  = df_mixed.loc[flip_mask, red_col].values
    blue_vals = df_mixed.loc[flip_mask, blue_col].values
    df_mixed.loc[flip_mask, red_col]  = blue_vals
    df_mixed.loc[flip_mask, blue_col] = red_vals

# 5) Prüf-Label (optional)
df_mixed['winner_is_red'] = (df_mixed['winner'] == df_mixed['fighter_red']).astype(int)

print(f"Vorher winner==fighter_red: { (df_orig['winner']==df_orig['fighter_red']).mean():.3f}")
print(f"Nachher winner==fighter_red: { df_mixed['winner_is_red'].mean():.3f}")

# 6) Resultat
# df_mixed ist dein „balancierter“ DataFrame


Gesamtzeilen: 8327
Flip-Zeilen: 4202
Anzahl eindeutiger Paare: 23
Beispiele: [('fighter_red', 'fighter_blue'), ('red_height', 'blue_height'), ('red_weight', 'blue_weight'), ('red_reach', 'blue_reach'), ('red_stance', 'blue_stance')]
Vorher winner==fighter_red: 1.000
Nachher winner==fighter_red: 0.495


In [18]:
df_mixed.columns

Index(['event_name', 'fighter_red', 'fighter_blue', 'winner', 'kd_red',
       'kd_blue', 'str_red', 'str_blue', 'td_red', 'td_blue', 'sub_red',
       'sub_blue', 'weight_class', 'method', 'round', 'time', 'fight_link',
       'red_height', 'red_weight', 'red_reach', 'red_stance', 'red_dob_dt',
       'blue_height', 'blue_weight', 'blue_reach', 'blue_stance',
       'blue_dob_dt', 'event_date', 'location', 'link',
       'fighter_red_elo_before', 'fighter_blue_elo_before', 'elo_diff',
       'fighter_red_elo_after', 'fighter_blue_elo_after',
       'fighter_red_elo_change', 'fighter_blue_elo_change', 'time_seconds',
       'kd_red_avg_before', 'str_red_avg_before', 'td_red_avg_before',
       'sub_red_avg_before', 'red_losses_before', 'red_avg_rounds_before',
       'red_avg_time_before', 'kd_blue_avg_before', 'str_blue_avg_before',
       'td_blue_avg_before', 'sub_blue_avg_before', 'blue_losses_before',
       'blue_avg_rounds_before', 'blue_avg_time_before', 'red_age', 'blue_age',


In [32]:
#show me all columns where fighter_red or fighter_blue is Max Holloway
df_mixed[(df_mixed['fighter_red'] == 'Max Holloway') | (df_mixed['fighter_blue'] == 'Max Holloway')].head(6)


Unnamed: 0,event_name,fighter_red,fighter_blue,winner,kd_red,kd_blue,str_red,str_blue,td_red,td_blue,...,blue_losses_before,blue_avg_rounds_before,blue_avg_time_before,red_age,blue_age,red_age_int,blue_age_int,red_wins_before,blue_wins_before,winner_is_red
1864,UFC 143: Diaz vs Condit,Dustin Poirier,Max Holloway,Dustin Poirier,0.0,0.0,11.0,11.0,1.0,0.0,...,0,0.0,0.0,23.041752,20.169747,23,20,3,0,1
1968,The Ultimate Fighter: Live Finale,Pat Schilling,Max Holloway,Max Holloway,0.0,1.0,27.0,118.0,0.0,0.0,...,1,1.0,203.0,23.759069,20.492813,23,20,0,0,0
2056,UFC 150: Henderson vs Edgar II,Max Holloway,Justin Lawrence,Max Holloway,1.0,0.0,29.0,32.0,0.0,1.0,...,0,3.0,19.0,20.687201,22.2423,20,22,1,1,1
2160,UFC 155: Dos Santos vs Velasquez II,Leonard Garcia,Max Holloway,Max Holloway,0.0,1.0,89.0,120.0,2.0,0.0,...,1,2.0,264.0,33.462012,21.0705,33,21,2,2,0
2314,UFC 160: Velasquez vs Silva 2,Max Holloway,Dennis Bermudez,Dennis Bermudez,0.0,0.0,75.0,73.0,0.0,4.0,...,1,2.0,293.5,21.472964,26.447639,21,26,3,3,0
2388,UFC Fight Night: Shogun vs Sonnen,Conor McGregor,Max Holloway,Conor McGregor,0.0,0.0,53.0,23.0,4.0,0.0,...,2,2.4,278.4,25.092402,21.702943,25,21,1,3,1


In [33]:
# -------------------------------------------------
# 0) Ausgangs-DataFrame: df_mixed (bereits mit Swaps)
# -------------------------------------------------

# --- Height & Reach Parser (VOR dem Copy) ---
def parse_height_to_inches(s):
    """
    Erwartete Formate: 6'5", 6' 5", 6'5, 5'11", etc.
    Gibt Gesamtgröße in Inches zurück oder NaN.
    """
    if pd.isna(s):
        return np.nan
    s = str(s).strip()
    m = re.match(r"^\s*(\d+)\s*'\s*(\d+)\s*", s)
    if not m:
        return np.nan
    feet = int(m.group(1))
    inch = int(m.group(2))
    return feet * 12 + inch

def parse_reach_to_inches(s):
    """
    Erwartete Formate: 75", 75, 75.5", 75.5
    Gibt Reach in Inches zurück (float) oder NaN.
    """
    if pd.isna(s):
        return np.nan
    s = str(s).strip()
    m = re.match(r"^\s*(\d+(?:\.\d+)?)", s)
    if not m:
        return np.nan
    return float(m.group(1))

# Normalisiere Fighter/Winner Strings
for c in ['fighter_red','fighter_blue','winner']:
    if c in df_mixed.columns:
        df_mixed[c] = df_mixed[c].astype(str).str.strip()

# Heights & Reaches parsen (INPLACE auf df_mixed, also vor dem Copy)
for col in ['red_height','blue_height']:
    if col in df_mixed.columns:
        df_mixed[col] = df_mixed[col].apply(parse_height_to_inches)

for col in ['red_reach','blue_reach']:
    if col in df_mixed.columns:
        df_mixed[col] = df_mixed[col].apply(parse_reach_to_inches)

# Weitere numerische Spalten in numeric casten (falls Strings)
num_like = [
    'red_weight','blue_weight',
    'fighter_red_elo_before','fighter_blue_elo_before','elo_diff',
    'kd_red_avg_before','str_red_avg_before','td_red_avg_before','sub_red_avg_before',
    'red_losses_before','red_avg_rounds_before','red_avg_time_before',
    'kd_blue_avg_before','str_blue_avg_before','td_blue_avg_before','sub_blue_avg_before',
    'blue_losses_before','blue_avg_rounds_before','blue_avg_time_before',
    'red_wins_before','blue_wins_before'
]
for col in num_like:
    if col in df_mixed.columns:
        df_mixed[col] = pd.to_numeric(df_mixed[col], errors='coerce')

In [34]:
# add column age_diff reach_diff win_diff height_diff kd_diff td_diff elo_diff str_diff sub_diff
df_mixed['age_diff'] = df_mixed['red_age'] - df_mixed['blue_age']
df_mixed['reach_diff'] = df_mixed['red_reach'] - df_mixed['blue_reach']
df_mixed['win_diff'] = df_mixed['red_wins_before'] - df_mixed['blue_wins_before']
df_mixed['height_diff'] = df_mixed['red_height'] - df_mixed['blue_height']
df_mixed['kd_diff'] = df_mixed['kd_red_avg_before'] - df_mixed['kd_blue_avg_before']
df_mixed['td_diff'] = df_mixed['td_red_avg_before'] - df_mixed['td_blue_avg_before']
#df_mixed['elo_diff'] = df_mixed['fighter_red_elo_before'] - df_mixed['fighter_blue_elo_before']
df_mixed['str_diff'] = df_mixed['str_red_avg_before'] - df_mixed['str_blue_avg_before']
df_mixed['sub_diff'] = df_mixed['sub_red_avg_before'] - df_mixed['sub_blue_avg_before']
df_mixed.head()

Unnamed: 0,event_name,fighter_red,fighter_blue,winner,kd_red,kd_blue,str_red,str_blue,td_red,td_blue,...,blue_wins_before,winner_is_red,age_diff,reach_diff,win_diff,height_diff,kd_diff,td_diff,str_diff,sub_diff
0,UFC 1: The Beginning,Gerard Gordeau,Teila Tuli,Gerard Gordeau,0.0,0.0,3.0,0.0,0.0,0.0,...,0,1,10.209446,,0,5.0,0.0,0.0,0.0,0.0
1,UFC 1: The Beginning,Gerard Gordeau,Royce Gracie,Royce Gracie,0.0,0.0,0.0,1.0,0.0,1.0,...,0,0,7.704312,,1,4.0,0.0,0.0,3.0,0.0
2,UFC 1: The Beginning,Jason DeLucia,Trent Jenkins,Jason DeLucia,0.0,0.0,3.0,1.0,1.0,0.0,...,0,1,,,0,-3.0,0.0,0.0,0.0,0.0
3,UFC 1: The Beginning,Royce Gracie,Ken Shamrock,Royce Gracie,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,-2.833676,,1,0.0,0.0,1.0,1.0,1.0
4,UFC 1: The Beginning,Zane Frazier,Kevin Rosier,Kevin Rosier,0.0,2.0,12.0,15.0,0.0,0.0,...,0,0,,,0,1.0,0.0,0.0,0.0,0.0


In [35]:
#all fights where fighter_red or fighter_blue is Max Holloway
df_mixed[ (df_mixed['fighter_red'] == 'Max Holloway') |
                                         (df_mixed['fighter_blue'] == 'Max Holloway') ][['fight_link', 'event_date', 'event_name', 'fighter_red', 'fighter_blue',"winner","red_wins_before",
                                                                                                    #"red_losses_before","blue_losses_before", "blue_wins_before","kd_red_avg_before",
                                                                                                    #"str_red_avg_before",
                                                                                                   # "str_blue_avg_before","str_red","str_blue","red_age", "blue_age", "red_age_int", "blue_age_int",
                                                                                                    "fighter_red_elo_after", "fighter_blue_elo_after","elo_diff",
                                                                                                    "fighter_red_elo_before", "fighter_blue_elo_before"]].to_csv("testdf3.csv", index=False)

In [36]:
# ---------- 1. DataFrame vorbereiten ----------
df = df_mixed.copy()  # oder df_fights_enriched / dein finales DF MIT winner_is_red
# Falls winner_is_red noch nicht existiert:
# df['winner_is_red'] = (df['winner'] == df['fighter_red']).astype(int)

# Datum sicherstellen
df['event_date'] = pd.to_datetime(df['event_date'])

# ---------- 2. Letztes Event als Hold-Out entfernen ----------
# Finde das maximale Datum
max_date = df['event_date'].max()
last_event_names = df.loc[df['event_date'] == max_date, 'event_name'].unique()

holdout_mask = df['event_name'].isin(last_event_names) & (df['event_date'] == max_date)
df_holdout_event = df[holdout_mask].copy()
df_rest = df[~holdout_mask].copy()

print(f"Hold-Out Event(s): {list(last_event_names)} – {len(df_holdout_event)} Kämpfe")
print(f"Restliche Kämpfe: {len(df_rest)}")

Hold-Out Event(s): ['UFC Fight Night: Whittaker vs. De Ridder'] – 12 Kämpfe
Restliche Kämpfe: 8315


In [37]:
# ---------- 3. Letzte 30 Events als Test-Set ----------
# Events nach Datum sortieren
events_sorted = (
    df_rest[['event_name','event_date']]
    .drop_duplicates()
    .sort_values('event_date')
)

last_30_events = events_sorted.tail(30)['event_name'].tolist()
test_mask = df_rest['event_name'].isin(last_30_events)

df_test = df_rest[test_mask].copy()
df_train = df_rest[~test_mask].copy()

print(f"Train-Events: {df_train['event_name'].nunique()}, Fights: {len(df_train)}")
print(f"Test-Events : {df_test['event_name'].nunique()}, Fights: {len(df_test)}")

Train-Events: 710, Fights: 7942
Test-Events : 30, Fights: 373


In [38]:
# ---------- 4. Feature-/Target-Definition ----------
feature_cols = [
    'weight_class', 'red_height', 'red_weight', 'red_reach', 'red_stance',
    'blue_height', 'blue_weight', 'blue_reach', 'blue_stance', 'event_date',
    'location', 'fighter_red_elo_before', 'fighter_blue_elo_before', 'elo_diff',
    'kd_red_avg_before', 'str_red_avg_before', 'td_red_avg_before',
    'sub_red_avg_before', 'red_losses_before', 'red_avg_rounds_before',
    'red_avg_time_before', 'kd_blue_avg_before', 'str_blue_avg_before',
    'td_blue_avg_before', 'sub_blue_avg_before', 'blue_losses_before',
    'blue_avg_rounds_before', 'blue_avg_time_before', 'red_wins_before',
    'blue_wins_before','age_diff', 'reach_diff', 'win_diff', 'height_diff',
    'kd_diff', 'td_diff', 'str_diff', 'sub_diff'
]

target_col = 'winner_is_red'

# ---------- 5. Zusätzliche Zeit-Features ----------
# (Option: könnte Performance verbessern)
for frame in [df_train, df_test, df_holdout_event]:
    frame['event_timestamp'] = frame['event_date'].view('int64') // 10**9  # Sekunden
    # Relatives Feature (Tage seit erstem Event im Training)
first_day = df_train['event_date'].min()
for frame in [df_train, df_test, df_holdout_event]:
    frame['days_since_first'] = (frame['event_date'] - first_day).dt.days

# Falls du nur diese zwei Zeitfeatures nutzen willst, kannst du 'event_date' aus feature_cols entfernen
# oder lassen (One-Hot von Datum wäre sinnlos). Wir ersetzen 'event_date' durch engineered features:
feature_cols = [c for c in feature_cols if c != 'event_date'] + ['event_timestamp', 'days_since_first']


  frame['event_timestamp'] = frame['event_date'].view('int64') // 10**9  # Sekunden
  frame['event_timestamp'] = frame['event_date'].view('int64') // 10**9  # Sekunden
  frame['event_timestamp'] = frame['event_date'].view('int64') // 10**9  # Sekunden


In [39]:
# ---------- 6. Aufteilen in X / y ----------
X_train = df_train[feature_cols].copy()
y_train = df_train[target_col].astype(int)

X_test = df_test[feature_cols].copy()
y_test = df_test[target_col].astype(int)

X_holdout = df_holdout_event[feature_cols].copy()

In [40]:
# ---------- 7. Spalten-Typen bestimmen ----------
categorical_cols = ['weight_class', 'red_stance', 'blue_stance', 'location']
numeric_cols = [c for c in feature_cols if c not in categorical_cols]

# ---------- 8. Preprocessing Pipeline ----------
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='drop'
)

In [41]:
# ---------- 9. XGBoost Modell ----------
xgb_model = XGBClassifier(
    n_estimators=600,
    learning_rate=0.03,
    max_depth=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective='binary:logistic',
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42
)

clf = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', xgb_model)
])

# ---------- 10. Training ----------
clf.fit(X_train, y_train)



0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [42]:
# -------------------------------------------------
# 9) Evaluation
# -------------------------------------------------
y_proba_test = clf.predict_proba(X_test)[:,1]
y_pred_test  = (y_proba_test >= 0.5).astype(int)

print("\n--- Test Metrics ---")
print("AUC      :", roc_auc_score(y_test, y_proba_test))
print("Accuracy :", accuracy_score(y_test, y_pred_test))
print("LogLoss  :", log_loss(y_test, y_proba_test))
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))

# -------------------------------------------------
# 10) Hold-out Predictions
# -------------------------------------------------
holdout_proba = clf.predict_proba(X_holdout)[:,1]
holdout_pred  = (holdout_proba >= 0.5).astype(int)

df_holdout_pred = df_holdout_event.copy()
df_holdout_pred['pred_winner_is_red_proba'] = holdout_proba
df_holdout_pred['pred_winner_is_red'] = holdout_pred
df_holdout_pred['predicted_winner_name'] = np.where(
    df_holdout_pred['pred_winner_is_red']==1,
    df_holdout_pred['fighter_red'],
    df_holdout_pred['fighter_blue']
)

print("\n--- Hold-out Vorhersagen ---")
print(df_holdout_pred[['event_name','fighter_red','fighter_blue','winner',
                       'pred_winner_is_red_proba','predicted_winner_name']])


--- Test Metrics ---
AUC      : 0.9857027556785426
Accuracy : 0.9276139410187667
LogLoss  : 0.1625551784854499

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.92       177
           1       0.94      0.92      0.93       196

    accuracy                           0.93       373
   macro avg       0.93      0.93      0.93       373
weighted avg       0.93      0.93      0.93       373


--- Hold-out Vorhersagen ---
                                    event_name        fighter_red  \
8315  UFC Fight Night: Whittaker vs. De Ridder        Davey Grant   
8316  UFC Fight Night: Whittaker vs. De Ridder           Petr Yan   
8317  UFC Fight Night: Whittaker vs. De Ridder       Martin Buday   
8318  UFC Fight Night: Whittaker vs. De Ridder      Steven Nguyen   
8319  UFC Fight Night: Whittaker vs. De Ridder      Tabatha Ricci   
8320  UFC Fight Night: Whittaker vs. De Ridder  Said Nurmagomedov   
8321  UFC Fight Night: 



In [75]:
df_holdout_pred[['event_name','fighter_red','fighter_blue','winner',
                       'pred_winner_is_red_proba','predicted_winner_name']]

Unnamed: 0,event_name,fighter_red,fighter_blue,winner,pred_winner_is_red_proba,predicted_winner_name
8315,UFC Fight Night: Whittaker vs. De Ridder,Davey Grant,Da'Mon Blackshear,Davey Grant,0.306285,Da'Mon Blackshear
8316,UFC Fight Night: Whittaker vs. De Ridder,Petr Yan,Marcus McGhee,Petr Yan,0.990161,Petr Yan
8317,UFC Fight Night: Whittaker vs. De Ridder,Martin Buday,Marcus Buchecha,Martin Buday,0.996644,Martin Buday
8318,UFC Fight Night: Whittaker vs. De Ridder,Steven Nguyen,Mohammad Yahya,Mohammad Yahya,0.161214,Mohammad Yahya
8319,UFC Fight Night: Whittaker vs. De Ridder,Tabatha Ricci,Amanda Ribas,Amanda Ribas,0.031717,Amanda Ribas
8320,UFC Fight Night: Whittaker vs. De Ridder,Said Nurmagomedov,Bryce Mitchell,Bryce Mitchell,0.566031,Said Nurmagomedov
8321,UFC Fight Night: Whittaker vs. De Ridder,Muslim Salikhov,Carlos Leal,Muslim Salikhov,0.983575,Muslim Salikhov
8322,UFC Fight Night: Whittaker vs. De Ridder,Bogdan Guskov,Nikita Krylov,Nikita Krylov,0.02069,Nikita Krylov
8323,UFC Fight Night: Whittaker vs. De Ridder,Asu Almabayev,Ramazan Temirov,Asu Almabayev,0.972375,Asu Almabayev
8324,UFC Fight Night: Whittaker vs. De Ridder,Shara Magomedov,Marc-Andre Barriault,Shara Magomedov,0.987573,Shara Magomedov


In [43]:
# 1) Alles in EIN DataFrame mit den Meta-Infos (Namen, Event, …)
df_test_pred = df_test.reset_index(drop=True).copy()
df_test_pred['y_true']  = y_test.to_numpy().astype(int)
df_test_pred['y_proba'] = y_proba_test
df_test_pred['y_pred']  = y_pred_test.astype(int)

# Wer laut Modell gewinnt (Name):
df_test_pred['predicted_winner_name'] = np.where(
    df_test_pred['y_pred'] == 1,
    df_test_pred['fighter_red'],
    df_test_pred['fighter_blue']
)

# Korrekt oder nicht?
df_test_pred['correct'] = (df_test_pred['y_true'] == df_test_pred['y_pred']).astype(int)

In [None]:
df_test_pred["event_name","fighter_red","fighter_blue","y_true","y_proba","y_pred","predicted_winner_name","correct"].to_csv("test_predictions.csv", index=False)

Unnamed: 0,event_name,fighter_red,fighter_blue,winner,kd_red,kd_blue,str_red,str_blue,td_red,td_blue,...,td_diff,str_diff,sub_diff,event_timestamp,days_since_first,y_true,y_proba,y_pred,predicted_winner_name,correct
359,UFC 318: Holloway vs. Poirier 3,Francisco Prado,Nikolay Veretennikov,Nikolay Veretennikov,0.0,0.0,47.0,56.0,3.0,2.0,...,0.25,26.25,0.25,1752883200,11572,0,0.773753,1,Francisco Prado,0
360,UFC 318: Holloway vs. Poirier 3,Nicolle Caliari,Carli Judice,Carli Judice,0.0,1.0,32.0,109.0,1.0,0.0,...,2.0,-26.0,0.0,1752883200,11572,0,0.013195,0,Carli Judice,1
361,UFC 318: Holloway vs. Poirier 3,Jackson McVey,Brunno Ferreira,Brunno Ferreira,0.0,0.0,4.0,7.0,0.0,1.0,...,-0.5,-20.666667,-0.333333,1752883200,11572,0,0.011695,0,Brunno Ferreira,1
362,UFC 318: Holloway vs. Poirier 3,Jimmy Crute,Marcin Prachnio,Jimmy Crute,0.0,0.0,5.0,21.0,3.0,0.0,...,1.8,-31.4,0.8,1752883200,11572,1,0.997197,1,Jimmy Crute,1
363,UFC 318: Holloway vs. Poirier 3,Adam Fugitt,Islam Dulatov,Islam Dulatov,0.0,1.0,14.0,21.0,0.0,0.0,...,1.25,43.5,0.0,1752883200,11572,0,0.059243,0,Islam Dulatov,1
364,UFC 318: Holloway vs. Poirier 3,Ateba Gautier,Robert Valentin,Ateba Gautier,1.0,0.0,19.0,2.0,0.0,0.0,...,0.0,16.0,-1.0,1752883200,11572,1,0.996903,1,Ateba Gautier,1
365,UFC 318: Holloway vs. Poirier 3,Ryan Spann,Lukasz Brzeski,Ryan Spann,0.0,0.0,8.0,5.0,1.0,0.0,...,0.5,-25.261905,0.571429,1752883200,11572,1,0.998012,1,Ryan Spann,1
366,UFC 318: Holloway vs. Poirier 3,Marvin Vettori,Brendan Allen,Brendan Allen,0.0,0.0,109.0,111.0,0.0,3.0,...,0.75,41.422794,-0.382353,1752883200,11572,0,0.025887,0,Brendan Allen,1
367,UFC 318: Holloway vs. Poirier 3,Vinicius Oliveira,Kyler Phillips,Vinicius Oliveira,1.0,0.0,96.0,49.0,1.0,0.0,...,-0.375,-1.083333,-0.041667,1752883200,11572,1,0.487921,0,Kyler Phillips,0
368,UFC 318: Holloway vs. Poirier 3,Patricio Freire,Dan Ige,Patricio Freire,0.0,0.0,55.0,47.0,5.0,0.0,...,0.368421,-29.263158,0.947368,1752883200,11572,1,0.983089,1,Patricio Freire,1
