In [1]:
import pandas as pd
import numpy as np
import math
from datetime import timedelta
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, roc_auc_score

# ==============================================================================
# 1. FUNKCJE POMOCNICZE
# ==============================================================================

def get_ewma(series, span):
    """Return the exponentially weighted moving average of ``series``.

    The average uses the recursive formulation (``adjust=False``) with the
    decay expressed through ``span``.
    """
    weighted = series.ewm(span=span, adjust=False)
    return weighted.mean()

def _to_array(data):
    """Return the raw values of ``data`` without any pandas index."""
    if isinstance(data, (pd.Series, pd.Index)):
        return data.values
    return np.asarray(data)


def _near_any(days, reference_days, tolerance):
    """Flag each entry of ``days`` lying within ``tolerance`` of any reference day."""
    if reference_days.size == 0:
        return np.zeros(days.shape[0], dtype=bool)
    gaps = np.abs(days[:, None] - reference_days[None, :])
    return (gaps <= tolerance).any(axis=1)


def evaluate_with_tolerance(y_true, y_proba, player_ids, dates, threshold,
                            tolerance_days=3, beta=4):
    """Score alarms against injuries per player with a +/- day tolerance.

    An alarm is a row whose probability is ``>= threshold``. Counting rules
    (evaluated separately for every player):

    * TP - injury row with at least one alarm within ``tolerance_days``.
    * FN - injury row without such an alarm.
    * FP - alarm row with no injury within ``tolerance_days``.
    * TN - non-alarm row with no injury within ``tolerance_days``.

    Non-alarm rows that fall inside an injury window but are not injuries
    themselves are not counted in any cell.

    Distances are measured in whole calendar days: timestamps are normalised
    to midnight first, so a time-of-day component cannot make the window
    asymmetric (``Timedelta.days`` floors negative durations).

    Args:
        y_true: Array-like of 0/1 labels (1 = injury).
        y_proba: Array-like of predicted probabilities.
        player_ids: Array-like of player identifiers, one per row.
        dates: Array-like of dates (anything ``pd.to_datetime`` accepts).
        threshold: Probability at or above which a row counts as an alarm.
        tolerance_days: Half-width of the matching window, in days.
        beta: Weight of recall relative to precision in the F-beta score.

    Returns:
        dict with keys ``TP``, ``FN``, ``FP``, ``TN``, ``F_Beta``, ``MCC``,
        ``Recall``, ``Precision`` and ``Specificity``.
    """
    eval_df = pd.DataFrame({
        'player_name': _to_array(player_ids),
        'date': pd.to_datetime(_to_array(dates)).normalize(),
        'target': _to_array(y_true),
        'proba': _to_array(y_proba),
    })
    eval_df['pred'] = eval_df['proba'] >= threshold

    tolerance = pd.Timedelta(days=tolerance_days).to_timedelta64()
    tp, fn, fp, tn = 0, 0, 0, 0

    for _, group in eval_df.groupby('player_name'):
        days = group['date'].to_numpy()
        is_injury = group['target'].to_numpy() == 1
        is_alarm = group['pred'].to_numpy()

        injury_days = days[is_injury]
        alarm_days = days[is_alarm]

        # TP / FN: one decision per injury row.
        detected = _near_any(injury_days, alarm_days, tolerance)
        tp += int(detected.sum())
        fn += int((~detected).sum())

        # FP / TN: only rows outside every injury window are counted.
        outside_window = ~_near_any(days, injury_days, tolerance)
        fp += int((is_alarm & outside_window).sum())
        tn += int((~is_alarm & outside_window).sum())

    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # F-beta; the small epsilon guards against 0/0 when both terms are zero.
    f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall + 1e-10)

    # Matthews correlation coefficient; defined as 0 when any margin is empty.
    numerator = (tp * tn) - (fp * fn)
    denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    mcc = numerator / denominator if denominator > 0 else 0

    return {
        'TP': tp, 'FN': fn, 'FP': fp, 'TN': tn,
        'F_Beta': f_beta, 'MCC': mcc,
        'Recall': recall, 'Precision': precision, 'Specificity': specificity
    }

# ==============================================================================
# 2. PRZYGOTOWANIE DANYCH
# ==============================================================================
print("1/7 Wczytywanie i czyszczenie...")
# Load the raw export and order it chronologically within each player.
df = pd.read_csv("outputC.csv")
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by=['player_name', 'date'])

# Columns in which a literal 0 is treated as a missing reading.
cols_to_fix = [
    'heart_rate_mean', 'heart_rate_max', 'speed_mean',
    'daily_load', 'fatigue', 'soreness',
]
for col in cols_to_fix:
    if col not in df.columns:
        continue
    df[col] = df[col].replace(0, np.nan)
    # Imputation cascade: player median -> global median -> 0.
    per_player_median = df.groupby('player_name')[col].transform('median')
    df[col] = df[col].fillna(per_player_median)
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].median())
    df[col] = df[col].fillna(0)

print("2/7 Feature Engineering...")
grouped = df.groupby('player_name')['daily_load']

# Per-player EWMAs of daily load over 7- and 28-day spans, and their ratio
# (the +1 in the denominator avoids division by zero).
df['atl_ewma'] = grouped.transform(lambda load: get_ewma(load, 7))
df['ctl_ewma'] = grouped.transform(lambda load: get_ewma(load, 28))
df['acwr_ewma'] = df['atl_ewma'] / (df['ctl_ewma'] + 1)

# 7-row rolling mean / std of load per player; their ratio is stored as monotony.
roll_mean = grouped.transform(lambda load: load.rolling(window=7).mean())
roll_std = grouped.transform(lambda load: load.rolling(window=7).std())
df['monotony_calc'] = roll_mean / (roll_std + 0.1)
df['strain_calc'] = df['daily_load'] * df['monotony_calc']

# Load relative to the player's previous row.
df['daily_jump'] = df['daily_load'] / (grouped.shift(1) + 1)

features_to_lag = [
    'daily_load', 'fatigue', 'soreness', 'acwr_ewma',
    'monotony_calc', 'strain_calc', 'daily_jump',
]
# Shift every listed feature by one row within each player in a single pass.
lagged = df.groupby('player_name')[features_to_lag].shift(1)
for col in features_to_lag:
    df[f'{col}_lag1'] = lagged[col]

# Rows without a previous-row ACWR (each player's first row) are dropped.
df = df[df['acwr_ewma_lag1'].notna()]

print("3/7 Trening Modelu...")
# Identifier, label and merge-artifact columns that must not be used as features.
cols_exclude = [
    'player_name', 'date', 'target', 'has_injury', 'has_illness',
    'timestamp_x', 'timestamp_y', 'problems_x', 'problems_y',
    'type_x', 'type_y', 'target_x', 'target_y', 'year',
]
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
features = [name for name in numeric_cols if name not in cols_exclude]

# NOTE(review): this is a random, row-wise stratified split. Rows of the same
# player on neighbouring days can land on both sides of the split while sharing
# rolling/lagged inputs - confirm this is acceptable for the reported scores.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df['target'],
)

X_train, y_train = train_df[features], train_df['target']
X_test, y_test = test_df[features], test_df['target']

# Positive (injury) rows are weighted 15x so that misses cost more than false alarms.
sample_weight = np.where(y_train == 1, 15, 1)
model = HistGradientBoostingClassifier(
    learning_rate=0.03,
    max_depth=5,
    l2_regularization=0.5,
    random_state=42,
)
model.fit(X_train, y_train, sample_weight=sample_weight)

# ==============================================================================
# 3. OPTYMALIZACJA (F4 SCORE)
# ==============================================================================
print("4/7 Optymalizacja progu (F4 Score)...")
# Positive-class probabilities on the held-out rows.
y_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)

precision_curve, recall_curve, thresholds = precision_recall_curve(y_test, y_proba)

# F-beta along the precision/recall curve; the epsilon avoids 0/0.
beta = 4
beta_sq = beta**2
f_beta_numerator = (1 + beta_sq) * (precision_curve * recall_curve)
f_beta_denominator = (beta_sq * precision_curve) + recall_curve + 1e-10
f_beta_scores = f_beta_numerator / f_beta_denominator
best_thresh = thresholds[f_beta_scores.argmax()]

print(f"   -> Wybrano próg: {best_thresh:.4f}")

# Tolerance-aware confusion counts at the selected threshold.
metrics = evaluate_with_tolerance(
    y_test,
    y_proba,
    test_df['player_name'],
    test_df['date'],
    threshold=best_thresh,
    tolerance_days=3,
)

# ==============================================================================
# 4. ANALIZA TOP 10 (NOWOŚĆ!!!)
# ==============================================================================
print("5/7 Generowanie TOP 10 zagrożonych...")

# One row per test observation: identifiers, true label, probability and
# the thresholded prediction.
results_df = test_df[['player_name', 'date', 'target']].assign(
    injury_probability=y_proba,
    prediction=(y_proba >= best_thresh).astype(int),
)

# Ten rows with the highest predicted risk.
ranked_by_risk = results_df.sort_values(by='injury_probability', ascending=False)
top_10 = ranked_by_risk.iloc[:10].copy()

# Funkcja sprawdzająca czy kontuzja była w pobliżu (nie tylko w ten sam dzień)
def check_nearby_injury(row, df_full, days=3):
    """Tell whether ``row``'s player has an injury within +/- ``days`` of ``row``'s date.

    ``df_full`` must provide ``player_name``, ``date`` and ``target`` columns;
    a row with ``target == 1`` is an injury. Both window ends are inclusive.
    """
    half_window = timedelta(days=days)
    earliest = row['date'] - half_window
    latest = row['date'] + half_window

    is_hit = (
        (df_full['player_name'] == row['player_name'])
        & (df_full['target'] == 1)
        & df_full['date'].between(earliest, latest)
    )
    return bool(is_hit.any())

# Flag every top-10 row by whether the same player has a recorded injury
# within +/- 3 days. Only rows present in test_df are searched.
top_10['Real_Injury_Nearby'] = top_10.apply(lambda row: check_nearby_injury(row, test_df, 3), axis=1)

# Presentation copy: probability as a percentage and date without time.
top_10_display = top_10[['player_name', 'date', 'injury_probability', 'target', 'Real_Injury_Nearby']].copy()
# Fixed two-decimal formatting keeps the column aligned ("91.00%", not "91.0%").
top_10_display['injury_probability'] = (top_10_display['injury_probability'] * 100).map('{:.2f}%'.format)
top_10_display['date'] = top_10_display['date'].dt.date

# ==============================================================================
# 5. EKSPORT PLIKÓW
# ==============================================================================
print("6/7 Zapisywanie plików...")
# Training features together with their label.
X_train_full = X_train.assign(target=y_train)
X_train_full.to_csv("train_data_final.csv", index=False)

# Test features, true labels, thresholded predictions and raw probabilities.
X_test.to_csv("test_features_final.csv", index=False)
y_test.to_frame().to_csv("test_labels_true_final.csv", index=False)
pd.DataFrame({'prediction': (y_proba >= best_thresh).astype(int)}).to_csv(
    "test_labels_pred_final.csv", index=False
)
pd.DataFrame({'injury_probability': y_proba}).to_csv("test_proba_final.csv", index=False)

# Bucket probabilities: (-1, 0.05] Low, (0.05, 0.30] Medium, (0.30, 1.0] High.
risk_bins = [-1, 0.05, 0.30, 1.0]
risk_labels = ['Low', 'Medium', 'High']
results_df['risk_level'] = pd.cut(results_df['injury_probability'], bins=risk_bins, labels=risk_labels)
results_df.to_csv("test_results_complete.csv", index=False)

# The top-10 table is saved as well for convenience.
top_10_display.to_csv("top_10_risks.csv", index=False)

# Console report, part 1: the ten highest-risk rows with a legend.
print("\n" + "="*60)
print("       TOP 10 ZAGROŻONYCH ZAWODNIKÓW")
print("="*60)
print(top_10_display.to_string(index=False))
print("-" * 60)
print("LEGENDA:")
print("* injury_probability: Jak bardzo model jest pewny kontuzji.")
print("* target: Czy kontuzja wpisana jest DOKŁADNIE w ten dzień (1=Tak, 0=Nie).")
print("* Real_Injury_Nearby: Czy kontuzja zdarzyła się w oknie +/- 3 dni (Prawdziwy Test).")
print("  -> Jeśli 'True', to znaczy, że system zadziałał perfekcyjnie!")
print("="*60)

# Console report, part 2: tolerance-aware confusion counts and headline scores.
print("\n" + "="*60)
print("       EWALUACJA MODELU")
print("="*60)
print(f"TP (Sukces):                 {metrics['TP']}")
print(f"FN (Przegapione):            {metrics['FN']}")
print(f"FP (Fałszywe Alarmy):        {metrics['FP']}")
# This line reports the true-negative count, so it is labelled TN.
print(f"TN (Prawidłowe braki kontuzji):        {metrics['TN']}")
print(f"RECALL (Czułość):            {metrics['Recall']:.4f}")
print(f"ROC AUC:                     {roc_auc:.4f}")
print("="*60)

1/7 Wczytywanie i czyszczenie...
2/7 Feature Engineering...
3/7 Trening Modelu...
4/7 Optymalizacja progu (F4 Score)...
   -> Wybrano próg: 0.0828
5/7 Generowanie TOP 10 zagrożonych...
6/7 Zapisywanie plików...

       TOP 10 ZAGROŻONYCH ZAWODNIKÓW
                               player_name       date injury_probability  target  Real_Injury_Nearby
TeamA-4051bba7-1170-4c43-b912-8c38815a7625 2020-08-05             91.55%       0               False
TeamA-4051bba7-1170-4c43-b912-8c38815a7625 2020-06-17             91.55%       1                True
TeamA-4051bba7-1170-4c43-b912-8c38815a7625 2020-08-12              91.0%       1                True
TeamA-5cd7a61b-88b2-46d2-94f8-5a0d4f682d93 2020-06-05             90.49%       1                True
TeamA-4051bba7-1170-4c43-b912-8c38815a7625 2020-06-19              90.3%       1                True
TeamA-4051bba7-1170-4c43-b912-8c38815a7625 2020-06-08             89.85%       1                True
TeamA-4051bba7-1170-4c43-b912-8c38815a7625 2