In [None]:
"""
Forecasts: 
1) PREVC Domen
2) NIKAIDO Ren
3) KOBAYASHI Ryoyu
4) RAIMUND Philipp
5) FOUBERT Valentin 

Description of the model and variables used: 
The Ski Jumping prediction model is based on logistic regression, with the target variable indicating whether a jumper finishes in the Top 5 of a World Cup event. 
The dataset consists of official FIS World Cup results from the 2022-2026 seasons. All features are constructed using only historical information to avoid data leakage.

The model includes the following variables:
Prev_Elo - a lagged Elo-like rating capturing the jumper's competitive strength, updated after each competition and shifted by one event.
Top5_Rate_3 - the rolling average share of Top-5 finishes over the previous three competitions.
Std_Rank_3 - the rolling standard deviation of finishing positions over the previous three competitions, measuring performance volatility.
Rank_Trend_3 - a short-term performance trend computed from changes in finishing positions over the previous three competitions.
"""

In [1]:
# Create raw_df
import pandas as pd

raw_df = pd.read_csv('/Users/timur/Downloads/merged_csv.csv')

# Feature Engineering and Data Cleaning
df = raw_df.copy()

df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y")

df['Name'] = df['Name'].str.strip('. ')

df = df[df['Round'] == 1].copy()
df.drop(columns=['Round'], inplace=True)

df = df.sort_values(['Name', 'Date'])

# Compute features
df['Std_Rank_3'] = df.groupby('Name')['Rank'].shift(1).rolling(3, min_periods=1).std()
df['Std_Rank_3'] = df['Std_Rank_3'].fillna(df['Std_Rank_3'].mean())

df['Top5'] = (df['Rank'] <= 5).astype(int)

df['Top5_Rate_3'] = (df.groupby('Name')['Top5'].shift(1).rolling(3, min_periods=1).mean()).fillna(0)

df['Top5'] = (df['Rank'] <= 5).astype(int)

df['Elo'] = 1500.0

K = 20

for date, comp in df.groupby('Date'):
    mean_elo = comp['Elo'].mean()

    for idx, row in comp.iterrows():
        expected_rank = mean_elo - row['Elo']
        actual_rank = row['Rank'] - comp['Rank'].mean()

        df.loc[idx, 'Elo'] += K * (expected_rank - actual_rank)

df['Prev_Elo'] = df.groupby('Name')['Elo'].shift(1)
df['Prev_Elo'] = df['Prev_Elo'].fillna(1500)


nat_counts = df['Nat'].value_counts()
common_nats = nat_counts[nat_counts >= 50 ].index

df['Nat'] = df['Nat'].where(df['Nat'].isin(common_nats), 'Other')
df = pd.get_dummies(df, columns=['Nat'], drop_first=True)

bool_cols = df.select_dtypes(include='boolean').columns
df[bool_cols] = df[bool_cols].astype(int)

df['Rank_Trend_3'] = (
    df.groupby('Name')['Rank']
      .shift(1)
      .rolling(3, min_periods=2)
      .apply(lambda x: x.iloc[-1] - x.iloc[0])
).fillna(0)


df_model = df.drop(columns=['Season', 'HS', 'Speed', 'Dist', 'Dist_Pts', 'Judges_Tot', 'Gate', 'Wind_ms', 
                      'Wind_Pts', 'Round_Rank', 'Source_File', 'Round_Total', 'Total', 'Rank', 'Loc', 'Elo', 'Bib'])

df_model.head(10)


Unnamed: 0,Date,Name,Std_Rank_3,Top5,Top5_Rate_3,Prev_Elo,Nat_FIN,Nat_GER,Nat_ITA,Nat_JPN,Nat_NOR,Nat_Other,Nat_POL,Nat_SLO,Nat_SUI,Nat_USA,Rank_Trend_3
3128,2021-11-28,AALTO Antti,7.610736,0,0.0,1500.0,1,0,0,0,0,0,0,0,0,0,0.0
2722,2022-03-03,AALTO Antti,7.610736,0,0.0,1074.583333,1,0,0,0,0,0,0,0,0,0,0.0
2635,2022-03-05,AALTO Antti,7.071068,0,0.0,1264.489796,1,0,0,0,0,0,0,0,0,0,0.0
2570,2022-11-05,AALTO Antti,8.082904,0,0.0,1388.8,1,0,0,0,0,0,0,0,0,0,-16.0
2476,2022-11-06,AALTO Antti,7.549834,0,0.0,1088.8,1,0,0,0,0,0,0,0,0,0,9.0
2412,2022-11-26,AALTO Antti,8.386497,0,0.0,1369.2,1,0,0,0,0,0,0,0,0,0,1.0
2307,2022-11-27,AALTO Antti,8.386497,0,0.0,1069.6,1,0,0,0,0,0,0,0,0,0,1.0
2225,2022-12-17,AALTO Antti,10.816654,0,0.0,1488.8,1,0,0,0,0,0,0,0,0,0,-6.0
2067,2023-01-15,AALTO Antti,12.423097,0,0.0,1509.2,1,0,0,0,0,0,0,0,0,0,-22.0
2000,2023-03-14,AALTO Antti,0.57735,0,0.0,1490.0,1,0,0,0,0,0,0,0,0,0,0.0


In [2]:
# Feature selection
X = df_model.drop(columns=['Date', 'Top5', 'Name'], errors='ignore')
y = df_model['Top5']

corr = X.corrwith(y).sort_values(key=abs, ascending=False)
corr

Top5_Rate_3     0.500703
Prev_Elo        0.386022
Nat_Other      -0.110277
Nat_GER         0.106335
Std_Rank_3     -0.082012
Nat_FIN        -0.080176
Nat_USA        -0.074525
Rank_Trend_3   -0.073670
Nat_ITA        -0.058512
Nat_POL        -0.052080
Nat_SUI        -0.050880
Nat_SLO         0.045422
Nat_NOR        -0.007807
Nat_JPN         0.003077
dtype: float64

In [3]:
# Mutual Info
from sklearn.feature_selection import mutual_info_classif

mi = mutual_info_classif(X, y, discrete_features="auto", random_state=42)

mi_scores = pd.Series(mi, index=X.columns).sort_values(ascending=False)

mi_scores

Prev_Elo        0.097498
Top5_Rate_3     0.092864
Std_Rank_3      0.022090
Nat_GER         0.014062
Nat_SLO         0.006752
Nat_NOR         0.006506
Nat_JPN         0.005493
Nat_POL         0.005343
Rank_Trend_3    0.005285
Nat_Other       0.001967
Nat_SUI         0.001349
Nat_FIN         0.000000
Nat_ITA         0.000000
Nat_USA         0.000000
dtype: float64

In [4]:
selected_features = mi_scores[mi_scores > 0.005].index.tolist()

selected_features = [f for f in selected_features if not f.startswith('Nat_')]

X_filtered = X[selected_features]

In [5]:
X_full = X_filtered
y_full = y

In [6]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(
    max_iter=5000,
    class_weight="balanced",
    random_state=42
)

logit.fit(X_full, y_full)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,5000


In [7]:
pre_event = df_model.sort_values('Date').groupby('Name').tail(1).copy()

In [8]:
last_date = df['Date'].max()

active_names = (
    df[df['Date'] >= last_date - pd.Timedelta(days=60)]
    ['Name']
    .unique()
)

pre_event = pre_event[pre_event['Name'].isin(active_names)]

In [9]:
X_pred = pre_event[selected_features]
X_pred = X_pred.reindex(columns=X_full.columns, fill_value=0)

In [10]:
pre_event['p_top5'] = logit.predict_proba(X_pred)[:, 1]

In [11]:
engelberg_pred = (
    pre_event[['Name', 'p_top5']]
    .sort_values('p_top5', ascending=False)
    .head(5)
)

engelberg_pred

Unnamed: 0,Name,p_top5
0,PREVC Domen,0.957858
4,NIKAIDO Ren,0.95187
10,KOBAYASHI Ryoyu,0.951466
6,RAIMUND Philipp,0.856859
22,FOUBERT Valentin,0.811167
