# Predicting 100m Freestyle Gold Medal Time for the 2028 Olympics
This notebook demonstrates how to predict the gold medal time for the 100m freestyle event at the 2028 Olympics using advanced ML techniques (LightGBM, StandardScaler, MSE minimization).

In [1]:
!pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Using cached lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [4]:
# --- Robust pipeline for 100m freestyle gold medal prediction (with scaling, no PCA) ---
# This cell replaces the previous pipeline and avoids PCA, which is not needed for 4 features and can cause instability on small test sets.
# It also ensures proper encoding and scaling, and checks for NaN/infinite values.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# 1. Load data
file_path = '../data/Olympic_Swimming.csv'
df = pd.read_csv(file_path)

# 2. Convert Results to seconds
def time_to_seconds(t):
    import re
    if pd.isna(t): return np.nan
    t = str(t).strip()
    if re.fullmatch(r'\d+(\.\d+)?', t):
        return float(t)
    if re.fullmatch(r'\d+:\d+(\.\d+)?', t):
        m, s = t.split(':')
        return int(m)*60 + float(s)
    if re.fullmatch(r'\d+:\d+:\d+(\.\d+)?', t):
        h, m, s = t.split(':')
        return int(h)*3600 + int(m)*60 + float(s)
    return np.nan

df['result_seconds'] = df['Results'].apply(time_to_seconds)
df['distance_m'] = df['Distance (in meters)'].str.extract(r'(\d+)').astype(float)

# 3. Filter for 100m freestyle gold medalists only
data = df[(df['distance_m'] == 100) & (df['Stroke'] == 'Freestyle') & (df['Rank'] == 1)]
data = data.dropna(subset=['result_seconds', 'Year', 'Gender', 'Team', 'Location'])

# 4. Encode categorical variables
for col in ['Gender', 'Team', 'Location']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

# 5. Prepare features and target
X = data[['Year', 'Gender', 'Team', 'Location']].astype(float)
y = data['result_seconds'].astype(float)

# 6. Train/test split (all but 2020 for train, 2020 for test)
train = data[data['Year'] < 2020]
test = data[data['Year'] == 2020]
X_train, y_train = train[['Year', 'Gender', 'Team', 'Location']], train['result_seconds']
X_test, y_test = test[['Year', 'Gender', 'Team', 'Location']], test['result_seconds']

# 7. Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 8. Train LightGBM regressor
lgbm = lgb.LGBMRegressor(objective='regression', n_estimators=100, random_state=42)
lgbm.fit(X_train_scaled, y_train)

# 9. Evaluate model
if len(X_test_scaled) > 0:
    y_pred = lgbm.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    print(f'MSE on 2020 test set: {mse:.3f}')
    print('2020 actual times:', list(y_test))
    print('2020 predicted times:', list(y_pred))
else:
    print('No 2020 test data available.')

# 10. Predict for 2028 (using most common values for categorical features)
future = pd.DataFrame({
    'Year': [2028],
    'Gender': [data['Gender'].mode()[0]],
    'Team': [data['Team'].mode()[0]],
    'Location': [data['Location'].mode()[0]]
})
future_scaled = scaler.transform(future)
pred_2028 = lgbm.predict(future_scaled)[0]
print(f'Predicted 100m freestyle gold medal time for 2028: {pred_2028:.2f} seconds')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000013 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 39
[LightGBM] [Info] Number of data points in the train set: 50, number of used features: 4
[LightGBM] [Info] Start training from score 56.818801
MSE on 2020 test set: 5.221
2020 actual times: [47.02, 51.96]
2020 predicted times: [48.45703431554359, 54.85437222174171]
Predicted 100m freestyle gold medal time for 2028: 48.30 seconds
