In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Load the datasets
athletes = pd.read_csv('summerOly_athletes.csv')
hosts = pd.read_csv('summerOly_hosts.csv')
medal_counts = pd.read_csv('summerOly_medal_counts.csv')
programs = pd.read_csv('summerOly_programs.csv', encoding='cp1252')

print("Data loaded successfully!")
print(f"Athletes shape: {athletes.shape}")
print(f"Medal counts shape: {medal_counts.shape}")
print(f"Programs shape: {programs.shape}")

# Create NOC mapping between datasets
medal_counts['NOC'] = medal_counts['NOC'].str.strip().str.replace('\xa0', '', regex=False)
medal_counts['NOC'] = medal_counts['NOC'].str.replace(r'\s+', ' ', regex=True)
athletes['NOC'] = athletes['NOC'].str.strip()
athletes['Team'] = athletes['Team'].str.strip().str.replace('\xa0', '', regex=False)

# Comprehensive NOC mapping
noc_mapping = {
    'USA': 'United States', 'GBR': 'Great Britain', 'GER': 'Germany', 'CHN': 'China',
    'RUS': 'Russia', 'JPN': 'Japan', 'AUS': 'Australia', 'ITA': 'Italy', 
    'FRA': 'France', 'CAN': 'Canada', 'NED': 'Netherlands', 'KOR': 'South Korea',
    'ESP': 'Spain', 'BRA': 'Brazil', 'CUB': 'Cuba', 'NZL': 'New Zealand',
    'ROU': 'Romania', 'POL': 'Poland', 'HUN': 'Hungary', 'KEN': 'Kenya',
    'NOR': 'Norway', 'SWE': 'Sweden', 'DEN': 'Denmark', 'FIN': 'Finland',
    'RSA': 'South Africa', 'UKR': 'Ukraine', 'CZE': 'Czech Republic', 'BEL': 'Belgium',
    'AUT': 'Austria', 'SUI': 'Switzerland', 'ARG': 'Argentina', 'MEX': 'Mexico',
    'GRE': 'Greece', 'SRB': 'Serbia', 'JAM': 'Jamaica', 'TUR': 'Turkey',
    'IRI': 'Iran', 'BLR': 'Belarus', 'ETH': 'Ethiopia', 'CRO': 'Croatia',
    'TTO': 'Trinidad and Tobago', 'IND': 'India', 'POR': 'Portugal', 'COL': 'Colombia'
}

# Create historical features
def create_historical_features(df, years_back=[1, 2, 3]):
    df = df.sort_values(['NOC', 'Year'])
    feature_cols = ['Gold', 'Silver', 'Bronze', 'Total']
    
    for col in feature_cols:
        # Moving averages
        df[f'{col}_MA3'] = df.groupby('NOC')[col].transform(
            lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
        )
        # Lagged features
        for lag in years_back:
            df[f'{col}_lag{lag}'] = df.groupby('NOC')[col].shift(lag)
        # Trend features
        df[f'{col}_trend'] = df.groupby('NOC')[col].transform(
            lambda x: x.diff().rolling(window=2, min_periods=1).mean().shift(1)
        )
    
    # Years since first medal
    df['first_medal_year'] = df.groupby('NOC')['Year'].transform('min')
    df['years_competing'] = df['Year'] - df['first_medal_year']
    df['olympics_participated'] = df.groupby('NOC').cumcount() + 1
    
    return df

medal_features = create_historical_features(medal_counts.copy())

# Merge host information
medal_features = medal_features.merge(hosts, on='Year', how='left')
medal_features['is_host'] = (medal_features['NOC'] == medal_features['Host']).astype(int)
medal_features.drop('Host', axis=1, inplace=True)

# Add post-host effect
medal_features = medal_features.sort_values(['NOC', 'Year'])
medal_features['was_host_prev'] = medal_features.groupby('NOC')['is_host'].shift(1).fillna(0)
medal_features['was_host_prev2'] = medal_features.groupby('NOC')['is_host'].shift(2).fillna(0)

# Count number of events per year
events_per_year = programs.iloc[:, 4:].notna().sum()
events_df = pd.DataFrame({
    'Year': [int(col) if col != '1906*' else 1906 for col in events_per_year.index],
    'total_events': events_per_year.values
})

# Merge with medal features
medal_features = medal_features.merge(events_df, on='Year', how='left')
medal_features['medals_per_event'] = medal_features['Total'] / medal_features['total_events']
medal_features['medals_per_event_MA3'] = medal_features.groupby('NOC')['medals_per_event'].transform(
    lambda x: x.rolling(window=3, min_periods=1).mean().shift(1)
)

# Filter modern era and create complete dataset
medal_features = medal_features[medal_features['Year'] >= 1960]
all_years = sorted(medal_features['Year'].unique())
all_nocs = sorted(medal_features['NOC'].unique())

# Get additional countries from athletes data
athletes_countries = athletes.groupby('NOC').agg({
    'Year': ['min', 'max', 'nunique'],
    'Name': 'count'
}).reset_index()
athletes_countries.columns = ['NOC', 'first_year', 'last_year', 'num_olympics', 'num_athletes']
athletes_countries['NOC_mapped'] = athletes_countries['NOC'].map(noc_mapping).fillna(athletes_countries['NOC'])

additional_countries = []
for _, row in athletes_countries.iterrows():
    if row['NOC_mapped'] not in all_nocs and row['NOC'] not in all_nocs:
        if row['last_year'] >= 1960:
            additional_countries.append(row['NOC_mapped'])

all_nocs.extend(additional_countries)
all_nocs = sorted(list(set(all_nocs)))

# Create complete panel
from itertools import product
complete_panel = pd.DataFrame(list(product(all_nocs, all_years)), columns=['NOC', 'Year'])
full_data = complete_panel.merge(medal_features, on=['NOC', 'Year'], how='left')

# Fill missing values
medal_cols = ['Gold', 'Silver', 'Bronze', 'Total', 'Rank']
full_data[medal_cols] = full_data[medal_cols].fillna(0)

fill_cols = ['first_medal_year', 'years_competing', 'olympics_participated']
full_data[fill_cols] = full_data.groupby('NOC')[fill_cols].ffill()

lag_cols = [col for col in full_data.columns if 'lag' in col or 'MA' in col or 'trend' in col]
full_data[lag_cols] = full_data[lag_cols].fillna(0)
full_data = full_data.fillna(0)

# Fix values for countries that never won medals
never_won_mask = full_data.groupby('NOC')['Total'].transform('sum') == 0
full_data.loc[never_won_mask, 'first_medal_year'] = 0
full_data.loc[never_won_mask, 'years_competing'] = 0

# Select features
feature_cols = [col for col in full_data.columns if col not in 
                ['NOC', 'Year', 'Gold', 'Silver', 'Bronze', 'Total', 'Rank']]

# Create country encoder
from sklearn.preprocessing import LabelEncoder
country_encoder = LabelEncoder()
full_data['NOC_encoded'] = country_encoder.fit_transform(full_data['NOC'])
feature_cols.append('NOC_encoded')

# Split data - using 2024 as test set
test_year = 2024
train_data = full_data[full_data['Year'] < 2024]
test_data = full_data[full_data['Year'] == 2024]

X_train = train_data[feature_cols]
X_test = test_data[feature_cols]
y_train_total = train_data['Total']
y_test_total = test_data['Total']
y_train_gold = train_data['Gold']
y_test_gold = test_data['Gold']

print(f"\nTraining data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# Define Random Forest configurations with even stronger regularization
rf_configs = {
    'RF Default': RandomForestRegressor(
        n_estimators=30,  # Fewer trees
        max_depth=3,  # Very shallow
        min_samples_split=50,  # Need many samples to split
        min_samples_leaf=25,  # Large minimum leaf size
        max_features=0.3,  # Use only 30% of features
        random_state=42
    ),
    'RF Deep': RandomForestRegressor(
        n_estimators=50,  # Still limited trees
        max_depth=4,  # Slightly deeper
        min_samples_split=40,
        min_samples_leaf=20,
        max_features=0.4,  # Use 40% of features
        random_state=42
    ),
    'RF Shallow': RandomForestRegressor(
        n_estimators=20,  # Very few trees
        max_depth=2,  # Extremely shallow
        min_samples_split=60,  # Very high split requirement
        min_samples_leaf=30,  # Very large leaves
        max_features=0.2,  # Use only 20% of features
        random_state=42
    )
}

# Function to evaluate model
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    return mae, rmse, r2

# Train and evaluate models for Total Medals
print("\n" + "="*70)
print("Training Random Forest models for Total Medals:")
print("-"*70)

total_results = {}
for name, model in rf_configs.items():
    model.fit(X_train, y_train_total)
    mae, rmse, r2 = evaluate_model(model, X_test, y_test_total)
    total_results[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}
    print(f"{name:<20} MAE: {mae:>6.2f}  RMSE: {rmse:>6.2f}  R²: {r2:>6.3f}")

# Train and evaluate models for Gold Medals
print("\n" + "="*70)
print("Training Random Forest models for Gold Medals:")
print("-"*70)

gold_results = {}
for name, model in rf_configs.items():
    model.fit(X_train, y_train_gold)
    mae, rmse, r2 = evaluate_model(model, X_test, y_test_gold)
    gold_results[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}
    print(f"{name:<20} MAE: {mae:>6.2f}  RMSE: {rmse:>6.2f}  R²: {r2:>6.3f}")

Data loaded successfully!
Athletes shape: (252565, 9)
Medal counts shape: (1435, 7)
Programs shape: (74, 35)

Training data shape: (5424, 30)
Test data shape: (339, 30)

Training Random Forest models for Total Medals:
----------------------------------------------------------------------
RF Default           MAE:   0.83  RMSE:   3.34  R²:  0.913
RF Deep              MAE:   0.46  RMSE:   2.25  R²:  0.961
RF Shallow           MAE:   1.49  RMSE:   3.92  R²:  0.881

Training Random Forest models for Gold Medals:
----------------------------------------------------------------------
RF Default           MAE:   0.42  RMSE:   1.39  R²:  0.876
RF Deep              MAE:   0.36  RMSE:   1.33  R²:  0.886
RF Shallow           MAE:   0.58  RMSE:   1.58  R²:  0.840
