In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns  # optional, if you like

# Read dataset
df = pd.read_csv("./cleaned_concatenated_with_gold.csv")

# Make a safe copy of the original data if needed
df_original = df.copy()

# If your CSV has an 'index' column that’s just repeated indices, drop it
# (If 'index' is not an irrelevant column, remove the drop line)
df.drop(columns=['index'], errors='ignore', inplace=True)

# 2.1: Winning Ratio
df['Winning_Ratio'] = df.apply(
    lambda row: row['Gold'] / row['Total_Athletes'] 
                if row['Total_Athletes'] > 0 else 0,
    axis=1
)

# 2.2: Keep a simple mapping of [NOC, Year, Total_Medal_Count] for merges/visuals
mapping_df = df[['NOC', 'Year', 'Gold']].copy()

# 2.3: Sort data by NOC, then by Year for lag-based features
df.sort_values(by=['NOC', 'Year'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Create lag features
lags = [1, 2, 3]
for lag in lags:
    df[f'Medals_lag_{lag}'] = df.groupby('NOC')['Gold'].shift(lag)

# Rolling mean of the last 3 observations
df['Rolling_Mean_3'] = df.groupby('NOC')['Gold'].transform(lambda x: x.rolling(3).mean())

# Drop any rows that now contain NaN (due to shift/rolling)
df.dropna(inplace=True)


split_year = 1996
df_train = df[df['Year'] <= split_year].copy()
df_test  = df[df['Year'] > split_year].copy()

mapping_df_train = mapping_df[mapping_df['Year'] <= split_year].copy()
mapping_df_test  = mapping_df[mapping_df['Year'] > split_year].copy()


# Choose relevant features for modeling
features = [
    'Total_Athletes',
    'host_status',
    'Rolling_Mean_3',
    'Winning_Ratio',
    'Medals_lag_1'
]

X_train = df_train[features]
y_train = df_train['Gold']

X_test = df_test[features]
y_test = df_test['Gold']

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape:", y_test.shape)

model = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate on test set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2  = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R²: {r2}")


X_train shape: (4460, 5)
y_train shape: (4460,)
X_test shape:  (1561, 5)
y_test shape: (1561,)
MAE: 0.1877362633885649
MSE: 0.9439189981913092
R²: 0.9534388296582315


In [2]:
import pandas as pd

# Let's say you have a list of NOCs
all_nocs = df['NOC'].unique()

future_data = []

for noc in all_nocs:
    # Build a single row (dict) of features for this NOC in 2028
    row_2028 = {}

    row_2028['NOC'] = noc

    # 1. Estimate or set total athletes. You could base it on the 2024 value or your own logic.
    latest_2024_data = df[(df['NOC'] == noc) & (df['Year'] == 2024)]
    if not latest_2024_data.empty:
        row_2028['Total_Athletes'] = latest_2024_data['Total_Athletes'].values[0]
    else:
        row_2028['Total_Athletes'] = 0  # or some default / estimate

    # 2. host_status for 2028: if the host is the USA and the NOC is 'USA', set 1, else 0
    row_2028['host_status'] = 1 if noc == 'USA' else 0

    # 3. Rolling_Mean_3: we need the last 3 actual medal counts from df. 
    #    If you have real data for 2024, 2020, 2016, you can compute the average:
    last_three_counts = df.loc[(df['NOC'] == noc) & (df['Year'].isin([2024, 2020, 2016])), 'Gold']
    if len(last_three_counts) == 3:
        row_2028['Rolling_Mean_3'] = last_three_counts.mean()
    else:
        row_2028['Rolling_Mean_3'] = 0  # or some fallback

    # 4. Winning_Ratio for 2028 is tricky if it's derived from the 2028 medal count.
    #    If your model uses same-year ratio, that’s a direct leak. 
    #    But if you want to fill something anyway:
    row_2028['Winning_Ratio'] = 0  # or use the 2024 ratio as an approximation
    if not latest_2024_data.empty:
        row_2028['Winning_Ratio'] = latest_2024_data['Winning_Ratio'].values[0]  

    # 5. Medals_lag_2: if 'lag=2' refers to the count from 2020 or 2016
    #    You can check how you created the lag in your original dataset:
    row_2028['Medals_lag_1'] = 0
    data_2_lags_ago = df.loc[(df['NOC'] == noc) & (df['Year'] == 2024), 'Gold']
    if not data_2_lags_ago.empty:
        row_2028['Medals_lag_1'] = data_2_lags_ago.values[0]

    future_data.append(row_2028)

# Create a new dataframe for 2028 predictions
df_2028 = pd.DataFrame(future_data)


In [3]:
features = [
    'Total_Athletes',
    'host_status',
    'Rolling_Mean_3',
    'Winning_Ratio',
    'Medals_lag_1'
]

X_2028 = df_2028[features]
y_pred_2028 = model.predict(X_2028)


In [4]:
df_2028_results = df_2028.copy()
df_2028_results['Predicted_Gold_Medals_2028'] = y_pred_2028

print(df_2028_results.head())

              NOC  Total_Athletes  host_status  Rolling_Mean_3  Winning_Ratio  \
0     Afghanistan               6            0        0.000000       0.000000   
1         Albania               8            0        0.000000       0.000000   
2         Algeria              46            0        0.666667       0.043478   
3  American Samoa               2            0        0.000000       0.000000   
4         Andorra               2            0        0.000000       0.000000   

   Medals_lag_1  Predicted_Gold_Medals_2028  
0             0                         0.0  
1             0                         0.0  
2             2                         1.5  
3             0                         0.0  
4             0                         0.0  


In [9]:
df_2028_results = df_2028_results.sort_values(by='Predicted_Gold_Medals_2028', ascending=False)
print(df_2028_results.head(10))

                NOC  Total_Athletes  host_status  Rolling_Mean_3  \
211   United States             619            0       41.666667   
41            China             396            0       34.666667   
98            Japan             430            0       19.666667   
10        Australia             474            0       14.333333   
69           France             601            0       12.000000   
73          Germany             457            0       13.000000   
136     Netherlands             288            0       11.000000   
210  United Kingdom               0            0       21.000000   
95            Italy             397            0       10.000000   
137     New Zealand             208            0        7.000000   

     Winning_Ratio  Medals_lag_1  Predicted_Gold_Medals_2028  
211       0.064620            40                       38.81  
41        0.101010            40                       36.45  
98        0.046512            20                       24.02  

In [11]:
# Assuming your DataFrame is called 'df' and is already sorted
df_top50 = df_2028_results.head(75)
df_top50.to_csv('top75_Gold_predictions.csv', index=False)

In [6]:
df_2028_results.to_csv("prediction_Gold_2028.csv",index=False)

In [7]:
import pandas as pd

# Read the CSV with total medals predictions
df_medals = pd.read_csv("prediction_2028_2.csv")

# Read the CSV with gold medal predictions
df_gold = pd.read_csv("prediction_Gold_2028.csv")


In [8]:
# If you only need the 'NOC' and 'Predicted_Gold_Medals_2028' columns from df_gold:
df_merged = pd.merge(
    df_medals, 
    df_gold[['NOC', 'Predicted_Gold_Medals_2028']], 
    on='NOC', 
    how='inner'
)
df_merged

Unnamed: 0,NOC,Total_Athletes,host_status,Rolling_Mean_3,Winning_Ratio,Medals_lag_1,Predicted_Medals_2028,Predicted_Gold_Medals_2028
0,United States,619,0,120.000000,0.203554,126,90.73,38.81
1,China,396,0,83.333333,0.229798,91,86.55,36.45
2,France,601,0,46.333333,0.106489,64,68.37,15.79
3,Japan,430,0,48.000000,0.104651,45,66.16,24.02
4,Australia,474,0,42.666667,0.111814,53,64.96,21.15
...,...,...,...,...,...,...,...,...
218,Latvia,29,0,0.666667,0.000000,0,0.00,0.00
219,Lebanon,9,0,0.000000,0.000000,0,0.00,0.00
220,Lesotho,3,0,0.000000,0.000000,0,0.00,0.00
221,Liberia,8,0,0.000000,0.000000,0,0.00,0.00
