In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from prophet import Prophet
from pmdarima import auto_arima
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [2]:
# Read the medal table (one row per medal awarded) using ISO-8859-1 decoding.
df = pd.read_csv(
    "../data/Summer-Olympic-medals-1976-to-2008.csv",
    encoding='ISO-8859-1',
)

# Restrict to gold medals, then pivot to a Country x Year table of counts;
# country/year combinations without any gold medal are filled with 0.
gold_medals = df.loc[df['Medal'].eq('Gold')]
gold_medal_counts = (
    gold_medals
    .groupby(['Country', 'Year'])
    .size()
    .unstack(fill_value=0)
)

FileNotFoundError: [Errno 2] No such file or directory: '../data/Summer-Olympic-medals-1976-to-2008.csv'

In [None]:
# Build the per-Games medal series for one country in the two-column layout
# used by the models below: `ds` is the timestamp of the Games and `y` is the
# number of medal rows recorded for that country in that year.
country = "United States"
country_data = (
    df[df['Country'] == country]
    .groupby('Year')
    .size()
    .reset_index(name='y')
    .rename(columns={'Year': 'ds'})  # rename by name rather than by position
)

# Normalise the year to a plain integer string before parsing. If pandas read
# `Year` as float (presumably the case when the CSV contains blank rows --
# TODO confirm), the values stringify as "1976.0", which does not match `%Y`.
country_data['ds'] = pd.to_datetime(
    country_data['ds'].astype(int).astype(str), format='%Y'
)

In [None]:
# Chronological hold-out: Games up to and including `cutoff_year` are used for
# fitting, every later Games is kept aside for evaluation.
cutoff_year = 1996
games_year = country_data['ds'].dt.year
train_data = country_data.loc[games_year.le(cutoff_year)]
test_data = country_data.loc[games_year.gt(cutoff_year)]

In [None]:
# Fit Prophet on the training Games only.
# Yearly seasonality is switched off on purpose: the series has a single
# observation per Games, all stamped on the same day of the year, so a
# within-year seasonal curve cannot be estimated from it and would only add
# unconstrained terms to the forecast.
model = Prophet(yearly_seasonality=False)
model.fit(train_data)

In [None]:
# Number of future Games to cover; the Prophet horizon below is expressed in
# calendar years, hence the factor of 4.
future_years = 5

# Use year-START frequency ('YS'). The `ds` values of the observed data were
# parsed with format '%Y' and therefore fall on 1 January, whereas 'Y' would
# generate 31 December timestamps that never match `test_data['ds']` in the
# merge below (leaving every `yhat` as NaN).
future = model.make_future_dataframe(periods=future_years * 4, freq='YS')
forecast = model.predict(future)

# Attach the Prophet prediction to each held-out Games (left join keeps every
# test row; `yhat` is NaN where the forecast has no matching date).
comparison = pd.merge(test_data, forecast[['ds', 'yhat']], how='left', on='ds')

In [None]:
# Keep only the held-out Games that have both an observed value and a Prophet
# prediction. The two frames are aligned on the date column rather than by
# truncating each to a common length: positional truncation pairs the wrong
# Games whenever a row is dropped from one frame but not from the other.
comparison = comparison.dropna(subset=['y', 'yhat'])
test_data = test_data.dropna(subset=['y'])
test_data = test_data[test_data['ds'].isin(comparison['ds'])]
min_length = min(len(test_data), len(comparison))


In [None]:
# Fit a non-seasonal ARIMA (order chosen by auto_arima) on the training counts
# and forecast `future_years` steps ahead. Each step is one observation of the
# series, i.e. one Games, not one calendar year.
arima_model = auto_arima(train_data['y'], seasonal=False, trace=True)
arima_forecast = arima_model.predict(n_periods=future_years)

# Date the forecasts one Games apart, starting one cycle AFTER the last
# training Games. The 4-year spacing matches the `future_years * 4` horizon
# used for Prophet; starting at the last training date with a yearly step
# would place the first forecast in the final training year.
games_interval = pd.DateOffset(years=4)
arima_future_dates = pd.date_range(
    start=train_data['ds'].max() + games_interval,
    periods=future_years,
    freq=games_interval,
)
arima_forecast_df = pd.DataFrame({'ds': arima_future_dates, 'yhat': arima_forecast})

In [None]:
# Length of the look-back window fed to the LSTM.
time_step = 4

# Scale the training counts to [0, 1]; the scaler is fitted on training data
# only and reused later for the test data.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_train_data = scaler.fit_transform(train_data[['y']])

# Sliding windows: each sample is `time_step` consecutive scaled values and
# its target is the value that immediately follows the window.
train_positions = range(time_step, len(scaled_train_data))
X_train = np.array(
    [scaled_train_data[pos - time_step:pos, 0] for pos in train_positions]
)
y_train = np.array([scaled_train_data[pos, 0] for pos in train_positions])

# Add the trailing feature axis: (samples, time_step) -> (samples, time_step, 1).
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))

In [None]:
# Two stacked LSTM layers with dropout, followed by a single-unit regression
# head that predicts the next scaled value of the series.
lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(units=50, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(units=1))
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the network. Without this step the later `predict` call would run on
# randomly initialised weights. The batch size is 1 because the windowed
# training set built above is very small; output is silenced to keep the
# notebook readable.
lstm_epochs = 100
lstm_batch_size = 1
lstm_model.fit(X_train, y_train, epochs=lstm_epochs, batch_size=lstm_batch_size, verbose=0)

In [None]:
# One-step-ahead LSTM predictions for every held-out Games.
# The look-back window of the first test point necessarily reaches into the
# training period, so the last `time_step` training observations are prepended
# as context. Building windows from the test data alone yields no window at
# all when the test set has `time_step` rows or fewer, and otherwise produces
# predictions that start `time_step` rows into the test set.
scaled_test_data = scaler.transform(test_data[['y']])
if len(scaled_train_data) < time_step:
    raise ValueError(
        f"Need at least {time_step} training observations to seed the LSTM window, "
        f"got {len(scaled_train_data)}"
    )
lstm_context = np.concatenate([scaled_train_data[-time_step:], scaled_test_data])

X_test, y_test = [], []
for i in range(time_step, len(lstm_context)):
    X_test.append(lstm_context[i-time_step:i, 0])
    y_test.append(lstm_context[i, 0])
# reshape(-1, ...) also copes with an empty test set (shape (0, time_step, 1)).
X_test = np.array(X_test).reshape((-1, time_step, 1))
y_test = np.array(y_test)

if len(X_test):
    lstm_predictions = lstm_model.predict(X_test)
    # Back to the original medal-count scale; row k corresponds to test row k.
    lstm_predictions = scaler.inverse_transform(lstm_predictions)
else:
    lstm_predictions = np.empty((0, 1))

In [None]:
# Bring the observed test values and all three forecasts to a common set of
# Games before scoring.
# Prophet: align on the date column. Truncating each frame to a common length
# would pair the wrong Games whenever a row was dropped from one frame but not
# from the other.
comparison = comparison.dropna(subset=['y', 'yhat'])
test_data = test_data.dropna(subset=['y'])
test_data = test_data[test_data['ds'].isin(comparison['ds'])]
min_length = min(len(test_data), len(comparison))

# ARIMA and LSTM forecasts are ordered by forecast step, so they are cut to
# the number of Games that remain in the test set.
arima_forecast = arima_forecast[:min_length]
lstm_predictions = lstm_predictions[:min_length]

In [None]:
def _forecast_errors(actual, predicted):
    """Return (MAE, RMSE, MAPE) of `predicted` against `actual`.

    RMSE is the square root of scikit-learn's mean squared error; MAPE is
    returned exactly as scikit-learn computes it.
    """
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mape = mean_absolute_percentage_error(actual, predicted)
    return mae, rmse, mape


# Score each model on the held-out Games.
prophet_mae, prophet_rmse, prophet_mape = _forecast_errors(test_data['y'], comparison['yhat'])
print(f'Prophet MAE: {prophet_mae}, RMSE: {prophet_rmse}, MAPE: {prophet_mape}')

arima_mae, arima_rmse, arima_mape = _forecast_errors(test_data['y'], arima_forecast)
print(f'ARIMA MAE: {arima_mae}, RMSE: {arima_rmse}, MAPE: {arima_mape}')

lstm_mae, lstm_rmse, lstm_mape = _forecast_errors(test_data['y'], lstm_predictions)
print(f'LSTM MAE: {lstm_mae}, RMSE: {lstm_rmse}, MAPE: {lstm_mape}')

In [None]:
# Overlay the observed medal counts with the three model forecasts.
fig, ax = plt.subplots(figsize=(12, 6))

# Plot actual data
ax.plot(country_data['ds'], country_data['y'], label='Actual', marker='o')

# Plot Prophet forecast
ax.plot(forecast['ds'], forecast['yhat'], label='Prophet Forecast', linestyle='--')

# Plot ARIMA forecast
ax.plot(arima_forecast_df['ds'], arima_forecast_df['yhat'], label='ARIMA Forecast', linestyle='--')

# Plot LSTM forecast
# The metrics pair LSTM prediction k with test row k, so the predictions are
# drawn at the dates of those same test rows instead of on a fabricated
# one-per-year date range.
n_lstm = min(len(lstm_predictions), len(test_data))
future_dates = test_data['ds'].iloc[:n_lstm]
ax.plot(future_dates, np.ravel(lstm_predictions)[:n_lstm], label='LSTM Forecast', linestyle='--')

# Highlight the test period
ax.axvspan(test_data['ds'].min(), test_data['ds'].max(), color='gray', alpha=0.3)

ax.set_xlabel('Year')
ax.set_ylabel('Number of Medals')
ax.set_title(f'Olympic Medal Predictions for {country}')
ax.legend()
ax.grid(True)
plt.show()