In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from wordcloud import WordCloud

from sklearn.preprocessing import LabelEncoder
import pickle

In [34]:
# Relative paths of the cleaned Summer Olympics input tables (CSV).
# Country-level medal table:
url_medals = './data/cleaned_summerOly_medal_count.csv'
# Sport-level tables (athletes per sport, and the sports programme):
url_atheletes = './data/cleaned_summerOly_atheletes.csv'
url_programs = './data/cleaned_summerOly_programs.csv'

In [35]:
# Read the athlete table and the sports-programme table from disk.
df_atheletes = pd.read_csv(url_atheletes)
df_programs = pd.read_csv(url_programs)

# Left join on (Year, Sport): every athlete row is kept, and rows with no
# matching programme entry come back with NaN in the programme columns.
merged_df = df_atheletes.merge(df_programs, how='left', on=['Year', 'Sport'])

# Replace every NaN in the frame with 0, which in turn allows num_games
# to be stored as an integer column.
merged_df = merged_df.fillna(0)
merged_df['num_games'] = merged_df['num_games'].astype(int)

merged_df

Unnamed: 0,NOC,Year,Sport,num_athletes,Medal_Gold,Medal_Silver,Medal_Bronze,num_female_athletes,num_male_athletes,num_games
0,Denmark,1896,Fencing,1,0,0,1,0,1,3
1,Greece,1896,Weightlifting,3,0,0,2,0,3,2
2,Greece,1896,Tennis,7,0,3,1,0,7,2
3,Greece,1896,Swimming,9,1,3,2,0,9,0
4,Greece,1896,Shooting,27,3,3,3,0,27,5
...,...,...,...,...,...,...,...,...,...,...
25374,Hungary,2024,Tennis,2,0,0,0,0,2,5
25375,Hungary,2024,Triathlon,3,0,0,0,1,2,3
25376,Cameroon,2024,Table Tennis,1,0,0,0,1,0,5
25377,Hungary,2024,Sailing,2,0,0,0,1,1,10


In [36]:
# Now merge with medals
df_medals = pd.read_csv(url_medals)

# Left join on (NOC, Year): each sport-level row receives the medal-table
# columns (Rank, Gold, Silver, Bronze, Total) of its country for that year.
medals_merged = pd.merge(merged_df, df_medals, on=['NOC', 'Year'], how='left')

# Replace NaN with 0 so that the medal columns can be cast to integers.
# NOTE(review): rows with no medal-table match therefore end up with Rank == 0
# (e.g. Cameroon 2024 in the output below); that 0 is a "no entry" placeholder,
# not a real rank - confirm that downstream code treats it accordingly.
medals_merged = medals_merged.fillna(0)

# Cast all medal-table columns in a single call instead of one statement each.
medals_merged = medals_merged.astype(
    {'Rank': int, 'Gold': int, 'Silver': int, 'Bronze': int, 'Total': int}
)

medals_merged

Unnamed: 0,NOC,Year,Sport,num_athletes,Medal_Gold,Medal_Silver,Medal_Bronze,num_female_athletes,num_male_athletes,num_games,Rank,Gold,Silver,Bronze,Total
0,Denmark,1896,Fencing,1,0,0,1,0,1,3,9,1,2,3,6
1,Greece,1896,Weightlifting,3,0,0,2,0,3,2,2,10,18,19,47
2,Greece,1896,Tennis,7,0,3,1,0,7,2,2,10,18,19,47
3,Greece,1896,Swimming,9,1,3,2,0,9,0,2,10,18,19,47
4,Greece,1896,Shooting,27,3,3,3,0,27,5,2,10,18,19,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25374,Hungary,2024,Tennis,2,0,0,0,0,2,5,14,6,7,6,19
25375,Hungary,2024,Triathlon,3,0,0,0,1,2,3,14,6,7,6,19
25376,Cameroon,2024,Table Tennis,1,0,0,0,1,0,5,0,0,0,0,0
25377,Hungary,2024,Sailing,2,0,0,0,1,1,10,14,6,7,6,19


In [38]:
# Now combine with hosts
url_hosts = './data/cleaned_concatenated_with_host_and_future_status.csv'

# Load the host table and discard the two columns that are not carried into
# the final dataset.
df_hosts = pd.read_csv(url_hosts).drop(columns=['index', 'Total_Medal_Count'])

# Left join on (NOC, Year) so every sport-level row is kept.
hosts_merged = pd.merge(medals_merged, df_hosts, on=['NOC', 'Year'], how='left')

# A left join can only add rows when a (NOC, Year) key matches more than one
# host row; fail loudly instead of silently writing duplicated rows to disk.
assert len(hosts_merged) == len(medals_merged), (
    'host merge changed the row count; check df_hosts for duplicate (NOC, Year) keys'
)

hosts_merged = (
    hosts_merged
    # Replace NaN with 0 so the host columns can be cast to integers.
    .fillna(0)
    .astype({'Total_Athletes': int, 'host_status': int, 'future_host_status': int})
    # Rename so each column says whether it is a per-sport or a per-country/year value.
    # NOTE: 'total_athelete_country_year' keeps its original spelling because it is
    # the column name written to final_data.csv.
    .rename(columns={
        'num_athletes': 'sport_num_athletes',
        'Medal_Gold': 'sport_medal_gold',
        'Medal_Silver': 'sport_medal_silver',
        'Medal_Bronze': 'sport_medal_bronze',
        'num_female_athletes': 'sport_num_female_athletes',
        'num_male_athletes': 'sport_num_male_athletes',
        'num_games': 'num_games_sports_each_year',
        'Rank': 'rank_country_year',
        'Gold': 'total_gold_country_year',
        'Silver': 'total_silver_country_year',
        'Bronze': 'total_bronze_country_year',
        'Total': 'total_medal_country_year',
        'Total_Athletes': 'total_athelete_country_year',
    })
)

# Persist the final dataset. The earlier argument-less to_csv() call was removed:
# without a path it only builds the whole CSV as a string, which was discarded.
hosts_merged.to_csv('./data/final_data.csv', index=False)

In [39]:
# Display the final merged frame; the rendered output below is truncated to
# the first and last rows.
hosts_merged

Unnamed: 0,NOC,Year,Sport,sport_num_athletes,sport_medal_gold,sport_medal_silver,sport_medal_bronze,sport_num_female_athletes,sport_num_male_athletes,num_games_sports_each_year,rank_country_year,total_gold_country_year,total_silver_country_year,total_bronze_country_year,total_medal_country_year,total_athelete_country_year,host_status,future_host_status
0,Denmark,1896,Fencing,1,0,0,1,0,1,3,9,1,2,3,6,3,0,0
1,Greece,1896,Weightlifting,3,0,0,2,0,3,2,2,10,18,19,47,102,1,0
2,Greece,1896,Tennis,7,0,3,1,0,7,2,2,10,18,19,47,102,1,0
3,Greece,1896,Swimming,9,1,3,2,0,9,0,2,10,18,19,47,102,1,0
4,Greece,1896,Shooting,27,3,3,3,0,27,5,2,10,18,19,47,102,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25374,Hungary,2024,Tennis,2,0,0,0,0,2,5,14,6,7,6,19,177,0,0
25375,Hungary,2024,Triathlon,3,0,0,0,1,2,3,14,6,7,6,19,177,0,0
25376,Cameroon,2024,Table Tennis,1,0,0,0,1,0,5,0,0,0,0,0,6,0,0
25377,Hungary,2024,Sailing,2,0,0,0,1,1,10,14,6,7,6,19,177,0,0
