In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_excel("olympic_medals.xlsx", index_col=0)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21697 entries, 0 to 21696
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   discipline_title       21697 non-null  object
 1   slug_game              21697 non-null  object
 2   event_title            21697 non-null  object
 3   event_gender           21697 non-null  object
 4   medal_type             21697 non-null  object
 5   participant_type       21697 non-null  object
 6   participant_title      6584 non-null   object
 7   athlete_url            17027 non-null  object
 8   athlete_full_name      18073 non-null  object
 9   country_name           21697 non-null  object
 10  country_code           20195 non-null  object
 11  country_3_letter_code  21697 non-null  object
dtypes: object(12)
memory usage: 2.2+ MB


In [5]:
# Valeurs nulles
df.isnull().sum()


discipline_title             0
slug_game                    0
event_title                  0
event_gender                 0
medal_type                   0
participant_type             0
participant_title        15113
athlete_url               4670
athlete_full_name         3624
country_name                 0
country_code              1502
country_3_letter_code        0
dtype: int64

In [6]:
# remove useless columns
df = df.drop(columns=['participant_title', 'athlete_url', 'athlete_full_name', 'country_code', 'country_name'])
df.columns

Index(['discipline_title', 'slug_game', 'event_title', 'event_gender',
       'medal_type', 'participant_type', 'country_3_letter_code'],
      dtype='object')

In [7]:
# Visualiser les doublons
duplicates = df.duplicated()
df[duplicates]
# Il n'y a pas de doublons

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,participant_type,country_3_letter_code
1,Curling,beijing-2022,Mixed Doubles,Mixed,GOLD,GameTeam,ITA
3,Curling,beijing-2022,Mixed Doubles,Mixed,SILVER,GameTeam,NOR
5,Curling,beijing-2022,Mixed Doubles,Mixed,BRONZE,GameTeam,SWE
82,Snowboard,beijing-2022,Mixed Team Snowboard Cross,Mixed,GOLD,GameTeam,USA
84,Snowboard,beijing-2022,Mixed Team Snowboard Cross,Mixed,SILVER,GameTeam,ITA
...,...,...,...,...,...,...,...
21602,Athletics,athens-1896,high jump men,Men,SILVER,Athlete,USA
21633,Gymnastics Artistic,athens-1896,team parallel bars men,Men,BRONZE,GameTeam,GRE
21683,Tennis,athens-1896,doubles men,Men,GOLD,GameTeam,MIX
21685,Tennis,athens-1896,doubles men,Men,SILVER,GameTeam,GRE


In [8]:
# Enlever les éditions d'hiver
df['slug_game'].unique()

winter_games = [
    'beijing-2022', 'pyeongchang-2018', 'sochi-2014', 'vancouver-2010', 
    'turin-2006', 'salt-lake-city-2002', 'nagano-1998', 'lillehammer-1994', 
    'albertville-1992', 'calgary-1988', 'sarajevo-1984', 'lake-placid-1980', 
    'innsbruck-1976', 'sapporo-1972', 'grenoble-1968', 'innsbruck-1964', 
    'squaw-valley-1960', 'cortina-d-ampezzo-1956', 'oslo-1952', 
    'st-moritz-1948', 'garmisch-partenkirchen-1936', 'lake-placid-1932', 
    'st-moritz-1928', 'chamonix-1924'
]

# Les éditions qui ne sont pas dans les jeux d'hiver
summer_games = df[~df['slug_game'].isin(winter_games)]
summer_games

Unnamed: 0,discipline_title,slug_game,event_title,event_gender,medal_type,participant_type,country_3_letter_code
355,Shooting,tokyo-2020,Trap Mixed Team,Mixed,GOLD,GameTeam,ESP
356,Shooting,tokyo-2020,Trap Mixed Team,Mixed,GOLD,GameTeam,ESP
357,Shooting,tokyo-2020,Trap Mixed Team,Mixed,SILVER,GameTeam,SMR
358,Shooting,tokyo-2020,Trap Mixed Team,Mixed,SILVER,GameTeam,SMR
359,Shooting,tokyo-2020,Trap Mixed Team,Mixed,BRONZE,GameTeam,USA
...,...,...,...,...,...,...,...
21692,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,SILVER,Athlete,DEN
21693,Weightlifting,athens-1896,heavyweight - one hand lift men,Men,BRONZE,Athlete,GRE
21694,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,GOLD,Athlete,DEN
21695,Weightlifting,athens-1896,heavyweight - two hand lift men,Men,SILVER,Athlete,GBR


In [9]:
# Filter les données à partir de 2000

# Création d'une colonne year en se basant sur la colonne slug_game
df['year'] = df['slug_game'].str.extract(r'(\d{4})').astype(int)
del df['slug_game']

In [10]:
df = df[df['year'] >= 2000]
df

Unnamed: 0,discipline_title,event_title,event_gender,medal_type,participant_type,country_3_letter_code,year
0,Curling,Mixed Doubles,Mixed,GOLD,GameTeam,ITA,2022
1,Curling,Mixed Doubles,Mixed,GOLD,GameTeam,ITA,2022
2,Curling,Mixed Doubles,Mixed,SILVER,GameTeam,NOR,2022
3,Curling,Mixed Doubles,Mixed,SILVER,GameTeam,NOR,2022
4,Curling,Mixed Doubles,Mixed,BRONZE,GameTeam,SWE,2022
...,...,...,...,...,...,...,...
8183,Taekwondo,57 - 67 kg women,Women,SILVER,Athlete,NOR,2000
8184,Taekwondo,57 - 67 kg women,Women,BRONZE,Athlete,JPN,2000
8185,Taekwondo,49 kg women,Women,GOLD,Athlete,AUS,2000
8186,Taekwondo,49 kg women,Women,SILVER,Athlete,CUB,2000


In [11]:
# Supprimer les disciplines qui ne sont pas dans la prochaine édition


# Trouver les disciplines unique tous JO confondus
disciplines = df['discipline_title'].unique()
disciplines

# Seules ces discplines sont présentes dans l'édition 2024
disciplines_2024 = ['Figure skating',
       'Shooting', 'Diving', 'Canoe Sprint', 'Cycling Road', 'Football', 'Boxing',
       'Artistic Swimming', 'Handball', 'Rugby Sevens',
       'Cycling BMX Racing', 'Triathlon', 'Surfing', 'Table Tennis',
       'Canoe Slalom', 'Marathon Swimming', 'Trampoline Gymnastics',
       'Volleyball', 'Basketball', 'Taekwondo', 'Cycling Track',
       'Fencing', 'Badminton', 'Water Polo', 'Sport Climbing',
       'Wrestling', 'Tennis', 'Artistic Gymnastics', 'Golf',
       'Cycling BMX Freestyle', 'Judo', 'Skateboarding', 'Archery',
       'Weightlifting',
       'Modern Pentathlon', 'Athletics', 'Swimming', 'Sailing',
       'Cycling Mountain Bike', 'Rowing', '3x3 Basketball',
       'Rhythmic Gymnastics', 'Hockey', 'Beach Volleyball',
       'Cycling BMX', 'Rugby', 'Gymnastics Rhythmic', 'Gymnastics Artistic',
       'Synchronized Swimming', 'Trampoline'
    ]

# Ne garder que les données en rapport avec ces disciplines
df = df[df['discipline_title'].isin(disciplines_2024)]
df

df['discipline_title'].unique()


array(['Figure skating', 'Shooting', 'Diving', 'Canoe Sprint',
       'Cycling Road', 'Football', 'Boxing', 'Artistic Swimming',
       'Handball', 'Rugby Sevens', 'Cycling BMX Racing', 'Triathlon',
       'Surfing', 'Table Tennis', 'Canoe Slalom', 'Marathon Swimming',
       'Trampoline Gymnastics', 'Volleyball', 'Basketball', 'Taekwondo',
       'Cycling Track', 'Fencing', 'Badminton', 'Water Polo',
       'Sport Climbing', 'Wrestling', 'Tennis', 'Artistic Gymnastics',
       'Golf', 'Cycling BMX Freestyle', 'Judo', 'Skateboarding',
       'Archery', 'Weightlifting', 'Modern Pentathlon', 'Athletics',
       'Swimming', 'Sailing', 'Cycling Mountain Bike', 'Rowing',
       '3x3 Basketball', 'Rhythmic Gymnastics', 'Hockey',
       'Beach Volleyball', 'Cycling BMX', 'Rugby', 'Gymnastics Rhythmic',
       'Gymnastics Artistic', 'Synchronized Swimming', 'Trampoline'],
      dtype=object)

In [12]:
# Rassembler Gymnastics Rhythmic et Rhythmic Gymnastics dans une même colonne
df.loc[:,'discipline_title'] = df['discipline_title'].replace('Gymnastics Rhythmic', 'Rhythmic Gymnastics')

# Rassembler Artistic Gymnastics et Gymnastics Artistic dans une même colonne
df.loc[:,'discipline_title'] = df['discipline_title'].replace('Artistic Gymnastics', 'Gymnastics Artistic')

In [13]:
df[df['discipline_title'] == 'Artistic Gymnastics'] # 0 left
df[df['discipline_title'] == 'Gymnastics Artistic']

df[df['discipline_title'] == 'Gymnastics Rhythmic'] # 0 left
df[df['discipline_title'] == 'Rhythmic Gymnastics']


Unnamed: 0,discipline_title,event_title,event_gender,medal_type,participant_type,country_3_letter_code,year
1519,Rhythmic Gymnastics,Individual All-Around,Women,SILVER,Athlete,ROC,2020
1520,Rhythmic Gymnastics,Individual All-Around,Women,BRONZE,Athlete,BLR,2020
1521,Rhythmic Gymnastics,Individual All-Around,Women,GOLD,Athlete,ISR,2020
1522,Rhythmic Gymnastics,Group All-Around,Women,GOLD,GameTeam,BUL,2020
1523,Rhythmic Gymnastics,Group All-Around,Women,SILVER,GameTeam,ROC,2020
1524,Rhythmic Gymnastics,Group All-Around,Women,BRONZE,GameTeam,ITA,2020
1898,Rhythmic Gymnastics,Group All-Around women,Women,GOLD,GameTeam,RUS,2016
1899,Rhythmic Gymnastics,Group All-Around women,Women,SILVER,GameTeam,ESP,2016
1900,Rhythmic Gymnastics,Group All-Around women,Women,BRONZE,GameTeam,BUL,2016
1901,Rhythmic Gymnastics,Individual All-Around women,Women,GOLD,Athlete,RUS,2016


In [14]:
df.isnull().sum()

discipline_title         0
event_title              0
event_gender             0
medal_type               0
participant_type         0
country_3_letter_code    0
year                     0
dtype: int64

In [18]:
df['participant_type'].unique()
# df['event_title'].unique()

array(['GameTeam', 'Athlete'], dtype=object)

In [16]:
# Remplacer les valeurs qualitative par des valeurs numériques

# columns_to_encode = ['discipline_title', 'event_title', 'event_gender', 'medal_type', 'participant_type']


# df.loc[:,'discipline_title'] = pd.factorize(df['discipline_title'])[0]
# df.loc[:,'event_title'] = pd.factorize(df['event_title'])[0]
# df.loc[:,'event_gender'] = pd.factorize(df['event_gender'])[0]

# # df.loc[:,'medal_type'] = pd.factorize(df['medal_type'])[0]
# mapping = {'BRONZE': 0, 'SILVER': 1, 'GOLD': 2}
# df.loc[:,'medal_type'] = df['medal_type'].map(mapping)


# df.loc[:,'participant_type'] = pd.factorize(df['participant_type'])[0]

df.rename(columns={'country_3_letter_code': 'country_code'}, inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'country_3_letter_code': 'country_code'}, inplace=True)


Unnamed: 0,discipline_title,event_title,event_gender,medal_type,participant_type,country_code,year
129,Figure skating,Team Event,Mixed,GOLD,GameTeam,ROC,2022
130,Figure skating,Team Event,Mixed,GOLD,GameTeam,ROC,2022
131,Figure skating,Team Event,Mixed,SILVER,GameTeam,USA,2022
132,Figure skating,Team Event,Mixed,BRONZE,GameTeam,JPN,2022
133,Figure skating,Pair Skating,Mixed,GOLD,GameTeam,CHN,2022
...,...,...,...,...,...,...,...
8183,Taekwondo,57 - 67 kg women,Women,SILVER,Athlete,NOR,2000
8184,Taekwondo,57 - 67 kg women,Women,BRONZE,Athlete,JPN,2000
8185,Taekwondo,49 kg women,Women,GOLD,Athlete,AUS,2000
8186,Taekwondo,49 kg women,Women,SILVER,Athlete,CUB,2000


In [17]:
# Enregistrer le dataframe dans un nouveau fichier csv
df.to_csv('filtered_data.csv', index=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6341 entries, 129 to 8187
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   discipline_title  6341 non-null   object
 1   event_title       6341 non-null   object
 2   event_gender      6341 non-null   object
 3   medal_type        6341 non-null   object
 4   participant_type  6341 non-null   object
 5   country_code      6341 non-null   object
 6   year              6341 non-null   int32 
dtypes: int32(1), object(6)
memory usage: 371.5+ KB
