# Préparation des données pour Projet Final JO - Inégalités H/F

Pour ce projet, je m'occupe de la partie sur l'étude des inégalités Hommes/Femmes aux Jeux Olympiques.
Pour cela, j'utilise trois tables, issues de deux sources :
- Les tables "athlete_events.csv" et "noc_regions.csv" du jeu de données "120 years of Olympic history", disponible sur Kaggle :  
    https://www.kaggle.com/datasets/heesoo37/120-years-of-olympic-history-athletes-and-results?resource=download
- La table "gender-inequality-index-from-the-human-development-report.csv", disponible sur Our World in Data :  
    https://ourworldindata.org/grapher/gender-inequality-index-from-the-human-development-report

### Importation et nettoyage des fichiers

1. Fichier Athlètes JO

In [131]:
# Importation de la bibliothèque Pandas
import pandas as pd

# Load the raw athlete/event results into a dataframe
olympic_games = pd.read_csv("./athlete_events.csv", sep=",")

# Keep only the Summer Games and only the columns needed for the study.
# The selection is done in a single .loc call followed by an explicit
# .copy(), so the result is an independent dataframe: adding the 'ID'
# column below then no longer writes into a slice of the raw data
# (which is what triggers pandas' SettingWithCopyWarning).
study_columns = ['Sex', 'Team', 'NOC', 'Year', 'Sport', 'Medal']
is_summer = olympic_games['Season'] == 'Summer'
olympic_games = olympic_games.loc[is_summer, study_columns].copy()

# Build the join key (NOC code followed by the year) used to link
# these rows to the GII table
olympic_games['ID'] = olympic_games['NOC'] + olympic_games['Year'].astype(str)

olympic_games.head()

Unnamed: 0,Sex,Team,NOC,Year,Sport,Medal,ID
0,M,China,CHN,1992,Basketball,,CHN1992
1,M,China,CHN,2012,Judo,,CHN2012
2,M,Denmark,DEN,1920,Football,,DEN1920
3,M,Denmark/Sweden,DEN,1900,Tug-Of-War,Gold,DEN1900
26,F,Netherlands,NED,1932,Athletics,,NED1932


2. Fichier NOC/Régions

In [132]:
# Load the NOC / region lookup table into a dataframe
noc_regions = pd.read_csv("./noc_regions.csv", sep=",")

# Show the rows where either the NOC code or the Region is missing
missing_noc_region = noc_regions[noc_regions[['NOC', 'Region']].isnull().any(axis=1)]
print(missing_noc_region)

# Fill the missing Region values with the value found in Notes.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form
# is deprecated in recent pandas and does not update the dataframe once
# Copy-on-Write is enabled.
noc_regions['Region'] = noc_regions['Region'].fillna(noc_regions['Notes'])

# The Notes column is no longer needed once Region is complete
noc_regions = noc_regions.drop(columns='Notes')

     NOC Region                 Notes
168  ROT    NaN  Refugee Olympic Team
208  TUV    NaN                Tuvalu
213  UNK    NaN               Unknown


In [133]:
# A quick map view in Tableau shows that Bolivia does not appear,
# so look up the row registered under its NOC code.
is_bolivia = noc_regions['NOC'] == 'BOL'
bolivia_region = noc_regions[is_bolivia]

if not bolivia_region.empty:
    # Display the row as it currently stands, before the correction
    print(bolivia_region)
    # The region name is misspelled "Boliva" instead of "Bolivia"
    noc_regions.loc[is_bolivia, 'Region'] = 'Bolivia'
else:
    print("No region information available for NOC code 'BOL'")

    NOC  Region
27  BOL  Boliva


In [134]:
# Data Wrangler shows more NOC codes than regions: list every region
# that is attached to more than one NOC code.
nocs_by_region = noc_regions.groupby('Region')['NOC'].unique().reset_index()

# Each 'NOC' cell holds the array of distinct codes for that region;
# keep only the regions whose array has more than one entry.
has_several_nocs = nocs_by_region['NOC'].str.len().gt(1)
multiple_noc_regions = nocs_by_region[has_several_nocs]

print(multiple_noc_regions)

             Region                   NOC
10        Australia            [ANZ, AUS]
33           Canada            [CAN, NFL]
39            China            [CHN, HKG]
48   Czech Republic       [BOH, CZE, TCH]
67          Germany  [FRG, GDR, GER, SAA]
69           Greece            [CRT, GRE]
110        Malaysia       [MAL, MAS, NBO]
152          Russia       [EUN, RUS, URS]
162          Serbia       [SCG, SRB, YUG]
180           Syria            [SYR, UAR]
188        Trinidad            [TTO, WIF]
203         Vietnam            [VIE, VNM]
206           Yemen       [YAR, YEM, YMD]
208        Zimbabwe            [RHO, ZIM]


In [140]:
# Save the cleaned lookup table (without the dataframe index) so it can
# be loaded in Tableau
clean_noc_path = 'noc_regions_clean.csv'
noc_regions.to_csv(clean_noc_path, index=False)

# Preview the first rows of the exported table
noc_regions.head()

Unnamed: 0,NOC,Region
0,AFG,Afghanistan
1,AHO,Curacao
2,ALB,Albania
3,ALG,Algeria
4,AND,Andorra


3. Fichier GII index

In [136]:
# Load the Gender Inequality Index file, discard the rows that have no
# country code, and shorten the index column name to 'GII'
gii_country = (
    pd.read_csv("./gender-inequality-index-from-the-human-development-report.csv", sep=",")
    .dropna(subset=['Code'])
    .rename(columns={'Gender Inequality Index': 'GII'})
)

# Build the join key (country code followed by the year), mirroring the
# 'ID' column of the Olympic table.
# NOTE(review): the key uses the 'Code' column of this file; presumably
# these codes do not always match Olympic NOC codes, in which case those
# countries will not join - confirm against the data.
gii_country['ID'] = gii_country['Code'] + gii_country['Year'].astype(str)

gii_country.head()

Unnamed: 0,Entity,Code,Year,GII,ID
0,Afghanistan,AFG,2005,0.748,AFG2005
1,Afghanistan,AFG,2006,0.749,AFG2006
2,Afghanistan,AFG,2007,0.752,AFG2007
3,Afghanistan,AFG,2008,0.755,AFG2008
4,Afghanistan,AFG,2009,0.755,AFG2009


### Préparation des données

#### 1. Table des médailles par sport/sexe

Ajout d'une colonne "Type" pour différencier les sports collectifs des sports individuels

In [142]:
# Sports treated as team sports in this study.
# NOTE(review): this list is maintained by hand; any team sport missing
# from it (Softball, for instance, if present in the data) ends up
# labelled 'Individual' - verify against the distinct 'Sport' values.
team_sports_list = [
    'Basketball', 'Football', 'Tug-Of-War', 'Handball', 'Water Polo',
    'Hockey', 'Volleyball', 'Synchronized Swimming', 'Baseball',
    'Rhythmic Gymnastics', 'Rugby Sevens', 'Beach Volleyball', 'Rugby',
    'Lacrosse', 'Polo', 'Cricket', 'Ice Hockey',
]

# Label each row 'Team' when its sport is in the list above, and
# 'Individual' otherwise
is_team_sport = olympic_games['Sport'].isin(team_sports_list)
olympic_games['Type'] = is_team_sport.map({True: 'Team', False: 'Individual'})

olympic_games


Unnamed: 0,Sex,Team,NOC,Year,Sport,Medal,ID,Type
0,M,China,CHN,1992,Basketball,,CHN1992,Team
1,M,China,CHN,2012,Judo,,CHN2012,Individual
2,M,Denmark,DEN,1920,Football,,DEN1920,Team
3,M,Denmark/Sweden,DEN,1900,Tug-Of-War,Gold,DEN1900,Team
26,F,Netherlands,NED,1932,Athletics,,NED1932,Individual
...,...,...,...,...,...,...,...,...
271106,M,Argentina,ARG,2004,Hockey,,ARG2004,Team
271107,M,United States,USA,1972,Football,,USA1972,Team
271108,M,Russia,RUS,2000,Rowing,,RUS2000,Individual
271109,M,Russia,RUS,2004,Rowing,,RUS2004,Individual


In [147]:
# Count the medals per (Year, NOC, Sex, Sport), with one column per
# medal type. Rows without a medal have a missing 'Medal' value and are
# left out by the groupby, so only medal-winning rows are counted.
# NOTE(review): every row of olympic_games counts as one medal, so a
# medal won in a team sport appears to be counted once per team member -
# confirm this is the intended measure.
medal_columns = ['Bronze', 'Gold', 'Silver']

medals_per_group = (
    olympic_games
    .groupby(['Year', 'NOC', 'Sex', 'Sport', 'Medal'])
    .size()
    .unstack(fill_value=0)
    # Select the medal columns by label instead of overwriting the column
    # names positionally: a medal type absent from the data becomes a
    # column of zeros rather than causing mislabelled counts or a
    # length-mismatch error.
    .reindex(columns=medal_columns, fill_value=0)
    # Drop the leftover 'Medal' name on the column axis
    .rename_axis(None, axis='columns')
    # Turn the grouping keys back into regular columns
    .reset_index()
)

# Display the resulting table
print(medals_per_group)


      Year  NOC Sex         Sport  Bronze  Gold  Silver
0     1896  AUS   M     Athletics       0     2       0
1     1896  AUS   M        Tennis       1     0       0
2     1896  AUT   M       Cycling       2     1       0
3     1896  AUT   M      Swimming       0     1       1
4     1896  DEN   M       Fencing       1     0       0
...    ...  ...  ..           ...     ...   ...     ...
6947  2016  BRN   F     Athletics       0     1       1
6948  2016  GRN   M     Athletics       0     0       1
6949  2016  FIJ   M  Rugby Sevens       0    13       0
6950  2016  JOR   M     Taekwondo       0     1       0
6951  2016  KOS   F          Judo       0     1       0

[6952 rows x 7 columns]


#### 2. Ajout d'une colonne % d'athlètes femmes dans la table GII

In [138]:
# Total number of rows per ID (NOC + year)
grouped_data = olympic_games.groupby('ID').size().reset_index(name='Total')

# Number of rows belonging to women per ID
women_count = olympic_games[olympic_games['Sex'] == 'F'].groupby('ID').size().reset_index(name='Female_Count')

# Attach the female count to the totals
grouped_data = grouped_data.merge(women_count, on='ID', how='left')

# An ID with no female row is absent from women_count and therefore gets
# NaN from the left merge; that means zero women, not an unknown value.
# Without this, all-male delegations would carry a missing percentage
# instead of 0 %.
grouped_data['Female_Count'] = grouped_data['Female_Count'].fillna(0).astype(int)

# Share of female rows per ID, in percent
grouped_data['Percentage_Female'] = (grouped_data['Female_Count'] / grouped_data['Total']) * 100

# Add the percentage to the GII table; GII rows whose ID has no Olympic
# data keep a missing value
gii_country = gii_country.merge(grouped_data[['ID', 'Percentage_Female']], on='ID', how='left')

gii_country.head()

Unnamed: 0,Entity,Code,Year,GII,ID,Percentage_Female
0,Afghanistan,AFG,2005,0.748,AFG2005,
1,Afghanistan,AFG,2006,0.749,AFG2006,
2,Afghanistan,AFG,2007,0.752,AFG2007,
3,Afghanistan,AFG,2008,0.755,AFG2008,25.0
4,Afghanistan,AFG,2009,0.755,AFG2009,


#### 3. Ajout de colonnes nombre de médailles dans la table GII

In [139]:
# Total number of medals per ID (count() ignores the missing 'Medal' values)
total_medals = olympic_games.groupby('ID')['Medal'].count().reset_index(name='Total_Medals')

# Number of medals won by women per ID
women_medals = olympic_games[olympic_games['Sex'] == 'F'].groupby('ID')['Medal'].count().reset_index(name='Female_Medals')

# Add both medal counts to the GII table
gii_country = gii_country.merge(total_medals, on='ID', how='left')
gii_country = gii_country.merge(women_medals, on='ID', how='left')

# A row with a Total_Medals value took part in the Games. For such rows,
# a missing female figure only means that no woman was in the data for
# that ID, i.e. the true value is 0. Fill these in before dropping
# incomplete rows; otherwise the delegations without any woman would be
# removed from the study.
participated = gii_country['Total_Medals'].notna()
for female_column in ['Percentage_Female', 'Female_Medals']:
    gii_country.loc[participated, female_column] = (
        gii_country.loc[participated, female_column].fillna(0)
    )

# Remove the rows that still have missing data (for example country/years
# without Olympic participation)
gii_country.dropna(inplace=True)

# Medal counts became floats because of the missing values; store them
# as integers again
gii_country['Total_Medals'] = gii_country['Total_Medals'].astype(int)
gii_country['Female_Medals'] = gii_country['Female_Medals'].astype(int)

# Save the enriched GII table (without the dataframe index)
gii_country.to_csv('gii_country.csv', index=False)

gii_country.head()


Unnamed: 0,Entity,Code,Year,GII,ID,Percentage_Female,Total_Medals,Female_Medals
3,Afghanistan,AFG,2008,0.755,AFG2008,25.0,1,0
7,Afghanistan,AFG,2012,0.738,AFG2012,16.666667,1,0
11,Afghanistan,AFG,2016,0.692,AFG2016,33.333333,0,0
18,Albania,ALB,2000,0.319,ALB2000,60.0,0,0
22,Albania,ALB,2004,0.301,ALB2004,28.571429,0,0
