# Data preparation - Identifier les pays dominants
---

In [1]:
import pandas as pd
df_jo = pd.read_csv("athlete_events.csv", sep = ",")
df_NOC = pd.read_csv("noc_regions.csv", sep = ",")
df_jo.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


### Performances par pays, toutes éditions confondues

In [2]:
#liaison avec df NOC car les nations du athletes.csv peuvent être fausses
df = pd.merge(df_jo, df_NOC, on='NOC', how='left')
df = df.rename(columns={'region':'Nation'})

# Données des jeux d'été uniquement
df = df.loc[df['Season'] == "Summer"]

# Sélection des colonnes à garder
df= df.loc[:, ["Nation", "NOC", "Year", "Sport", "Event", "Medal"]]

# Suppression des valeurs nulles : athlètes n'ayant pas eu de médaille
df = df.dropna()


#On conserve une seule médaille par event
df = df.drop_duplicates(subset=["Event", "Medal", "Year"])

df_medals = df
df_gold = df.loc[df['Medal'] == "Gold"]
df_medals

Unnamed: 0,Nation,NOC,Year,Sport,Event,Medal
3,Denmark,DEN,1900,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
37,Finland,FIN,1920,Swimming,Swimming Men's 200 metres Breaststroke,Bronze
38,Finland,FIN,1920,Swimming,Swimming Men's 400 metres Breaststroke,Bronze
41,Finland,FIN,1948,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
42,Finland,FIN,1948,Gymnastics,Gymnastics Men's Team All-Around,Gold
...,...,...,...,...,...,...
270986,Russia,RUS,2008,Tennis,Tennis Women's Singles,Bronze
271010,Austria,AUT,1924,Weightlifting,Weightlifting Men's Lightweight,Silver
271076,Russia,URS,1952,Athletics,Athletics Women's Shot Put,Gold
271078,Russia,URS,1956,Athletics,Athletics Women's Shot Put,Silver


In [3]:
#calcul d'une table donnant le nombre de médailles obtenues par pays par édition
df = df_medals
df = df.groupby(["Year","Nation"])["Medal"].count()
df1 = df.reset_index().rename(columns={'Medal':'Total medals'})

#calcul du nombre d'événements par année 
df = df_gold
df = df.groupby("Year")["Medal"].count()
df2 = df.reset_index().rename(columns={'Medal':'Event number'})

#fusion des df et ajout de la colonne indicateur
df = pd.merge(df1, df2, on='Year', how='left')
df['Perf indicator'] = round(df['Total medals']/(3*df['Event number']),2)
df_perf = df
df_perf

Unnamed: 0,Year,Nation,Total medals,Event number,Perf indicator
0,1896,Australia,3,43,0.02
1,1896,Austria,5,43,0.04
2,1896,Denmark,6,43,0.05
3,1896,France,11,43,0.09
4,1896,Germany,13,43,0.10
...,...,...,...,...,...
1227,2016,USA,119,305,0.13
1228,2016,Ukraine,11,305,0.01
1229,2016,Uzbekistan,8,305,0.01
1230,2016,Venezuela,3,305,0.00


In [4]:
#calcul d'une table donnant le nombre de médailles d'or obtenues par pays par édition
df = df_gold
df = df.groupby(["Year","Nation"])["Medal"].count()
df1 = df.reset_index().rename(columns={'Medal':'Total gold medals'})

#calcul du nombre d'événements par année 
df = df_gold
df = df.groupby("Year")["Medal"].count()
df2 = df.reset_index().rename(columns={'Medal':'Event number'})

#fusion des df et ajout de la colonne indicateur
df = pd.merge(df1, df2, on='Year', how='left')
df['Gold perf indicator'] = round(df['Total gold medals']/df['Event number'],2)
df.pop("Event number")
df_gold_perf = df 

#fusion des 2 df de performance
df = pd.merge(df_perf, df_gold_perf, on=['Year',"Nation"], how='left')
#passage en int tout en ignorant les valeurs NaN
df["Total gold medals"]= pd.to_numeric(df["Total gold medals"], errors='coerce').astype(pd.Int64Dtype())

#Ajout de périodes temporelles - 1981 : année où la contrainte de l'amateurisme disparait pour l'ensemble des disciplines (sauf le football)
bins= [1896,1981,2022]
labels = ["Before 1981", "After 1981"]
df['Period'] = pd.cut(df["Year"], bins=bins, labels=labels, right=False)

df_final = df
df_final

Unnamed: 0,Year,Nation,Total medals,Event number,Perf indicator,Total gold medals,Gold perf indicator,Period
0,1896,Australia,3,43,0.02,2,0.05,Before 1981
1,1896,Austria,5,43,0.04,2,0.05,Before 1981
2,1896,Denmark,6,43,0.05,1,0.02,Before 1981
3,1896,France,11,43,0.09,5,0.12,Before 1981
4,1896,Germany,13,43,0.10,6,0.14,Before 1981
...,...,...,...,...,...,...,...,...
1227,2016,USA,119,305,0.13,46,0.15,After 1981
1228,2016,Ukraine,11,305,0.01,2,0.01,After 1981
1229,2016,Uzbekistan,8,305,0.01,4,0.01,After 1981
1230,2016,Venezuela,3,305,0.00,,,After 1981


In [5]:
#export pour data visualisation
df_final.to_csv("dominant_countries.csv", index=False)

# Data analyse - Identifier les pays dominants
---

In [6]:
#calcul des nations ayant le plus remporté de médailles possibles par année. Avant et après 1981, car c'est l'année à partir de laquelle la contrainte d'amateurisme a été retirée.
df = df_final

df = round(df.groupby(["Nation","Period"])["Perf indicator"].mean(),2)
df = df.sort_values(ascending=False).reset_index()

display(df.loc[df["Period"]=="Before 1981"])
display(df.loc[df["Period"]=="After 1981"])


Unnamed: 0,Nation,Period,Perf indicator
0,USA,Before 1981,0.21
1,Russia,Before 1981,0.16
3,Germany,Before 1981,0.10
5,UK,Before 1981,0.08
8,France,Before 1981,0.07
...,...,...,...
259,United Arab Emirates,Before 1981,
260,Uzbekistan,Before 1981,
261,Vietnam,Before 1981,
262,"Virgin Islands, US",Before 1981,


Unnamed: 0,Nation,Period,Perf indicator
2,USA,After 1981,0.13
4,Russia,After 1981,0.10
6,Germany,After 1981,0.08
7,China,After 1981,0.07
11,Australia,After 1981,0.04
...,...,...,...
226,Iraq,After 1981,
234,Lebanon,After 1981,
236,Luxembourg,After 1981,
240,Monaco,After 1981,


In [12]:
#mesures de distribution et dispersion
df = df_final

print("Moyenne des parts de médaille : ", round(df['Perf indicator'].mean(),2))
print("Médiane des parts de médaille : ", round(df['Perf indicator'].median(),2))
print("Ecart type des parts de médaille : ", round(df['Perf indicator'].std(),2))
print("MVariance des parts de médaille : ", round(df['Perf indicator'].var(),3))
print("\n")
print("Moyenne des parts de médaille d'or : ", round(df['Gold perf indicator'].mean(),2))
print("Médiane des parts de médaille d'or : ", round(df['Gold perf indicator'].median(),2))
print("Ecart type des parts de médaille d'or : ", round(df['Gold perf indicator'].std(),2))
print("Variance des parts de médaille d'or: ", round(df['Gold perf indicator'].var(),3))

Moyenne des parts de médaille :  0.02
Médiane des parts de médaille :  0.01
Ecart type des parts de médaille :  0.05
MVariance des parts de médaille :  0.002


Moyenne des parts de médaille d'or :  0.03
Médiane des parts de médaille d'or :  0.01
Ecart type des parts de médaille d'or :  0.06
Variance des parts de médaille d'or:  0.004
