# Data preparation - Performance aux J.O. par pays
---

In [2]:
import pandas as pd
#attention à renseigner le bon nom de dossier
df_jo = pd.read_csv("athlete_events.csv", sep = ",")
df_NOC = pd.read_csv("noc_regions.csv", sep = ",")
df_NOC.head()

Unnamed: 0,NOC,region,notes
0,AFG,Afghanistan,
1,AHO,Curacao,Netherlands Antilles
2,ALB,Albania,
3,ALG,Algeria,
4,AND,Andorra,


### Performances par pays, toutes éditions confondues

In [3]:
#liaison avec df NOC car les nations du athletes.csv peuvent être fausses
df = pd.merge(df_jo, df_NOC, on='NOC', how='left')
df = df.rename(columns={'region':'Nation'})

# Données des jeux d'été uniquement
df = df.loc[df['Season'] == "Summer"]

# Sélection des colonnes à garder
df= df.loc[:, ["Nation", "NOC", "Year", "Sport", "Event", "Medal"]]

# Suppression des valeurs nulles : athlètes n'ayant pas eu de médaille
df = df.dropna()


#On conserve une seule médaille par event
df = df.drop_duplicates(subset=["Event", "Medal", "Year"])

df_medals = df
df_gold = df.loc[df['Medal'] == "Gold"]
df_medals

Unnamed: 0,Nation,NOC,Year,Sport,Event,Medal
3,Denmark,DEN,1900,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
37,Finland,FIN,1920,Swimming,Swimming Men's 200 metres Breaststroke,Bronze
38,Finland,FIN,1920,Swimming,Swimming Men's 400 metres Breaststroke,Bronze
41,Finland,FIN,1948,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
42,Finland,FIN,1948,Gymnastics,Gymnastics Men's Team All-Around,Gold
...,...,...,...,...,...,...
270986,Russia,RUS,2008,Tennis,Tennis Women's Singles,Bronze
271010,Austria,AUT,1924,Weightlifting,Weightlifting Men's Lightweight,Silver
271076,Russia,URS,1952,Athletics,Athletics Women's Shot Put,Gold
271078,Russia,URS,1956,Athletics,Athletics Women's Shot Put,Silver


In [4]:
#Pour afficher la liste des sports collectifs : 
df= df_jo
df=df.loc[df['Medal'] == "Gold"]
duplicates = df.duplicated(subset=["Event", "Medal", "Year"], keep = False)
df_duplicates = df[duplicates]
print(df_duplicates["Event"].unique())
# print(len(df["Event"].unique()))

["Tug-Of-War Men's Tug-Of-War" "Gymnastics Men's Team All-Around"
 "Gymnastics Men's Pommelled Horse" "Handball Women's Handball"
 "Speed Skating Men's 1,500 metres" 'Sailing Mixed 8 metres'
 "Cycling Men's Road Race, Team"
 "Canoeing Men's Kayak Fours, 1,000 metres" "Handball Men's Handball"
 "Football Men's Football" "Water Polo Men's Water Polo"
 'Sailing Mixed Two Person Heavyweight Dinghy'
 "Rowing Men's Quadruple Sculls" "Rowing Men's Double Sculls"
 "Rowing Men's Coxed Pairs" "Hockey Men's Hockey"
 "Basketball Men's Basketball" "Nordic Combined Men's Team"
 "Ice Hockey Men's Ice Hockey" "Rowing Men's Coxed Eights"
 "Baseball Men's Baseball" "Athletics Men's 4 x 400 metres Relay"
 "Gymnastics Men's Team All-Around, Free System"
 "Gymnastics Men's Team All-Around, Swedish System" "Bobsleigh Men's Two"
 "Curling Men's Curling" "Cycling Men's Team Pursuit, 4,000 metres"
 "Shooting Men's Military Rifle, 200, 400, 500 and 600 metres, Team"
 "Volleyball Women's Volleyball" "Canoeing Me

In [11]:
#vérifications - on peut voir plusieurs valeurs pour "Team"
df = df_gold

df = df.loc[df['Year'] == 1996]
df = df.loc[df['NOC'] == "ESP"]
display(df)
print(len(df))

Unnamed: 0,Nation,NOC,Year,Sport,Event,Medal
182,Spain,ESP,1996,Water Polo,Water Polo Men's Water Polo,Gold
13309,Spain,ESP,1996,Rhythmic Gymnastics,Rhythmic Gymnastics Women's Group,Gold
13498,Spain,ESP,1996,Sailing,Sailing Mixed Multihull,Gold
102982,Spain,ESP,1996,Cycling,Cycling Men's Individual Time Trial,Gold
252129,Spain,ESP,1996,Sailing,Sailing Women's Two Person Dinghy,Gold


5


In [6]:
#export pour data visualisation
df_medals.to_csv("medals_per_country.csv", index=False)

# Data analyse - Performance aux J.O. par pays
---

In [7]:
#Calcul du nombre d'éditions dans notre data set
df = df_medals
editionsNum = len(df["Year"].unique())
editionsNum

29

In [8]:
#Affichage du top all time
df = df_medals
df = df.groupby("Nation")['Medal'].count().sort_values(ascending=False)
display(df)

#Affichage du top de nombre de médailles d'or all time
df = df_gold
df = df.groupby("Nation")['Medal'].count().sort_values(ascending=False)
display(df)

Nation
USA         2502
Russia      1514
Germany     1343
UK           863
France       740
            ... 
Fiji           1
Eritrea        1
Djibouti       1
Sudan          1
Gabon          1
Name: Medal, Length: 132, dtype: int64

Nation
USA                            1028
Russia                          582
Germany                         438
UK                              275
France                          233
                               ... 
Grenada                           1
Individual Olympic Athletes       1
Israel                            1
Ivory Coast                       1
Mozambique                        1
Name: Medal, Length: 96, dtype: int64

In [9]:
#calcul du nombre d'événements par année 
df = df_gold
df = df.groupby("Year")["Medal"].count()
df

Year
1896     43
1900     89
1904     95
1906     74
1908    109
1912    106
1920    154
1924    129
1928    118
1932    125
1936    140
1948    143
1952    149
1956    151
1960    150
1964    163
1968    172
1972    193
1976    198
1980    203
1984    221
1988    237
1992    257
1996    271
2000    299
2004    301
2008    302
2012    302
2016    305
Name: Medal, dtype: int64