In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px
import hashlib as hl

In [2]:
df = pd.read_csv("../group_project_OS_canada/Data/athlete_events.csv")
df.head(2)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,


In [10]:
# fitering out rows for Canada
filt = (df["NOC"] == "CAN")
df_canada = df.loc[filt]
df_canada.head(2)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
269,140,"William ""Bill"" Abbott Jr.",M,42.0,172.0,80.0,Canada,CAN,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Mixed Three Person Keelboat,
270,140,"William ""Bill"" Abbott Jr.",M,46.0,172.0,80.0,Canada,CAN,2000 Summer,2000,Summer,Sydney,Sailing,Sailing Mixed Three Person Keelboat,


In [11]:
# Anonymize the column with the names of the athletes for Canada with hash-256
# apply() applies function on each value in the series.

hashed_names = df_canada["Name"].apply(lambda x: hl.sha256(x.encode()).hexdigest())
df_canada["Hashed_names"] = hashed_names
df_canada = df_canada.drop(columns=["Name"])
df_canada.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_canada["Hashed_names"] = hashed_names


Unnamed: 0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,Hashed_names
269,140,M,42.0,172.0,80.0,Canada,CAN,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Mixed Three Person Keelboat,,aba7d8d0992f2b3b6e46b1831df764994ff8ffbf6c898f...
270,140,M,46.0,172.0,80.0,Canada,CAN,2000 Summer,2000,Summer,Sydney,Sailing,Sailing Mixed Three Person Keelboat,,aba7d8d0992f2b3b6e46b1831df764994ff8ffbf6c898f...
279,146,M,19.0,179.0,71.0,Canada,CAN,1976 Summer,1976,Summer,Montreal,Canoeing,"Canoeing Men's Canadian Doubles, 1,000 metres",,70df8f4a81f7f3a99f3082f342cbc5abe1a94639746d6e...
280,147,F,41.0,160.0,57.0,Canada,CAN,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Mixed Three Person Keelboat,,f88877baecbe8c235efe8ad5692bab36d2dc9e36091da0...
281,148,F,21.0,164.0,63.0,Canada,CAN,2008 Summer,2008,Summer,Beijing,Sailing,Sailing Women's Three Person Keelboat,,6dbe76474f9fa17f0569120a314c2fb177ec51f05996eb...


Set 1: Canadas 10 best sports

In [12]:
# removing multiple medals for team sports
df_canada_medals = df_canada.drop_duplicates(subset=["Year", "Event", "Medal"])
df_canada_medals.head(2)

Unnamed: 0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,Hashed_names
269,140,M,42.0,172.0,80.0,Canada,CAN,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Mixed Three Person Keelboat,,aba7d8d0992f2b3b6e46b1831df764994ff8ffbf6c898f...
270,140,M,46.0,172.0,80.0,Canada,CAN,2000 Summer,2000,Summer,Sydney,Sailing,Sailing Mixed Three Person Keelboat,,aba7d8d0992f2b3b6e46b1831df764994ff8ffbf6c898f...


In [13]:
df_canada_sports_medals = df_canada_medals.groupby("Sport").count().sort_values(by="Medal", ascending=False).head(10)

In [14]:
fig = px.bar(
    df_canada_sports_medals,
    x=df_canada_sports_medals.index,
    y=["Medal"],
    title="Canada 10 top sports",
    labels={"value": "Total number of medals"}
)

fig.update_layout(showlegend=False)

fig.show()

Set 2: Number of canadian medals per OS

In [16]:
df_canada_year_medals = df_canada_medals.groupby("Year").count().sort_values(by="Medal", ascending=False)
df_canada_year_medals.head(2)

Unnamed: 0_level_0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Season,City,Sport,Event,Medal,Hashed_names
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1984,241,241,240,235,234,241,241,241,241,241,241,241,48,241
2010,101,101,101,101,101,101,101,101,101,101,101,101,26,101


In [17]:
fig = px.bar(
    df_canada_year_medals,
    x=df_canada_year_medals.index,
    y=["Medal"],
    title="Canadian medals per olympics",
    labels={"value": "Total number of medals"}
)

fig.update_layout(showlegend=False)

fig.show()

Set 3: Age distribution

In [23]:
fig = px.histogram(
    df_canada,
    x="Age",
    nbins=80,
    title="Age distribution canadian athletes",
    labels={"count": "Number of athletes"}
)
fig.update_layout(yaxis_title="Number of athlets")
fig.show()

Set 4: Gender distribution

In [19]:
# remove multiple entries for same athelet
df_canada_sex = df_canada.drop_duplicates(subset="ID")
df_canada_sex.head(2)

Unnamed: 0,ID,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,Hashed_names
269,140,M,42.0,172.0,80.0,Canada,CAN,1996 Summer,1996,Summer,Atlanta,Sailing,Sailing Mixed Three Person Keelboat,,aba7d8d0992f2b3b6e46b1831df764994ff8ffbf6c898f...
279,146,M,19.0,179.0,71.0,Canada,CAN,1976 Summer,1976,Summer,Montreal,Canoeing,"Canoeing Men's Canadian Doubles, 1,000 metres",,70df8f4a81f7f3a99f3082f342cbc5abe1a94639746d6e...


In [20]:
df_canada_sex_distr = df_canada_sex.groupby("Sex").count()

In [21]:
fig= px.pie(values=df_canada_sex_distr["ID"], names=df_canada_sex_distr.index, title="Gender distribution of canadian athleats")

fig.update_traces(hovertemplate = "Sex: %{label}: <br>Number of participants: %{value}")

fig.show()