In [73]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px

In [74]:
#reading in main DataFrame
df = pd.read_csv("../group_project_OS_canada/Data/athlete_events.csv")
df.head(2)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,


In [75]:
# choosing sport
# Our analysis was done on Athletics and Swimming which were largers sports for Canada
sport_chosen = "Swimming"

Set 1: Number of medals per year in chosen sport

In [76]:
# filtered dataframe on chosen sport
# duplicates dropped to remove rows where team effort generating 1 medal is indicated for each teammember
# filtered dataframe groupd by Year and caluclated are number of medals and average age for each year-group
dff = df[df["Sport"] == sport_chosen].drop_duplicates(subset=["Year", "Event", "Medal"]).groupby("Year").agg({"Medal":"count", "Age":"mean"})
dff.head(2)

Unnamed: 0_level_0,Medal,Age
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1896,10,18.75
1900,21,22.130435


In [77]:
fig = px.bar(dff, x=dff.index, y=["Medal"], title=f"Total number of medals in {sport_chosen.lower()} per year", labels={"value": "Number of medals"})
fig.update_layout(showlegend=False) #legend is not shown
fig.update_traces(hovertemplate = "Year: %{label}: <br>Number of medals: %{value}") #updates what is shown when hovering over fig

Set 2: Average age per year among athletes in chosen sport

In [78]:
# filtered dataframe on chosen sport
# filtered dataframe groupd by Year and caluclated are number of medals and average age for each year-group
dff = df[df["Sport"] == sport_chosen].groupby("Year").agg({"Medal":"count", "Age":"mean"})
dff.head(2)

Unnamed: 0_level_0,Medal,Age
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1896,10,18.714286
1900,31,21.772727


In [79]:
fig = px.bar(dff, x=dff.index, y=["Age"], title=f"Average age in {sport_chosen.lower()} per year", labels={"value": "Average age"})
fig.update_layout(showlegend=False, yaxis_range = [15,30])
fig.update_traces(hovertemplate = "Year: %{label}: <br>Average age: %{value}")

fig.show()

Set 3: Realtive number of athlets in chosen sport to all athletes per year

In [80]:
# filtered dataframe on chosen sport
# dataframe for chosen sport created, duplicates removed if same person listed more than once for same year...
#... grouped by year and values counted where ID column count gives number of participants, ID column renamed to prepare for concatination
# same as above but for df for all sports
# dfs are concatinated, axis 1 means that new columns will be added matching index which is year
# new column is added with result from calculations

dff = df[df["Sport"] == sport_chosen]

dff_chosen_sport = dff.drop_duplicates(subset=["ID", "Year"]).groupby("Year").count().rename(columns={"ID":"Number in sport"})
dff_all_sports = df.drop_duplicates(subset=["ID", "Year"]).groupby("Year").count().rename(columns={"ID":"Number tot"})
dff = pd.concat([dff_chosen_sport, dff_all_sports], axis = 1)
dff["Rel athletes in sport"] = 100 * dff["Number in sport"]/dff["Number tot"]
dff.head(2)

Unnamed: 0_level_0,Number in sport,Name,Sex,Age,Height,Weight,Team,NOC,Games,Season,...,Weight,Team,NOC,Games,Season,City,Sport,Event,Medal,Rel athletes in sport
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1896,13.0,13.0,13.0,4.0,0.0,0.0,13.0,13.0,13.0,13.0,...,14,176,176,176,176,176,176,176,63,7.386364
1900,76.0,76.0,76.0,33.0,4.0,2.0,76.0,76.0,76.0,76.0,...,40,1224,1224,1224,1224,1224,1224,1224,362,6.20915


In [81]:
fig = px.scatter(
    dff,
    x=dff.index,
    y=["Rel athletes in sport"],
    title=f"Number of athletes in {sport_chosen.lower()} relative to all athletes",
    labels={"value": "Percentage"}
)

fig.update_layout(showlegend=False)
fig.update_traces(hovertemplate = "Year: %{label}: <br>Percentage: %{value}")

fig.show()

Set 4 Age distribution for athletes in chosen sport

In [82]:
dff = df[df["Sport"] == sport_chosen]

fig = px.histogram(dff,x="Age",nbins=80, title= f"Age distribution in {sport_chosen.lower()}", labels={"count": "Number of athletes"})
fig.update_layout(yaxis_title= "Number of athletes")
fig.show()