<h1 style="text-align:center">Olympics Dataset: A plotly Demonstration </h1>
<br>
<br>
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/5/5c/Olympic_rings_without_rims.svg/1920px-Olympic_rings_without_rims.svg.png" alt="drawing" width="600" style="margin:auto"/>

### This notebook was made partly to practice with the plotly library and partly to showcase it. I hope it is useful to someone.

In [None]:
!pip install chart_studio > /dev/null
import numpy as np
import pandas as pd
import chart_studio.plotly as py
import plotly.graph_objs as go
import wordcloud
import plotly_express as px
from matplotlib import cm
from pprint import pprint
import plotly.io as pio
pio.templates.default = "plotly_dark"
pd.options.plotting.backend = "plotly"

In [None]:
df = pd.read_csv('../input/olympics-althlete-events-analysis/athlete_events.csv')

In [None]:
df.head()

## Question 1: How many players were there in each sport?

In [None]:
# How many unique players (by name) are there in each sport?
# Select the 'Name' column before counting so only that column is aggregated;
# the result is a Series indexed by Sport and named 'Name'.
sport_players = df.groupby('Sport')['Name'].nunique()
# Plain {sport: count} mapping, reused later as word-cloud frequencies.
wc_count = sport_players.to_dict()
pprint(wc_count)

In [None]:
# The 25 sports with the most unique athletes, largest first.
top_25_sports = sport_players.sort_values(ascending=False).head(25)
# Bars are coloured by log10(1 + count).
color_scale = np.log10(top_25_sports + 1)
fig = top_25_sports.plot(
    kind='bar',
    color=color_scale,
    labels={'value':'Nº Unique Athletes'},
    title="Top 25 Sports by Number of Athletes",
)
# Hide the colour bar; the figure is returned so the cell still displays it.
fig.update_layout(coloraxis_showscale=False)

In [None]:
# Build a 1000x500 word cloud from the {sport: unique-athlete count} mapping in wc_count.
wc = wordcloud.WordCloud(width=1000, height=500,colormap='Dark2', background_color='white', min_font_size=10).generate_from_frequencies(wc_count)

In [None]:
arr = wc.to_array()

def show_image(arr, width=1000, height=500):
    """Render an image array as a plotly figure with no axes, margins or background.

    Parameters
    ----------
    arr : array-like
        Image data accepted by ``px.imshow``.
    width, height : int, optional
        Figure size in pixels. The defaults (1000 x 500) match the size the
        word cloud in this notebook is generated with.
    """
    fig = px.imshow(arr)
    fig.update_layout(
        # Fully transparent backgrounds so only the image itself is visible.
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        autosize=False,
        width=width,
        height=height,
        margin=dict(l=0, r=0, b=0, t=0),
    )
    fig.update_xaxes(showline=False, showticklabels=False)
    fig.update_yaxes(showline=False, showticklabels=False)
    fig.show()

show_image(arr)

## Question 2: Who are the 25 players with the most medals?

In [None]:
# Count medals per athlete name: one row per Name, one column per Medal value.
# NOTE(review): grouping by Name merges distinct athletes who share a name —
# consider a unique athlete identifier if the dataset provides one.
medal_df = df[['Name', 'Medal']]
medal_counts = medal_df.pivot_table(index='Name', columns='Medal', aggfunc='size', fill_value=0)
# Row-wise sum over the medal columns gives each athlete's overall medal count.
pivot_table = medal_counts.assign(Total=medal_counts.sum(axis=1))

In [None]:
# Order athletes from most to fewest total medals.
pivot_table = pivot_table.sort_values(by='Total', ascending=False)
pivot_table

In [None]:
# Bar chart of the first 25 rows of the medal table, with one series per medal type;
# 'Total' is dropped first so only Bronze/Silver/Gold are plotted.
pivot_table.iloc[:25].drop('Total',axis=1).plot(kind='bar',y=['Bronze','Silver','Gold'])

## Question 3: What is the distribution of ages among the competitors?

In [None]:
age_df = df.loc[:,['Name','Age','Sex','Sport']]

In [None]:
# Age histograms: one trace per group, switched through a dropdown menu.
# Only the "all" trace is visible initially; the other two start hidden.
trace_all = go.Histogram(x=age_df.Age, name='all', showlegend=True, marker_color="firebrick")
trace_male = go.Histogram(x=age_df.loc[age_df['Sex']=='M'].Age, name="male", visible=False)
trace_female = go.Histogram(x=age_df.loc[age_df['Sex']=='F'].Age, name="female", visible=False)

fig = go.Figure(data=[trace_all, trace_male, trace_female])

# (button label, bar colour applied when selected); order matches the trace order above.
menu_entries = [("All", "firebrick"), ("Male", "steelblue"), ("Female", "orchid")]

# Each button shows exactly one trace, recolours the bars and updates the title.
buttons = [
    dict(
        label=label,
        method="update",
        args=[
            {'visible': [pos == k for k in range(len(menu_entries))], 'marker.color': colour},
            {'title': label, 'showlegend': True},
        ],
    )
    for pos, (label, colour) in enumerate(menu_entries)
]

# update_layout returns the figure, so the cell renders it as its output.
fig.update_layout(
    title="All",
    template='plotly_dark',
    updatemenus=[go.layout.Updatemenu(active=0, buttons=buttons)],
)



In [None]:
# Bar chart of how many rows with Age > 60 fall in each sport.
# NOTE(review): value_counts counts rows, so an athlete with several entries would be
# counted more than once — confirm whether unique athletes were intended.
age_df.loc[age_df['Age']>60,'Sport'].value_counts().plot(kind='bar', labels={'index':'Sport','value':'Count'}, title="Number of athletes above age 60 for each sport")

## Question 4: Distribution of Heights in Selected Sports

In [None]:
def plot_distribution_filters(df, dist_column, filter_column, filter_values=None, n_bins=20, min_count=1000):
    """Show a histogram of ``dist_column`` with a dropdown to filter by ``filter_column``.

    The first trace ("All") covers every row with a non-missing ``dist_column``;
    one additional, initially hidden trace is added per filter value. The
    dropdown makes exactly one trace visible at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    dist_column : str
        Column whose distribution is plotted.
    filter_column : str
        Column whose values populate the dropdown.
    filter_values : sequence, optional
        Values of ``filter_column`` to offer. When omitted or empty, every
        value with more than ``min_count`` non-missing rows is used, sorted.
        The sequence passed in is never mutated.
    n_bins : int, optional
        Value passed as ``nbinsx`` to every histogram.
    min_count : int, optional
        Row-count threshold (exclusive) used when ``filter_values`` is not given.
    """
    # Rows without a value for the plotted column carry no information.
    data = df[df[dist_column].notna()]

    if filter_values is None or len(filter_values) == 0:
        counts = data[filter_column].value_counts()
        filter_values = counts[counts > min_count].sort_index().index.tolist()
    else:
        # Work on a private copy so the caller's sequence is left untouched.
        filter_values = list(filter_values)

    # Dropdown labels, in trace order: the "All" trace first, then one per filter value.
    labels = ['All'] + [str(value) for value in filter_values]

    # ---Adding Traces---#
    traces = [go.Histogram(x=data[dist_column], showlegend=True, nbinsx=n_bins, name='All')]
    traces.extend(
        go.Histogram(
            x=data.loc[data[filter_column] == value, dist_column],
            visible=False,
            nbinsx=n_bins,
            name=label,
            showlegend=True,
        )
        for value, label in zip(filter_values, labels[1:])
    )
    fig = go.Figure()
    fig.add_traces(traces)

    # Creating Buttons: button k shows trace k and hides all the others.
    n_traces = len(traces)
    buttons = [
        dict(
            label=label,
            method='update',
            args=[{'visible': [i == selected for i in range(n_traces)]}],
        )
        for selected, label in enumerate(labels)
    ]
    fig.update_layout(
        # Title follows the plotted columns, e.g. "Height Distribution for Sports".
        title=f'{dist_column} Distribution for {filter_column}s',
        updatemenus=[
            go.layout.Updatemenu(
                active=0,
                buttons=buttons
            )
        ]
    )
    fig.show()
    
plot_distribution_filters(df,'Height','Sport')
    
    
    

## Question 5: Distribution of Weights in Selected Sports

In [None]:
plot_distribution_filters(df,'Weight','Sport')

## Question 6: Distribution of Ages in Selected Sports

In [None]:
plot_distribution_filters(df,'Age','Sport')

## Question 7: Correlate Athlete Features and Wins in Different Sports

In [None]:
from scipy.stats import pointbiserialr

def athelete_feature_win_correlation(df, feature, sport):
    """Point-biserial correlation between winning a medal and ``feature`` within one sport.

    A row counts as a win when its ``Medal`` value is not missing. Rows with a
    missing ``feature`` value are excluded. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Sport'``, ``'Medal'`` and ``feature``.
    feature : str
        Name of the numeric column to correlate with the win indicator.
    sport : str
        Value of the ``'Sport'`` column to restrict the analysis to.

    Returns
    -------
    The result object of ``scipy.stats.pointbiserialr`` (correlation, p-value).

    Raises
    ------
    ValueError
        If fewer than two usable rows remain for the requested sport.
    """
    # Filter first and keep only the needed columns, so the full frame is never copied.
    sport_rows = df.loc[df['Sport'] == sport, ['Medal', feature]]
    df_corr = (
        sport_rows
        .assign(won_medal=sport_rows['Medal'].notna().astype(int))
        [['won_medal', feature]]
        .dropna()
    )
    if len(df_corr) < 2:
        raise ValueError(
            f"Not enough rows for sport {sport!r} with a non-missing {feature!r} "
            "value to compute a correlation"
        )
    return pointbiserialr(df_corr['won_medal'].values, df_corr[feature].values)
    

In [None]:
athelete_feature_win_correlation(df,'Height','Basketball')

In [None]:
def hist_feature_win(df,feature,sport):
    """Overlay probability histograms of ``feature`` for medal and non-medal rows of a sport.

    A row counts as "won" when its ``Medal`` value is not missing. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Sport'``, ``'Medal'`` and ``feature``.
    feature : str
        Column whose distribution is plotted.
    sport : str
        Value of the ``'Sport'`` column to restrict the plot to.
    """
    # Filter first and split with a boolean mask: no full-frame copy, no helper column.
    sport_rows = df[df['Sport'] == sport]
    has_medal = sport_rows['Medal'].notna()
    groups = (
        ("Games Won", sport_rows.loc[has_medal, feature]),
        ("Games Lost", sport_rows.loc[~has_medal, feature]),
    )

    fig = go.Figure()
    for label, values in groups:
        # histnorm='probability' normalises each group separately, so differently
        # sized groups can be compared on the same axis.
        fig.add_trace(go.Histogram(x=values, histnorm='probability', name=label, nbinsx=30))
    fig.update_layout(
        title=f'Distributions of {feature} for Winning and Losing Players',
        xaxis_title=f'{feature}',
        yaxis_title='Probability'
    )
    fig.show()


In [None]:
hist_feature_win(df,'Weight','Basketball')