# TidyTuesday: Himalaya Data

Let's do some data viz looking at mountain climbing data.

We start by loading the libraries:

In [1]:
# Data handling

import numpy as np
import pandas as pd 

# Plotting

import plotly.express as px 
import seaborn as sns
import matplotlib.pyplot as plt

# Embedding of the Plotly figures in the notebook
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

The data is available online in CSV format:

In [2]:
#Loading the data

# The three TidyTuesday files are parsed with the same options:
# comma-separated, first column used as the index, and whitespace
# directly after a delimiter skipped.
csv_options = dict(sep=',', index_col=0, skipinitialspace=True)

df_peaks = pd.read_csv(
    r'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/peaks.csv',
    **csv_options)

df_members = pd.read_csv(
    r'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/members.csv',
    **csv_options)

df_expeditions = pd.read_csv(
    r'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-22/expeditions.csv',
    **csv_options)

And now let's prep the data for our viz.

In [3]:
#Putting Peakname and Height of Mountain together
# Build the " (<height>m)" suffix first, then append it to the peak name so
# each peak gets a display label such as "Ama Dablam (6814m)".
height_label = ' (' + df_peaks['height_metres'].astype(str) + 'm)'
df_peaks['peak_name_with_height'] = df_peaks['peak_name'] + height_label
df_peaks

Unnamed: 0_level_0,peak_name,peak_alternative_name,height_metres,climbing_status,first_ascent_year,first_ascent_country,first_ascent_expedition_id,peak_name_with_height
peak_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AMAD,Ama Dablam,Amai Dablang,6814,Climbed,1961.0,"New Zealand, USA, UK",AMAD61101,Ama Dablam (6814m)
AMPG,Amphu Gyabjen,,5630,Climbed,1953.0,UK,AMPG53101,Amphu Gyabjen (5630m)
ANN1,Annapurna I,,8091,Climbed,1950.0,France,ANN150101,Annapurna I (8091m)
ANN2,Annapurna II,,7937,Climbed,1960.0,"UK, Nepal",ANN260101,Annapurna II (7937m)
ANN3,Annapurna III,,7555,Climbed,1961.0,India,ANN361101,Annapurna III (7555m)
...,...,...,...,...,...,...,...,...
SANK,Sano Kailash,,6452,Climbed,2019.0,"Austria, Nepal",SANK19101,Sano Kailash (6452m)
TARS,Tarke Kang Shar,Tare Kang,7069,Climbed,1981.0,"Italy, Nepal",TARS81301,Tarke Kang Shar (7069m)
KORL,Korlang Pari Tippa,Korlang Pari Tippa North,5738,Unclimbed,,,,Korlang Pari Tippa (5738m)
JANE,Jannu East,Khumbhakarna East,7460,Unclimbed,,,,Jannu East (7460m)


In [21]:
#Merge peaks and climbers (the expeditions table is not joined here)

# Attach the peak attributes to every member record with the same peak name.
df_soloclimbers = df_peaks.merge(df_members, on='peak_name')

# Keep only solo member records on peaks with a positive first-ascent year;
# rows whose first_ascent_year is missing fail the comparison and drop out.
is_solo = df_soloclimbers['solo'] == True
has_first_ascent = df_soloclimbers['first_ascent_year'] > 0

df_soloclimbers_final = df_soloclimbers.loc[is_solo & has_first_ascent]





In [22]:
#Group data for graphics

# One group per peak label; as_index=False keeps the label as a regular column.
solo_by_peak = df_soloclimbers_final.groupby(['peak_name_with_height'], as_index=False)

# Per peak: count of solo member records, median of first_ascent_year,
# and mean of the success flag.
peak_summary = solo_by_peak.agg(
    number_of_climbs=('peak_name', 'count'),
    median_year=('first_ascent_year', 'median'),
    mean_success_rate=('success', 'mean'),
)

# Order the rows by year, ascending, for plotting.
df_plot_soloclimbers_final = peak_summary.sort_values(by='median_year', ascending=True)

df_plot_soloclimbers_final

Unnamed: 0,peak_name_with_height,number_of_climbs,median_year,mean_success_rate
1,Annapurna I (8091m),4,1950.0,0.25
14,Everest (8850m),8,1953.0,0.375
26,Makalu II (7678m),2,1954.0,1.0
5,Baruntse (7152m),2,1954.0,1.0
10,Cho Oyu (8188m),17,1954.0,0.529412
7,Chago (6893m),1,1954.0,1.0
29,Pethangtse (6739m),1,1954.0,1.0
28,Parchamo (6279m),1,1955.0,1.0
25,Makalu (8485m),8,1955.0,0.375
20,Kang Guru (6981m),1,1955.0,1.0


And produce a plot:

In [24]:
#Producing a graph

# Bubble chart with one marker per peak: x is the (median) first-ascent year,
# marker size is the number of solo climbs, marker colour is the mean success
# rate of those climbs.
fig = px.scatter(df_plot_soloclimbers_final,
                     x='median_year',
                     y='peak_name_with_height',
                     title="<b>Ain't no mountain high enough!</b> <br>Which Himalayan peaks have been peaking the interest of solo climbers?",
                     color='mean_success_rate',
                     color_continuous_scale='RdYlGn',
                     # Human-readable names for axes, legend and hover text.
                     labels={'median_year': 'Year of First Ascent',
                             'peak_name_with_height': "Peak Name",
                             "mean_success_rate": "Mean Success Rate",
                             'number_of_climbs': 'Number of Solo Climbs'},
                     size='number_of_climbs',
                     size_max=20,
                     hover_name='peak_name_with_height',
                     template="plotly_dark",
                     width=800,
                     height=1000
                     )

# Colour bar: fixed pixel size, ticks placed explicitly at 0 / 0.5 / 1 and
# labelled as percentages. No dtick is set because explicit tickvals already
# determine the tick positions.
fig.update_layout(coloraxis_colorbar=dict(
    thicknessmode="pixels", thickness=40,
    lenmode="pixels", len=200,
    yanchor="middle",
    ticks='outside',
    ticksuffix=" percent",
    tickvals=[0, 0.5, 1],
    ticktext=['0', '50', '100']
))

# Source / author credit placed below the plot area in paper coordinates.
# The href values must be wrapped in straight quotes; typographic quotes are
# not valid attribute delimiters and leave the links unusable.
fig.add_annotation(text='Data Source: <a href="https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-09-22/readme.md">The Himalaya Database</a> <br>Viz: <a href="https://twitter.com/_prospecttheory">@_prospecttheory</a> <br>  ',
                    align='left',
                    showarrow=False,
                    xref='paper',
                    yref='paper',
                    x=-0.4,
                    y=-0.1,
                    bordercolor='black',
                    borderwidth=1)


fig.show()