<a href="https://colab.research.google.com/github/Tclack88/MountainProject/blob/master/csv_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!rm * ; rm -r sample_data

rm: cannot remove 'sample_data': Is a directory


In [0]:
!ls

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import requests
from multiprocessing.dummy import Pool
import os
from bs4 import BeautifulSoup

In [0]:
### Supporting Definitions ###

# get user_data links
def get_csv_url(id):
  user = requests.head(f'https://www.mountainproject.com/user/{id}').headers['Location'].split('/')[-1]
  return f'https://www.mountainproject.com/user/{id}/{user}/tick-export'
  


# add danger column
danger_rating = {'PG13':1 ,'R':2,'X':3}
def get_danger(grade):
  try:
    danger = danger_rating[grade.split()[1]]
    return danger
  except:
    return 0 

  
# A possible arrangement of difficulties
# 10a 10- 10a/b 10b 10 10b/c 10c 10+ 10c/d 10d 11a  --> 10.0, 10.1, ... , 10.9, 11

grade_chart = pd.read_html("https://www.mountainproject.com/international-climbing-grades")

old_ropes = grade_chart[0].YDSUSA[:-1].to_list()
new_ropes = [0,0,0,0,1,2,3,4,5,6,7,7.5,8,8.5,8.8,9,9.5,9.8]+list(np.round(np.arange(10,16,.1),1))
ropes_convert = dict(zip(old_ropes,new_ropes))

old_boulders = grade_chart[1].HuecoUSA[:-1].to_list()
new_boulders = [0,0,0,0,0]+list(np.round(np.arange(1,17.5,.25),1))

boulders_convert = dict(zip(old_boulders,new_boulders))

def clean_grade(grade):
  grade = str(grade).split()[0]
  if grade[0] == '5':
    grade = ropes_convert[grade]
    return grade
  elif grade[0] == 'V':
    grade = boulders_convert[grade]
    return grade
  
  

# create user ticks
def create_user_ticks(file):
  ticks = pd.read_csv(file)
  cols = ['Date','Rating','Pitches','Style','Lead Style','Route Type','Location']
  ticks = ticks[cols]

  #rename cols
  old_names = ticks.columns.to_list()
  new_names = ['date','grade','pitches','style','lead_style','type','location']
  rename_cols = dict(zip(old_names,new_names))
  ticks = ticks.rename(columns=rename_cols)

  # date to datetime
  ticks.date = pd.to_datetime(ticks.date)

  # add danger column
  ticks['danger'] = ticks.grade.apply(get_danger)

  
  ticks.grade = ticks.grade.apply(clean_grade)
  return ticks

def add_quarters(ticks,i):
  tick_range = ticks.iloc[fracs[i]:fracs[i+1],1]
  return tick_range.max()

In [0]:
# Main Functions

def create_climber_stats(ticks):
  climber_dict = {}
  climber_dict.update(years_total = [ticks.date.max()-ticks.date.min()],
                     climbs_total = [ticks.shape[0]],
                     pitches_total = [ticks.pitches.sum()],
                     route_mean = [ticks[ticks['type'] != 'Boulder'].grade.mean()],
                     route_max = [ticks[ticks['type'] != 'Boulder'].grade.max()],
                     boulder_mean = [ticks[ticks.type == 'Boulder'].grade.mean()],
                     boulder_max = [ticks[ticks.type == 'Boulder'].grade.max()],
                     danger_factor = [ticks.danger.mean()],
                     solos = [ticks[ticks['style'] == 'Solo'].shape[0]],
                     hardest_solo = [ticks[ticks['style'] == 'Solo'].grade.max()],
                     trad_count = [ticks.type.apply(lambda x: 'Trad' in x).sum()],
                     sport_count = [ticks.type.apply(lambda x: 'Sport' in x).sum()],
                     locations = [ticks.location.nunique()],
                     success = [(ticks.lead_style=='Onsight').sum()])  # needs to be list when entries are a scalar
  
  n =  5
  fracs = (np.linspace(0,ticks.shape[0],n)).astype(int)
  for i in range(n-1):
    name = 'quarter_max'+str(i+1)
    d = { name : add_quarters(ticks,i)}
    climber_dict.update(d)
    
  climber_stats = pd.DataFrame(climber_dict)
  return climber_stats



def extract_information(csv_urls):
  for url in csv_urls:    # Download user csv's
    command = "wget "+url
    os.system(command)
  files = [f for f in os.listdir('.') if os.path.isfile(f)]
  climber_chunk_stats = pd.DataFrame()
  
  for file in files:
    try:
      ticks = create_user_ticks(file)
      if ticks.shape[0] == 0:
        continue
      climber_stats = create_climber_stats(ticks)
      climber_chunk_stats = pd.concat([climber_chunk_stats,climber_stats])
    except:
      continue
  return climber_chunk_stats

In [0]:
# Grab names to seed the search (most popular male and female names from 1985)
name_url = "https://www.weddingvendors.com/baby-names/popular/1985/?page=1" # vary page number for more samples, I collected up to 6 for the data in my github

source = requests.get(name_url).text
soup = BeautifulSoup(source,'html.parser')
names = soup.find_all('td', class_='n')
possible_users = []
for name in names:
  possible_user = name.text
  possible_users.append(possible_user)


In [0]:
url_start = "https://www.mountainproject.com/ajax/public/search/results/category?q="
url_end = "&c=Users&o=0&s=Default"

## WARNING: This next cell will take a while
(from my experience, running 200 names takes about an hour. A run time error occurs, but reconnecting and letting it go for another few minutes is fine

In [0]:
p = Pool(12)

climber_stats = pd.DataFrame()
for name in possible_users:
  query = url_start+name+url_end
  resp = requests.get(query)
  users = resp.json()['results']['Users']
  user_ids = [u[12:21] for u in users]
  csv_urls = p.map(get_csv_url,user_ids)
  climber_chunk_stats = extract_information(csv_urls)
  climber_stats = pd.concat([climber_stats,climber_chunk_stats])
  os.system('rm * -f') # clear the latest chunk of downloaded csv's to minimize space usage

In [0]:
climber_stats

Unnamed: 0,years_total,climbs_total,pitches_total,route_mean,route_max,boulder_mean,boulder_max,danger_factor,solos,hardest_solo,trad_count,sport_count,locations,success,quarter_max1,quarter_max2,quarter_max3,quarter_max4
0,0 days,2,3,7.750000,8.5,,,0.000000,0,,0,2,2,0,8.5,,,
0,0 days,3,3,8.500000,9.5,,,0.000000,0,,1,2,1,2,9.5,,,
0,0 days,1,1,9.000000,9.0,,,0.000000,0,,0,1,1,1,9.0,,,
0,761 days,6,9,8.466667,10.0,,,0.166667,0,,0,6,5,0,10.0,,,
0,0 days,4,4,,,0.600000,1.2,0.000000,0,,0,0,1,0,1.2,,,
0,2191 days,16,16,10.800000,12.0,3.975000,7.0,0.000000,0,,0,12,7,1,12.0,,,
0,0 days,2,8,5.500000,6.0,,,0.000000,0,,2,0,1,0,6.0,,,
0,141 days,5,5,7.800000,10.5,,,0.000000,0,,1,4,4,0,10.5,,,
0,273 days,18,19,8.133333,10.4,,,0.000000,0,,0,18,7,0,10.4,,,
0,1206 days,24,53,7.412500,11.0,,,0.041667,0,,23,1,15,0,11.0,8.5,,


In [0]:
climber_stats.to_csv('climber_data.csv') # create file to save

## Stuff Below pertains to individual data analysis, unrelated to the scraping above

In [0]:
url = "https://www.mountainproject.com/user/109791883/trevor-clack/tick-export"
#url = "https://www.mountainproject.com/user/106027958/brad-g/tick-export"
#url = "https://www.mountainproject.com/user/110296901/morgan-f/tick-export"
#url =  "https://www.mountainproject.com/user/106234022/stormeh/tick-export"
#url = "https://www.mountainproject.com/user/108797251/louis-mullerleile/tick-export" #louis
climber = pd.read_csv(url)
climber.head()

Unnamed: 0,Date,Route,Rating,Notes,URL,Pitches,Location,Avg Stars,Your Stars,Style,Lead Style,Route Type,Your Rating,Length,Rating Code
0,2019-08-31,A Midsummer's Night Seam,5.7,with John p.,https://www.mountainproject.com/route/10597714...,1,California > San Bernardino Mountains > Big Be...,1.8,-1,Lead,Onsight,Sport,,35.0,1800
1,2019-08-31,Firewater,5.5,with John p.,https://www.mountainproject.com/route/10598736...,1,California > San Bernardino Mountains > Big Be...,1.6,-1,Lead,Onsight,Sport,,35.0,1500
2,2019-08-31,Tombstone Shadow,5.10b,with John p.,https://www.mountainproject.com/route/10589319...,1,California > San Bernardino Mountains > Big Be...,3.0,-1,Lead,Onsight,Sport,,40.0,2900
3,2019-08-31,Dead Man Chalking,5.10b,with John p.,https://www.mountainproject.com/route/10593088...,1,California > San Bernardino Mountains > Big Be...,2.4,-1,Lead,Onsight,Sport,,40.0,2900
4,2019-08-31,Gold Bug,5.8,with John p.,https://www.mountainproject.com/route/10587894...,1,California > San Bernardino Mountains > Big Be...,2.0,-1,Lead,Onsight,Sport,,45.0,2100


In [0]:
ticks = create_user_ticks(url)
#ticks = ticks[ticks.lead_style == 'Onsight']
ticks.head()

Unnamed: 0,date,grade,pitches,style,lead_style,type,location,danger
0,2019-08-31,7.0,1,Lead,Onsight,Sport,California > San Bernardino Mountains > Big Be...,0
1,2019-08-31,5.0,1,Lead,Onsight,Sport,California > San Bernardino Mountains > Big Be...,0
2,2019-08-31,10.3,1,Lead,Onsight,Sport,California > San Bernardino Mountains > Big Be...,0
3,2019-08-31,10.3,1,Lead,Onsight,Sport,California > San Bernardino Mountains > Big Be...,0
4,2019-08-31,8.5,1,Lead,Onsight,Sport,California > San Bernardino Mountains > Big Be...,0


In [0]:
# Morgan's Routes
#command1 = "wget https://www.mountainproject.com/user/110296901/morgan-f/tick-export"
# My Routes
#command2 = "wget https://www.mountainproject.com/user/109791883/trevor-clack/tick-export"
# Brad G.
command3 = "wget https://www.mountainproject.com/user/106027958/brad-g/tick-export"
#Ryan Murphy
#command4 = "wget https://www.mountainproject.com/user/106234022/stormeh/tick-export"
# Louis - random strong climber
#https://www.mountainproject.com/user/108797251/louis-mullerleile
#os.system(command1)
#os.system(command2)
os.system(command3)
#os.system(command4)

In [0]:
monthly_tick_average = ticks[ticks.type != 'Boulder'][['date','grade']].set_index('date').resample('M',how='mean').dropna()
#monthly_tick_average['max'] = ticks[ticks.lead_style == 'Onsight'][['date','grade']].set_index('date').resample('M',how='max').dropna().grade
#monthly_tick_average = monthly_tick_average.reset_index()

In [0]:
#Individual plotting of Average Grades
# plt.style.use('fivethirtyeight')
# monthly_tick_average.plot(legend=False,color='g')
# plt.title("Ryan Average Monthly grade (routes)",);

In [0]:
# lead = ticks[(ticks['style'] == 'Lead') & (ticks.lead_style =='Onsight')]
# high_grades = lead[lead.grade >= 10]
# high_grades.grade.value_counts().sort_index()