In [2]:
# This notebook produces a dataset from the full August 2020 Open Beta database. In summary:
# (1) types are simplified, aid, ice, mixed (as in mixed ice and rock), and snow routes are removed
# (2) - grades are reformatted to yeild two new columns: YDS and Vermin, 3rd and 4th class routes are removed
#     - grades with +/- are created (i.e. 5.12- is converted to 5.12a/b and V9- is converted to V8/9)
# (3) unambiguous numerical grade ranks are added (see grade_rank_calculation.py)
# (4) metadata is extracted to new columns
# (5) rows with missing route names, IDs, or sector IDs are removed

import pandas as pd
import numpy as np
import re
from grade_rank_calculation import calculate_grade_rank

In [3]:
# all route data (does not have stars or vote numbers)
df = pd.read_json('../openbeta-usa-routes-aug-2020.zip', lines=True)

In [4]:
# (1) Removing unwanted types and reformatting/simplifying the types given

# tr and alpine are not useful types for my purposes
df['type_string'] = [' '.join(sorted([sty for sty in ty if sty not in ('tr', 'alpine')])) for ty in df.type] 

# not concerned with aid, ice, mixed, or snow routes
unused_types = ['aid', 'ice', 'mixed', 'snow']
df = df[~df.type_string.str.contains('|'.join(unused_types))]

# retyping multi-type climbs
df.loc[df.type_string == 'boulder trad', 'type_string'] = 'boulder' # this one is tricky, but most of these are dangerous boulders that could take gear
df.loc[df.type_string == 'boulder sport', 'type_string'] = 'sport' # if its gots bolts its sports
df.loc[df.type_string == 'sport trad', 'type_string'] = 'mixed' # these are usually called mixed in the rock game (not to be confused with ice/rock routes climbed with tools and pons)
df = df[df.type_string != 'boulder sport trad'] # "boulder sport trad" is just a stupid type (who put this on MP?), so I'm just removing it
df = df[df.type_string != ''] # empty types are removed

print('simplified types are:', set(df.type_string))
print(len(df.index), 'routes after type filter removing aid, ice, mixed (as in ice + rock), snow, and untyped routes')

simplified types are: {'mixed', 'boulder', 'trad', 'sport'}
171237 routes after type filter removing aid, ice, mixed (as in ice + rock), snow, and untyped routes


In [5]:
# (2) Grade formatting and removing unwanted grades (3rd and 4th), simplifying to YDS and Vermin only

# For some reason V grades are listed under YDS, which is incorrect, this is fixed
df['YDS_or_Vermin'] = [d.get('YDS') for d in df.grade]
df = df[df.YDS_or_Vermin.notnull()]
unused_grades = ['easy', '3rd', '4th']
df = df[~df.YDS_or_Vermin.str.contains('|'.join(unused_grades))]
required_grades = ['5.', 'V']
df = df[df.YDS_or_Vermin.str.contains('|'.join(required_grades))]

df.loc[df.YDS_or_Vermin == 'Easy 5th', 'YDS_or_Vermin'] = '5.0' # calling easy 5th 5.0
df.loc[df.YDS_or_Vermin == 'V-easy', 'YDS_or_Vermin'] = 'V0-' # will be converted to VB in the next cell, VB is the correct grade for this scenario

df['Vermin'] = np.where(df.YDS_or_Vermin.str.contains('V'), df.YDS_or_Vermin, None)
df['YDS'] = np.where(df.YDS_or_Vermin.str.contains('5.', regex=False), df.YDS_or_Vermin, None)
df = df[df.Vermin != 'V?'] # why would you ever use this grade? It is the same as not grading

# +/- grades are converted to grade ranges, this conversion is added as a new column, this simplifies grade rankings
# for YDS <= 5.9:
# 5.X- -> 5.X-1/X
# 5.X+ -> 5.X/X+1
# for YDS > 5.9:
# 5.X- -> 5.Xa/b
# 5.X+ -> 5.Xc/d
# for Vermin:
# VX+ -> VX/X+1
# VX- -> VX-1/X

def isfloat(value):
    try:
        float(value)
        return True
    except ValueError:
        return False
    
def convert_pm_grading(grade):
    
    if grade == None:
        return grade
    
    # YDS condition
    if '5.' in grade:
        num = int(re.sub('[^0-9]', '', grade.split('.')[-1]))
                
        # routes less than 5.10
        if num < 10:
            if '+' in grade:
                ran = str(num) + '/' + str(num + 1)
                if num == 9:
                    ran = '9+' # keep the (in)famous 5.9+
            elif '-' in grade:
                ran = str(num - 1) + '/' + str(num)
            else:
                return grade
            
        # routes greater than 5.9
        if num > 9:
            if '+' in grade:
                ran = str(num) + 'c/d'
            elif '-' in grade:
                ran = str(num) + 'a/b'
            else:
                if isfloat(grade.split('.')[-1]): # e.g. 5.11 is converted to 5.11b/c
                    return grade + 'b/c'
                else:
                    return grade
        
        new_grade = '5.' + ran
            
    # Vermin condition
    if 'V' in grade:
        
        range_comp = grade.split('-')
        
        if len(range_comp) > 1 and range_comp[-1] != '': # check if a range grade is already given
            range_grade = True
            return re.sub('[-]', '/', grade)
        else:
            range_grade = False
        
        if not range_grade:
            num = int(re.sub('[^0-9]', '', grade))
            
            if '+' in grade:
                ran = str(num) + '/' + str(num + 1)
            elif '-' in grade:
                ran = str(num - 1) + '/' + str(num)
            else:
                return grade
        
            new_grade = 'V' + ran
            
            if new_grade == 'V-1/0': # V0- is converted to VB (who is grading boulders V0-?)
                return 'VB'
    
    return new_grade

df['nopm_YDS'] = df.apply(lambda row: convert_pm_grading(row['YDS']), axis = 1)
df['nopm_Vermin'] = df.apply(lambda row: convert_pm_grading(row['Vermin']), axis = 1)

print(len(df.index), 'routes after grade filters removing 3rd and 4th class and unrated climbs')

168933 routes after grade filters removing 3rd and 4th class and unrated climbs


In [6]:
# (3) grade integer assignment, from Viet's implementation for YDS grades, this function works for all YDS and Vermin grade formats seen in the dataset

df['YDS_rank'] = df.apply(lambda row: calculate_grade_rank(row['YDS']), axis = 1)
df['Vermin_rank'] = df.apply(lambda row: calculate_grade_rank(row['Vermin']), axis = 1)                           

In [7]:
# (4) Extracting metadata to columns

df['route_ID'] = [d.get('mp_route_id') for d in df.metadata]
df['sector_ID'] = [d.get('mp_sector_id') for d in df.metadata]
df['parent_loc'] = [d.get('parent_lnglat') for d in df.metadata]
df['parent_sector'] = [d.get('parent_sector') for d in df.metadata]

In [9]:
# (5) removing more missing data

df = df[df['route_name'] != '']
df = df[df['route_ID'] != '']
df = df[df['sector_ID'] != '']
df['route_ID'].astype(int)

print(len(df.index), 'routes after general missing filters')

168910 after general missing filters


In [10]:
# (6) Taking columns I want and making/saving new dataframe

clean_df = df[['route_name', 'parent_sector', 'route_ID', 'sector_ID', 
               'type_string', 'fa', 'YDS', 'Vermin', 'nopm_YDS', 'nopm_Vermin', 'YDS_rank', 'Vermin_rank', 
               'safety', 'parent_loc', 'description', 'location', 'protection']].copy()

clean_df.to_pickle('Curated_OpenBetaAug2020_RytherAnderson.pkl.zip', compression='zip')