In [1]:
%matplotlib inline

In [17]:
import pickle
import pandas as pd
import numpy as np
import Destination as Destination

In [18]:
# Directory holding the pickled input data. Keep the trailing slash: file
# paths are built from this value by plain string concatenation.
DATA_DIR = '../utah_data/'

In [19]:
# Load the pickled climb DataFrame from DATA_DIR.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only point DATA_DIR at data you produced yourself or otherwise trust.
# Each print call is given one pre-formatted string so the emitted text is the
# same under Python 2 and Python 3.
print('Reading climb dataframe pickle from ' + DATA_DIR)
climb = pd.read_pickle(DATA_DIR + '_climb_dataframe')
print('Shape of climb dataframe is %s' % (climb.shape,))

Reading climb dataframe pickle from ../utah_data/
Shape of climb dataframe is (37001, 43)


In [21]:
# Sanity check that the unpickled object really is a DataFrame.
type(climb)

pandas.core.frame.DataFrame

In [22]:
# Pull out the grade columns: every column whose name starts with "rate".
# (`re` is imported here and is also relied on by the cells further down.)
import re
rate = list(filter(lambda name: re.search(r'^rate', name), climb.columns))
grade = climb[rate]
grade.shape

(37001, 8)

In [24]:
# Discard climbs with no grade in any rating column (how='all' only drops
# rows in which every column is NaN).
grade = grade.dropna(axis=0, how='all')

In [135]:
import numpy as np
def convert_hueco(hueco, LEAST_DIFFICULT=float(0), FIRST_STEP=float(1)):
    """Convert a grade string to a float difficulty score.

    A leading 'V' or '5.' prefix is stripped and the remaining core grade is
    scored by ``cast_grade``.

    Parameters
    ----------
    hueco : str or null
        Raw grade, e.g. 'V0-', 'V3-4', '5.10a', '3rd'.
    LEAST_DIFFICULT : float
        Score given to the special "easy" grades; the floor of the scale.
    FIRST_STEP : float
        Offset added on top of LEAST_DIFFICULT so that a core grade of 0
        (e.g. 'V0') scores LEAST_DIFFICULT + FIRST_STEP.

    Returns
    -------
    float or None
        The numeric score, or None when the input is null or when
        ``cast_grade`` cannot recognise the core grade.
    """
    if pd.isnull(hueco):
        return None

    # Trim the grading-system prefix so only the core grade remains.
    if hueco.startswith('V'):
        core = hueco[1:]
    elif hueco.startswith('5.'):
        core = hueco[2:]
    else:
        core = hueco

    # Special easy grades sit at the lower bound of the scale.
    if core in ('-easy', '3rd', '4th', 'Easy 5th'):
        return LEAST_DIFFICULT

    # cast_grade returns None for anything it cannot parse; propagate that
    # instead of failing on `float + None`.
    offset = cast_grade(core)
    if offset is None:
        return None
    return LEAST_DIFFICULT + FIRST_STEP + offset

In [144]:
def _float_or_none(text):
    """Parse ``text`` as a float, returning None when it is not numeric."""
    try:
        return float(text)
    except ValueError:
        return None


def cast_grade(hueco):
    """Turn a prefix-stripped grade into a number, honouring its suffix.

    Parameters
    ----------
    hueco : str
        Core grade with any 'V' / '5.' prefix already removed,
        e.g. '10a', '10b/c', '0-', '10+', '3-4', '11'.

    Returns
    -------
    float or None
        * letter grades ('10a', '10b/c'): the number plus the adjustment
          from ``adj_from_letter``;
        * trailing '+' / '-': the number plus / minus 0.5;
        * ranges ('3-4'): the mean of the two numbers forming the range;
        * a bare number: that number;
        * None for anything that cannot be parsed.
    """
    # Letter-suffixed grades (a/b/c/d, possibly combined as in "b/c").
    if re.search(r'[abcd]$', hueco):
        pnt = _float_or_none(hueco.strip('abcd/'))
        if pnt is None:
            return None
        return pnt + adj_from_letter(hueco)

    # Anything ending in + or -: add / take half a point.
    if re.search(r'(\+$)|(-$)', hueco):
        pnt = _float_or_none(hueco[:-1].strip())
        if pnt is None:
            return None
        return pnt + .5 if hueco[-1] == '+' else pnt - .5

    # Ranges such as "3-4": average the two numbers that were actually
    # matched as the range, not merely the first two numbers in the string.
    span = re.search(r'(\d+)[- ]+(\d+)', hueco)
    if span:
        return np.mean([float(span.group(1)), float(span.group(2))])

    # Any remaining string containing a number: cast it, ignoring stray signs.
    if re.search(r'\d+', hueco):
        return _float_or_none(hueco.strip('+-/'))

    # Nothing recognisable.
    return None

In [149]:
# Spot check: "V0-" is 0, plus FIRST_STEP (1), minus 0.5 for the "-" -> 0.5
convert_hueco('V0-')

0.5

In [150]:
def adj_from_letter(grade):
    """Return the fractional adjustment implied by a grade's a/b/c/d letters.

    Each letter maps to an offset (a: -0.5, b: -0.25, c: +0.25, d: +0.5).
    When several letters are present (e.g. '10b/c') the offsets are averaged
    rather than summed, because b/c is easier than c. All other characters
    are ignored.

    Parameters
    ----------
    grade : str
        Grade text such as '10a' or '10b/c'.

    Returns
    -------
    float
        The mean offset of the letters found, or 0.0 when the grade contains
        none of a/b/c/d (no letter means no adjustment).
    """
    offsets = {'a': -.5, 'b': -.25, 'c': +.25, 'd': +.5}
    adj = [offsets[ch] for ch in grade if ch in offsets]
    if not adj:
        # Avoid np.mean([]) which yields nan plus a RuntimeWarning.
        return 0.0
    # mean not sum because b/c is easier than c
    return np.mean(adj)

In [151]:
# Spot check: a range is averaged, (3 + 4) / 2 = 3.5, then FIRST_STEP (1) is added -> 4.5
convert_hueco('V3-4')

4.5

In [152]:
# Spot check: a trailing "+" adds half a grade, 10 + 0.5, plus FIRST_STEP (1) -> 11.5
convert_hueco('10+')

11.5

In [156]:
# Spot check: a bare number only receives the FIRST_STEP shift, 11 + 1 -> 12.0
convert_hueco('11')

12.0

In [154]:
# Convert every raw grade string to a float score. Series.map is used rather
# than the builtin map(), which returns a lazy iterator instead of a list on
# Python 3; the result stays aligned with climb's index.
climb['floatHueco'] = climb['rateHueco'].map(convert_hueco)

In [155]:
# Median converted grade over the climbs that have one (NaN rows are dropped first).
np.median(climb['floatHueco'].dropna())

4.0

In [145]:
def convert_ZA(ZA):
    """Cast a ZA grade to float; null inputs yield None."""
    if pd.isnull(ZA):
        return None
    return float(ZA)
# convert_lead(climb['floatZA'])

In [146]:
from statsmodels.distributions.empirical_distribution import ECDF
def scale01(feature):
    """Map every value of a column to its empirical percentile in (0, 1].

    The percentile of a value is the share of non-null observations that are
    less than or equal to it (the empirical CDF). Use on an entire column.

    Parameters
    ----------
    feature : pandas.Series (or any sequence pandas can cast to float)
        Numeric column; null entries are allowed.

    Returns
    -------
    numpy.ndarray
        One percentile per input position. Null inputs stay NaN. The largest
        observed value keeps its percentile of 1.0: only genuinely missing
        entries are NaN, so the hardest grades are not discarded.
    """
    values = pd.Series(feature, dtype=float)
    # method='max' gives tied values the rank of the last of them, i.e. the
    # count of observations <= value; pct=True divides by the non-null count.
    # Nulls are left as NaN by rank().
    return values.rank(method='max', pct=True).values

In [147]:
# Percentile-scale both grading systems.
# 'floatZA' is built here because no earlier cell in this notebook creates
# it; without this line a fresh top-to-bottom run fails on the lookup below.
climb['pctHueco'] = scale01(climb['floatHueco'])
climb['floatZA'] = climb['rateZA'].map(convert_ZA)
climb['pctZA'] = scale01(climb['floatZA'])

In [148]:
# Combined grade: row-wise mean of the two percentile columns. DataFrame.mean
# skips NaN by default, so a climb graded in only one system keeps that
# system's percentile; it is NaN only when both are missing.
climb['gradeComb'] = climb[['pctHueco','pctZA']].mean(axis='columns')

In [149]:
# climb['gradeComb'].hist()

In [150]:
# IDEAL IS A BETA DISTRIBUTION FOR PRIOR

In [151]:
def score_climb_grade(grade, ideal_grade, ABOVE_DECAY=5, BELOW_DECAY=2):
    """Score how well a climb's grade matches an ideal grade.

    The score is 1.0 for an exact match and decays as ``(1 - gap) ** decay``
    where ``gap`` is the absolute difference between the two grades. Climbs
    harder than the ideal use ABOVE_DECAY, easier ones use BELOW_DECAY.

    Parameters
    ----------
    grade : float
        Grade of the candidate climb (the notebook passes percentiles).
    ideal_grade : float
        Grade being targeted.
    ABOVE_DECAY : number
        Exponent applied when ``grade`` is above ``ideal_grade``.
    BELOW_DECAY : number
        Exponent applied when ``grade`` is below ``ideal_grade``.

    Returns
    -------
    float
        1.0 on an exact match, otherwise the decayed closeness. A gap larger
        than 1 scores 0.0 rather than producing a negative or growing value.
        NaN in either argument yields NaN.
    """
    if grade == ideal_grade:
        return 1.0

    if grade > ideal_grade:
        gap, decay = grade - ideal_grade, ABOVE_DECAY
    else:
        gap, decay = ideal_grade - grade, BELOW_DECAY

    closeness = 1 - gap
    # Clamp at zero. NaN fails this comparison, so it still propagates.
    if closeness < 0:
        closeness = 0.0
    return closeness ** decay

# grade = np.array([.5,.7,.3,.6,.9])

In [165]:
# First ten climbs, kept as a small sample for eyeballing the combined grade.
# .iloc makes the positional slice explicit. The trailing expression displays
# the gradeComb column, which is what this cell's recorded output shows.
sample = climb.iloc[:10]
sample['gradeComb']


/v/beaver-boulder/108131427                          NaN
/v/spider-man-/108131436                        0.421247
/v/burr-trail--long-canyon/105973758                 NaN
/v/58-corner/106448362                          0.493234
/v/bastard-child/105973780                      0.509457
/v/beam-bump/105973810                          0.304860
/v/flipper-babies/105973765                     0.493234
/v/longs-lies/105973794                         0.693831
/v/capitol-reef-national-park/105716802              NaN
/v/basketball-wallslickrock-divide/105968325         NaN
Name: gradeComb, dtype: float64

In [142]:
from time import time

In [172]:
def grade_similarity(climb):
    """Build the pairwise grade-similarity matrix for a set of climbs.

    Parameters
    ----------
    climb : pandas.DataFrame
        Must contain a 'gradeComb' column; its index labels the climbs.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and columned by ``climb.index``. The entry at
        (row r, column c) is ``score_climb_grade(grade of r, grade of c)``,
        i.e. each column treats its own climb's grade as the ideal. An empty
        input yields an empty (0, 0) frame.

    Notes
    -----
    The result has len(climb) ** 2 entries and is filled with one Python call
    per entry, so cost grows quadratically with the number of climbs.
    """
    grades = list(climb['gradeComb'])
    size = len(grades)

    # Fill one pre-allocated array and wrap it once, instead of building a
    # one-column DataFrame per climb and concatenating them.
    matrix = np.empty((size, size), dtype=float)
    for col, ideal_grade in enumerate(grades):
        matrix[:, col] = [score_climb_grade(g, ideal_grade) for g in grades]

    return pd.DataFrame(matrix, index=climb.index, columns=climb.index)

In [173]:
# End-to-end run: convert both grade systems to floats, percentile-scale
# them, combine them, then build the pairwise similarity matrix, timing each
# stage. The print calls take a single pre-formatted string so they emit the
# same text under Python 2 and Python 3, and Series.map replaces the builtin
# map(), which is a lazy iterator on Python 3.
print('Converting grades to float...')
t0 = time()
climb['floatHueco'] = climb['rateHueco'].map(convert_hueco)
climb['pctHueco'] = scale01(climb['floatHueco'])

climb['floatZA'] = climb['rateZA'].map(convert_ZA)
climb['pctZA'] = scale01(climb['floatZA'])

climb['gradeComb'] = climb[['pctHueco','pctZA']].mean(axis='columns')
print('took %0.2f seconds' % (time() - t0))

# grade_similarity scores every climb against every other climb, so its cost
# and memory grow with len(climb) squared.
print('Generating grade similarity matrix...')
t0 = time()
grade_matrix = grade_similarity(climb)
print('took %d seconds' % (time() - t0))

Generating grade similarity matrix
Cross Grading 1000 records took 1 seconds


In [175]:
# Round-trip the matrix through a pickle to confirm it reloads intact.
# NOTE(review): the file is written to the current working directory rather
# than DATA_DIR - confirm that is intended.
grade_matrix.to_pickle('_grade_matrix')
x = pd.read_pickle('_grade_matrix')

# Single pre-formatted string: same text under Python 2 and Python 3.
print('X is %s %s' % (x.shape, type(x)))

X is (1000, 1000) <class 'pandas.core.frame.DataFrame'>
