In [1]:
%matplotlib inline

In [1]:
import pandas as pd

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
from statsmodels.distributions.empirical_distribution import ECDF

In [4]:
import pandas as pd
import numpy as np
import re

In [5]:
import itertools

In [8]:
DATA_DIR = '../utah_data/'

In [11]:
print 'Reading climb dataframe pickle from ' + DATA_DIR
climb = pd.read_pickle(DATA_DIR + '_climb_no_children')
print "Shape of climb dataframe is", climb.shape

Reading climb dataframe pickle from ../utah_data/
Shape of climb dataframe is (37001, 42)


In [12]:
href = '/v/pocket-rocket/106297965'
# TODO check if href in climb
href in climb.index

True

In [13]:
def scale01(feature):
    # always use on an entire column
    ecdf = ECDF(feature.dropna())
    qtile = ecdf(feature)
    scaled_feature = pd.Series(qtile, index=feature.index)
    
    # ECDF says NaN is 1.0
    scaled_feature[scaled_feature == 1.0] = float('NaN')
    
    return scaled_feature

In [14]:
def convert_hueco(hueco, LEAST_DIFFICULT=float(0), FIRST_STEP=float(1)):
    """ Takes string grade and converts it to a float """

    if not pd.isnull(hueco):
        
        # trim the prefix before the core grade
        pre = 0
        if re.search(r'^V', hueco):
            pre = 1
        elif re.search(r'^5\.', hueco):
            pre = 2
        hueco = hueco[pre:]
        
        # some special easy cases
        if hueco in ['-easy', '3rd', '4th', 'Easy 5th']:
            # lower bound
            hard = LEAST_DIFFICULT
        else:
            # V0 will be equal to this
            hard = LEAST_DIFFICULT + FIRST_STEP
            # factor in the +/- or abcd
            hard += cast_grade(hueco)
            
        return hard

In [15]:
def cast_grade(hueco):
    """ Deals with grades with endings like abcd and +/- """
    
    # abcd grades YDS grades fall here
    if re.search(r'[abcd]$', hueco):
        pnt = float(hueco.strip('abcd/'))
        adj = adj_from_letter(hueco)
        rate = pnt + adj
    
    # anything ending in +/-
    elif re.search(r'(\+$)|(-$)', hueco):
        pnt = float(hueco[:-1].strip())
        # add/take a half a point off for plus minus
        if hueco[-1] == '+':
            rate = pnt + .5
        else:
            rate = pnt - .5
    
    # hueco ratings often have a range like V3-4
    elif re.search(r'(\d+)[- ]+(\d+)', hueco):
        # return the mean of range
        lower = re.findall(r'\d+', hueco)[0].strip()
        upper = re.findall(r'\d+', hueco)[1].strip()
        rate = np.mean( [float(lower), float(upper)] )
    
    # if we can find any number cast as float ignoring sign
    elif re.search(r'\d+', hueco):
        try:
            core_rate = float(hueco.strip('+-/'))
        except:
            rate = None
        else:
            rate = core_rate
    
    # else nothing recognizable
    else:
        rate = None
        
    return rate

In [16]:
def adj_from_letter(grade):
    letter_jumble = re.sub('[\W\d_]+', '', grade)
    adj = []
    for letter in letter_jumble:
        if letter == 'a':
            adj.append(-.5)
        elif letter == 'b':
            adj.append(-.25)
        elif letter == 'c':
            adj.append(+.25)
        elif letter == 'd':
            adj.append(+.5)
            
    # mean not sum beacuse b/c is easier than c
    return np.mean(adj)

In [17]:
def combine_grade(climb):
    """ Allows mixing of Bouldering and Sport/Trad routes """
    climb['floatHueco'] = map(convert_hueco, climb['rateHueco'])
    climb['pctHueco'] = scale01(climb['floatHueco'])

    climb['floatYDS'] = map(convert_hueco, climb['rateYDS'])
    climb['pctYDS'] = scale01(climb['floatYDS'])

    # might prefer the YDS-esque rating if there is one
    # not many conflicting cases -- max is reasonable assuption
    climb['gradeComb'] = climb[['pctHueco','pctYDS']].max(axis='columns')
    
    return climb['gradeComb']

In [18]:
# TODO graph cumsum percentile of climbing difficulty and YDS

In [19]:
def score(grade, ideal):
    return 1 - abs(ideal - grade)

In [20]:
def castscale(feature_name, climb, href, diff = True):
    casted = climb[feature_name].astype(float)
    scaled = scale01(casted)
    ideal = scaled.loc[href]
    
    if diff:
        scored = map(score, scaled, itertools.repeat(ideal, len(casted)))
        recco = pd.Series(scored, index=climb.index)
    else:
        recco = pd.Series(scaled, index=climb.index)
    return recco

In [21]:
# preprocessing of grades
climb['gradeComb'] = combine_grade(climb)

In [22]:
def combine_text(jess):
        
    # TODO do this with a map
    txt_collect = []
    for txt in ['description', 'other_text']:
        if hasattr(jess, txt):
            if not pd.isnull(jess[txt]):
                txt_collect.append(jess[txt])

    return "\n".join(txt_collect)

def get_total_climb_description(climb):
    collect = []
    for href, cmb in climb.iterrows():
        cmbtxt = combine_text(cmb)
        collect.append(cmbtxt)

    print "Preprocessed %d text segments" % len(collect)

    return collect

In [23]:
descriptive = get_total_climb_description(climb)

Preprocessed 37001 text segments


In [52]:
combined_text = climb['description'].astype(str) + climb['other_text'].astype(str)
combined_text.tolist()

['10 Mile Canyon is the stretch of road on I-70 between Frisco and Copper Mountain. There are several crags on this 5 mile section of I-70. The rock is is a mixture of alpine gneiss and granite with varying degrees of quality. Much cleaning is needed to establish new routes in this area.  Click here for an archived guide to the area.',
 "The Diamond Wall is located at the base of the multi-tiered wall above the bike path parking lot at the Officer's Gulch exit. This crag is in the shade until late in the day and only host [3] challenging sport pitches.... A 500' slab to the left offers many moderate multi-pitch options with very little protection... this  area has potential for many more routes, but the amount of cleaning and work have inhibited any development on this massif of walls. Helmets are highly recommended!AB. Cosmic Charlie, 11-, 1p, 95', bolts. B. Lucy, 11-, 1p, 95', bolts. C. The Plug, 10, 1p, 95', bolts. D. Bombs Away Dad, 6 X.",
 "Cosmic Charlie starts out left off the s

In [42]:
climb[pd.isnull(climb['description'])]['description'] = ''
climb[pd.isnull(climb['other_text'])]['other_text'] = ''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [24]:
tfidf = TfidfVectorizer(
    decode_error='ignore', stop_words='english',
    sublinear_tf=True, ngram_range=(1, 2))
svd = TruncatedSVD(
    n_components=100, random_state=42)

In [25]:
TFIDF = tfidf.fit_transform(descriptive)

In [26]:
shrunk = svd.fit_transform(TFIDF)

In [27]:
shrunk.shape

(37001, 100)

In [28]:
pos = climb.index == href
sim = cosine_similarity(shrunk[pos], shrunk)
sim_score = pd.Series(sim[0], index=climb.index)

In [None]:
def score_boolean(grade, ideal):
    return grade * ideal

In [None]:
climb.columns

In [None]:
def match_type(climb, href):
    """ Give scores for climbs that match bouldering, sport, trad, etc."""
    
    # these columns are True or NaN
    type_of_route = { 'aid': 1, 'alpine': 1, 'boulder': 1, 'sport': 1, 'trad': 1 , 'ice': 1 }
    
    collect = []
    for k, v in type_of_route.items():

        # turn boolean into value of 1
        scaled = pd.notnull(climb[k]).astype(int)
        ideal = climb.loc[href][k]

        # score destinations relative to ideal
        scored = map(score_boolean, scaled, itertools.repeat(ideal, len(scaled)))
        weighted = pd.Series(np.multiply(scored, v), index=climb.index)

        collect.append(weighted)

    # combine dimensions into one score
    all_type_score = pd.concat(collect, axis = 1)
    type_score = all_type_score.sum(axis='columns')
    return type_score / max(type_score)

In [None]:
type_score = match_type(climb, href)

In [None]:
# score columns!
height_score = castscale('feet', climb, href)
grade_score = castscale('gradeComb', climb, href)
star_score = castscale('staraverage', climb, href, False)
vote_score = castscale('starvotes', climb, href, False)

In [None]:
# aggregate scores
charlie = pd.DataFrame({
        'height': height_score,
        'grade': grade_score,
        'stars': star_score,
        'votes': vote_score,
        'sim': sim_score,
        'type': type_score
    })

# multiply weights
charlie['sim'] = charlie['sim'] * 25
charlie['grade'] = charlie['grade'] * 15
charlie['height'] = charlie['height'] * 5
charlie['stars'] = charlie['stars'] * 15
charlie['votes'] = charlie['votes'] * 5
charlie['type'] = charlie['type'] * 10


charlie['best'] = charlie.sum(axis='columns')
charlie.sort_values('best', ascending=False, inplace=True)


In [None]:
recco_href = charlie.index[:10]
climb.loc[recco_href][['rateYDS','rateHueco','staraverage','votes','description']]