In [2]:
import itertools
import pandas as pd
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
climb = pd.read_csv('_climb_test', index_col=0)

In [4]:
def scale01(feature):
    # always use on an entire column
    ecdf = ECDF(feature.dropna())

    # fit these observations
    qtile = ecdf(feature)
    scaled_feature = pd.Series(qtile, index=feature.index)
    
    # ECDF says NaN is 1.0
    scaled_feature[pd.isnull(feature)] = float('NaN')
    
    return scaled_feature

In [5]:
def create_scorer(col, feature):
    
    if col in ['aid','alpine','boulder','sport','trad','ice']:

        # turn boolean into value of 1
        casted = pd.notnull(feature)
        scaled = casted.astype(int)
        
        def scorer(grade, ideal):
            return ideal * grade
        
        return casted, scaled, scorer
    
    elif col in ['feet', 'pitches', 'gradeComb']:
        
        casted = feature.astype(float)
        scaled = scale01(casted)
        
        def scorer(grade, ideal):
            return 1 - abs(ideal - grade)
        
        return casted, scaled, scorer
    
    elif col in ['staraverage', 'starvotes']:
        
        casted = feature.astype(float)
        scaled = scale01(casted)
        
        def scorer(grade, ideal):
            return grade
        
        return casted, scaled, scorer
    
    elif col in ['description']:
        
        # isolate description we care about
        combined = climb['description'].astype(str) + climb['other_text'].astype(str)
        descriptive = combined.tolist()

        # lemmatize, tokenize, vectorize text
        tfidf = TfidfVectorizer(
            decode_error='ignore', stop_words='english',
            # max_df = 0.5,
            sublinear_tf=True, ngram_range=(1, 2))
        TFIDF = tfidf.fit_transform(descriptive)

        # reduce dimensionality
        svd = TruncatedSVD(
            n_components=100, random_state=42)
        shrunk = svd.fit_transform(TFIDF)
        shrunk = pd.DataFrame(shrunk, index=feature.index)

        def scorer(shrunk, ideal):
            sim = cosine_similarity(ideal, shrunk)
            sim_score = pd.Series(sim[0], index=climb.index)
            return sim_score.tolist()
        
        return combined, shrunk, scorer
    
    else:
        return None, None, None

In [6]:
def create_recommendation_system(climb):

    FIT = {}
    for col, feature in climb.iteritems():
        casted, scaled, scorer = create_scorer(col, feature)
        if casted is not None:
            scoring_package = { col: [casted, scaled, scorer] }
            FIT.update(scoring_package)
            
    return FIT

In [7]:
FIT = create_recommendation_system(climb)

In [8]:
href = '/v/empire-of-the-fenceless/105756790'

In [9]:
def give_recommendation(FIT, href, top=100):
    """Given fit of the various columns as a dict, score this href """
    score_collect = {}
    for col, fit in FIT.items():
        casted, scaled, scorer = fit
        ideal = scaled.loc[href]
        if len(scaled.shape) == 1:
            score = map(scorer, scaled, itertools.repeat(ideal, len(scaled)))
        else:
            score = scorer(scaled, ideal)
            
        score_collect.update({ col: score })

    charlie = pd.DataFrame(score_collect, index=climb.index)
    
    # multiply weights
    charlie['description'] = charlie['description'] * 35
    charlie['gradeComb'] = charlie['gradeComb'] * 12
    charlie['feet'] = charlie['feet'] * 10
    charlie['pitches'] = charlie['pitches'] * 3
    charlie['staraverage'] = charlie['staraverage'] * 15
    charlie['starvotes'] = charlie['starvotes'] * 5
    charlie['boulder'] = charlie['boulder'] * 2
    charlie['sport'] = charlie['sport'] * 2
    charlie['trad'] = charlie['trad'] * 2
    charlie['ice'] = charlie['ice'] * 4
    
    charlie['best'] = charlie.sum(axis=1)
    charlie.sort_values('best', ascending=False, inplace=True)

    return charlie[:top]

In [44]:
top_recco = give_recommendation(FIT, href)



In [55]:
top_recco = give_recommendation(FIT, href)
top_recco['suggestion_for'] = href
with open('top_recco.csv', 'wb') as f:
    top_recco.to_csv(f, header=False, index_col=True)



In [16]:
import re
def find_id(href):
    return re.search(r'(\d+)$',href).group(1)
climb['href_id'] = map(find_id, climb['href'])

In [17]:
climb['url'] = 'http://www.mountainproject.com' + climb['href']

In [21]:
scores = top_recco[['description','gradeComb','staraverage','best']]

info = climb[['href_id','name','url','rateYDS','feet']]
jnd = pd.concat([info, scores], axis=1, join='inner')
jnd = jnd.where((pd.notnull(jnd)), None)
tuples = [tuple(x) for x in jnd.values[:5]]
print tuples

[('105756790', 'Empire of the Fenceless', 'http://www.mountainproject.com/v/empire-of-the-fenceless/105756790', '5.12a', None, 35.0, 12.0, 13.872151031986698, 70.84834849522319), ('106207686', 'The Shaft', 'http://www.mountainproject.com/v/the-shaft/106207686', '5.12b', 90.0, 19.073171038164073, 11.623219856232197, 14.270272914017413, 54.863302107673526), ('105752446', 'Subterfuge', 'http://www.mountainproject.com/v/subterfuge/105752446', '5.11d', 40.0, 19.423500540256736, 12.0, 15.0, 54.82318752059519), ('105749779', 'Global Gorilla', 'http://www.mountainproject.com/v/global-gorilla/105749779', '5.12c', 170.0, 17.425296643334462, 11.239115692391156, 13.562065929766215, 52.083173951689616), ('106412546', 'Bridge of Air', 'http://www.mountainproject.com/v/bridge-of-air/106412546', '5.12-', 135.0, 16.842484525597815, 12.0, 14.270272914017413, 51.512444419953674)]


In [19]:
jnd.columns

Index([u'href_id', u'name', u'url', u'rateYDS', u'feet', u'description',
       u'gradeComb', u'staraverage', u'best'],
      dtype='object')