In [42]:
import itertools
import pandas as pd
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [43]:
climb = pd.read_csv('_climb_test', index_col=0)

In [44]:
climb['all_description'] = climb['description'].astype(str) + climb['other_text'].astype(str)

In [45]:
def scale01(feature):
    # always use on an entire column
    ecdf = ECDF(feature.dropna())

    # fit these observations
    qtile = ecdf(feature)
    scaled_feature = pd.Series(qtile, index=feature.index)
    
    # ECDF says NaN is 1.0
    scaled_feature[pd.isnull(feature)] = float('NaN')
    
    return scaled_feature

In [46]:
def create_scorer(col, feature):
    
    if col in ['aid','alpine','boulder','sport','trad','ice']:

        # turn boolean into value of 1
        casted = pd.notnull(feature)
        scaled = casted.astype(int)
        
        def scorer(grade, ideal):
            return ideal * grade
        
        return casted, scaled, scorer
    
    elif col in ['feet', 'pitches', 'gradeComb']:
        
        casted = feature.astype(float)
        scaled = scale01(casted)
        
        def scorer(grade, ideal):
            return 1 - abs(ideal - grade)
        
        return casted, scaled, scorer
    
    elif col in ['staraverage', 'starvotes']:
        
        casted = feature.astype(float)
        scaled = scale01(casted)
        
        def scorer(grade, ideal):
            return grade
        
        return casted, scaled, scorer
    
    elif col in ['description']:
        
        # isolate description we care about
        combined = climb['description'].astype(str) + climb['other_text'].astype(str)
        descriptive = combined.tolist()

        # lemmatize, tokenize, vectorize text
        tfidf = TfidfVectorizer(
            decode_error='ignore', stop_words='english',
            # max_df = 0.5,
            sublinear_tf=True, ngram_range=(1, 2))
        TFIDF = tfidf.fit_transform(descriptive)

        # reduce dimensionality
        svd = TruncatedSVD(
            n_components=100, random_state=42)
        shrunk = svd.fit_transform(TFIDF)
        shrunk = pd.DataFrame(shrunk, index=feature.index)

        def scorer(shrunk, ideal):
            sim = cosine_similarity(ideal, shrunk)
            sim_score = pd.Series(sim[0], index=climb.index)
            return sim_score.tolist()
        
        return combined, shrunk, scorer
    
    else:
        return None, None, None

In [47]:
def create_recommendation_system(climb):

    FIT = {}
    for col, feature in climb.iteritems():
        casted, scaled, scorer = create_scorer(col, feature)
        if casted is not None:
            scoring_package = { col: [casted, scaled, scorer] }
            FIT.update(scoring_package)
            
    return FIT

In [48]:
FIT = create_recommendation_system(climb)

In [52]:
href = '/v/empire-of-the-fenceless/105756790'

In [53]:
def give_recommendation(FIT, href):
    """Given FIT of the various columns as a dict, score this href """
    score_collect = {}
    for col, fit in FIT.items():
        print col
        casted, scaled, scorer = fit
        ideal = scaled.loc[href]
        if len(scaled.shape) == 1:
            score = map(scorer, scaled, itertools.repeat(ideal, len(scaled)))
        else:
            score = scorer(scaled, ideal)
            
        score_collect.update({ col: score })

    charlie = pd.DataFrame(score_collect, index=climb.index)
    
    # multiply weights
    charlie['description'] = charlie['description'] * 40
    charlie['gradeComb'] = charlie['gradeComb'] * 12
    charlie['feet'] = charlie['feet'] * 5
    charlie['pitches'] = charlie['pitches'] * 3
    charlie['staraverage'] = charlie['staraverage'] * 15
    charlie['starvotes'] = charlie['starvotes'] * 5
    charlie['boulder'] = charlie['boulder'] * 2
    charlie['sport'] = charlie['sport'] * 2
    charlie['trad'] = charlie['trad'] * 2
    charlie['ice'] = charlie['ice'] * 4
    
    
    charlie['best'] = charlie.sum(axis=1)
    charlie.sort_values('best', ascending=False, inplace=True)

    print charlie[:10]
    return charlie.index

In [54]:
give_recommendation(FIT, href)

boulder
trad
description
staraverage
gradeComb
pitches
starvotes
feet
ice
aid
sport
alpine




                                      aid  alpine  boulder  description  feet  \
/v/empire-of-the-fenceless/105756790    0       0        0    40.000000   NaN   
/v/subterfuge/105752446                 0       0        0    22.198286   NaN   
/v/the-shaft/106207686                  0       0        0    21.797910   NaN   
/v/global-gorilla/105749779             0       0        0    19.914625   NaN   
/v/bridge-of-air/106412546              0       0        0    19.248554   NaN   
/v/castles-made-of-sand/105753130       0       0        0    19.167029   NaN   
/v/temptation-arete/105758074           0       0        0    18.176077   NaN   
/v/lost-planet-airman/105750052         0       0        0    18.021196   NaN   
/v/wet-dream/105748415                  0       0        0    16.275296   NaN   
/v/jewel-of-the-wild/105753214          0       0        0    17.274108   NaN   

                                      gradeComb  ice  pitches  sport  \
/v/empire-of-the-fenceless/105756790

Index([u'/v/empire-of-the-fenceless/105756790', u'/v/subterfuge/105752446',
       u'/v/the-shaft/106207686', u'/v/global-gorilla/105749779',
       u'/v/bridge-of-air/106412546', u'/v/castles-made-of-sand/105753130',
       u'/v/temptation-arete/105758074', u'/v/lost-planet-airman/105750052',
       u'/v/wet-dream/105748415', u'/v/jewel-of-the-wild/105753214',
       ...
       u'/v/gods-cave/106347664', u'/v/boulder-c/106353478',
       u'/v/portwoods-cave/107697060', u'/v/el-diablo-wall/105837128',
       u'/v/layback-cave/110764633', u'/v/hall-of-justice/107954778',
       u'/v/crack-hole/107448813', u'/v/big-mouth-crack-area/107340986',
       u'/v/utah-hills/109583182', u'/v/derailed-train/105804702'],
      dtype='object', length=37001)