In [1]:
import re
import math
import string
import pickle
import numpy as np
import pandas as pd

from numpy.linalg import norm
from nltk import word_tokenize
from gensim.models import Doc2Vec
from gensim.parsing.preprocessing import remove_stopwords



## Notes
### Route Search
* Routes can be searched based on similarity to phrases, while also subsetting for location, quality, type, grade, etc.
* For example, the user could search for 5.13a sport routes in their area that feature "sustained crimps" and have an RQI above 3.
* This relies on high-quality (i.e. descriptive) route descriptions, which some routes have and some don't.

### Route Profiles
* The doc2vec model can be used to create route profiles based on the similarity of each route's description to keywords.
* For example, similarity to "crimp", "sloper", "jug", etc. can be used to profile the holds of a route.
* Style can also be profiled using keywords like "sustained", "powerful", "thin", etc.
* I have gravitated towards using a word2vec model for this purpose, but the doc2vec model is another (similar) option.

In [2]:
# load the data, model, and key for accessing route IDs

# doc2vec model, read from the working directory; passed to description_search as `model`
model = Doc2Vec.load('doc2vec.model')

# lookup indexed by the model's document IDs, giving route IDs (used that way in description_search)
# NOTE(review): pickle.load can execute arbitrary code -- only open pickle files from a trusted source
with open('docID_2_routeID.pkl', 'rb') as key:
    routeID_key = pickle.load(key)
    
# route table; description_search filters it on its 'route_ID' column
# NOTE(review): read_pickle is pickle-based as well, so the same trust caveat applies
df = pd.read_pickle('Curated_OpenBetaAug2020_RytherAnderson.pkl.zip', compression='zip')

In [3]:
# cleaning function, to clean input strings
def clean_desc(desc):
    
    """
        turns a raw description into the list of tokens used with the doc2vec model

        steps, in order: lowercase, drop stopwords, collapse whitespace, strip digits,
        split hyphenated words, strip punctuation, tokenize, discard one-character tokens

        desc can be any object, it is converted with str() first
        returns a list of string tokens
    """
    
    # (pattern, replacement) pairs, applied in this order after stopword removal
    substitutions = (
        (r'\s+', ' '),                               # multiple spaces converted to single spaces
        ('[0-9]', ''),                               # remove digits
        (r'(?<=\w)-(?=\w)', ' '),                    # dash between word characters replaced with space
        (f'[{re.escape(string.punctuation)}]', ''),  # remove punctuation and special characters
    )
    
    text = remove_stopwords(str(desc).lower())
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    
    # one-character tokens are dropped
    return [token for token in word_tokenize(text) if len(token) > 1]

# search function, compares to documents in the provided model
def description_search(model, desc, routeID_key, route_data, topn=3, epochs=100):
    
    """
        finds the routes whose descriptions are most similar to a free-text phrase

        model       : the doc2vec model; must provide infer_vector() and dv.most_similar()
        desc        : the search phrase (cleaned with clean_desc before inference)
        routeID_key : lookup from the model's document IDs to route IDs
        route_data  : DataFrame with a 'route_ID' column
        topn        : number of nearest documents requested from the model
        epochs      : inference epochs passed to infer_vector (default 100)

        returns (data, sims, desc)
            data : copy of the matching rows of route_data, in the order the model ranked them
            sims : similarity of each returned row, same order and same length as data
            desc : the search phrase, unmodified
    """
    
    tokens = clean_desc(desc)
    inferred_vector = model.infer_vector(tokens, epochs=epochs)
    sims = model.dv.most_similar([inferred_vector], topn=topn)
    
    # similarity and rank per route ID; if several documents map to the same route,
    # the first (highest ranked) hit is kept
    sim_by_route, rank_by_route = {}, {}
    for rank, (dID, sim) in enumerate(sims):
        rID = routeID_key[dID]
        sim_by_route.setdefault(rID, sim)
        rank_by_route.setdefault(rID, rank)
    
    data = route_data[route_data['route_ID'].isin(list(sim_by_route))].copy()
    
    # the boolean filter keeps route_data's row order, not the ranking, so reorder explicitly;
    # otherwise the similarities would be paired with the wrong routes downstream
    order = data['route_ID'].map(rank_by_route).to_numpy().argsort(kind='stable')
    data = data.iloc[order]
    
    # one similarity per returned row (a route ID absent from route_data contributes no row and no similarity)
    return data, [sim_by_route[rID] for rID in data['route_ID']], desc

# writes out relevant data from a route search result, for testing
def parse_search_results(res, width=120):
    
    """
        prints a plain-text report of a route search, for testing

        res   : the (data, sims, desc) tuple returned by description_search
        width : characters per printed line of description text, also the length of the
                separator rule (default 120)

        rows of data are paired with sims by position, so both must be in the same order
        columns read from each row: route_name, type_string, YDS, Vermin, route_ID, description

        returns None, everything is written to stdout
    """
    
    routes, sims, query = res
    rule = '-' * width
    row_format = '{:<40} {:<7} {:<5} {:<10}'
    
    print('Results for the following document:')
    print()
    print('"' + query + '"')
    print()
    print(f'{len(sims)} routes returned')
    for (_, route), sim in zip(routes.iterrows(), sims):
        print(rule)
        print(row_format.format('Name', 'Type', 'Grade', 'ID'))
        # either grade may be missing; pd.notna skips both None and NaN (NaN would break the join)
        grade = ' '.join(g for g in (route['YDS'], route['Vermin']) if pd.notna(g))
        print(row_format.format(route['route_name'][0:40], route['type_string'], grade, route['route_ID']))
        print()
        print(f'DESCRIPTION (similarity = {np.round(sim,3)}):')
        
        # description is joined from a sequence of text fragments; a plain string is used
        # as-is, because joining a string would put a space between every character
        text = route['description']
        if not isinstance(text, str):
            text = ' '.join(text)
        
        # hard wrap at a fixed column, word boundaries are ignored
        for start in range(0, len(text), width):
            print(text[start:start + width])
    print(rule)
    

In [4]:
# test some longer phrases

TP0 = [
'Clean vertical face with small crimps',
'Follow perfect crimps and edges up the vertical face.',
'Make a huge deadpoint off these jugs to a half pad crimp, match, and finish on jugs.',
'A long, thin finger crack.',
'This route may cause you to void your bowels.',
'Just a really massive dyno.',
'Thin face climb up the black streak. Follow sustained edges and crimps to the top.',
'Compression up the refrigerator block to a big move at the lip.',
'A big, scary, runout slab',
'At the bulge, pull through the shouldery gaston crux to arrive at the chains.'
]

# pick the query from the phrase list rather than repeating the literal, so the list
# stays the single source of test phrases; change the index to try another phrase
res = description_search(model, TP0[8], routeID_key, df, topn=3)  # 'A big, scary, runout slab'
parse_search_results(res)


Results for the following document:

"A big, scary, runout slab"

3 routes returned
------------------------------------------------------------------------------------------------------------------------
Name                                     Type    Grade ID        
Ed's Arete Right                         boulder V5    107084467 

DESCRIPTION (similarity = 0.784):
On the main, huge, imposing highball boulder in the cluster (just off of the road to the right).  This is the right aret
e of the main boulder face.  climb high prominent arete to the scary, sketchy, harder than it looks finish way off the d
eck.
------------------------------------------------------------------------------------------------------------------------
Name                                     Type    Grade ID        
Bone Club, The                           trad    5.10b 105858738 

DESCRIPTION (similarity = 0.766):
a steep, slick face with a long, scary runout up to the first bolt
--------------------------