In [1]:
import re
import math
import string
import pickle
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from gensim.models import Word2Vec
from gensim.models.phrases import Phraser
from gensim.parsing.preprocessing import remove_stopwords
from numpy.linalg import norm
from sklearn.preprocessing import MinMaxScaler



In [6]:
# Load the pre-trained artifacts every later cell depends on. This cell has to run
# before any of the helper functions below are *called* on a fresh kernel.
# - model: Word2Vec embedding; only its `.wv` keyed vectors are used below.
# - bigram_mod / trigram_mod: phrase models applied to tokenised descriptions.
model = Word2Vec.load('word2vec.model')
bigram_mod = Phraser.load('bigram.model')
trigram_mod = Phraser.load('trigram.model')
# Route table. The functions below read the columns route_ID, description,
# route_name, nopm_YDS and nopm_Vermin from it.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
df = pd.read_pickle('Curated_OpenBetaAug2020_RytherAnderson.pkl.zip', compression='zip')

In [2]:
def cosine_similarity(vec0, vec1):
    """Return the cosine of the angle between two vectors.

    Computed as ``dot(vec0, vec1) / (|vec0| * |vec1|)``. There is no guard
    for zero-length vectors: if either norm is 0 the denominator is 0.
    """
    dot_product = np.dot(vec0, vec1)
    magnitude = norm(vec0) * norm(vec1)
    return dot_product / magnitude

def get_route_desc_words(routeID, df, bigram_mod, trigram_mod):
    """Return the cleaned, phrase-merged token sequence of one route's description.

    Parameters
    ----------
    routeID : value whose ``str()`` form is compared against ``df.route_ID``.
    df : DataFrame with ``route_ID`` and ``description`` columns.
    bigram_mod, trigram_mod : phrase models applied by indexing (``mod[tokens]``).

    Returns
    -------
    Whatever ``trigram_mod[...]`` yields for the token list.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested route ID.

    Notes
    -----
    The first matching row's description is joined with single spaces before
    cleaning -- assumes it is an iterable of strings (TODO confirm).
    """
    matching_descs = df.loc[df.route_ID == str(routeID), 'description']
    text = ' '.join(list(matching_descs)[0]).lower()
    text = remove_stopwords(text)

    # Applied in exactly this order: collapse whitespace, drop digits,
    # turn intra-word dashes into spaces, strip all punctuation.
    substitutions = (
        (r'\s+', ' '),
        ('[0-9]', ''),
        (r'(?<=\w)-(?=\w)', ' '),
        (f'[{re.escape(string.punctuation)}]', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    tokens = text.split()
    bigrammed = bigram_mod[tokens]
    # The bigram model is deliberately applied a second time here, exactly as
    # the original pipeline did, before the trigram model runs.
    return trigram_mod[bigram_mod[bigrammed]]

def get_word_similarity(word0, word1, model):
    """Return the cosine similarity between the embeddings of two words.

    Both vectors come from ``model.wv.get_vector``; whatever that lookup
    raises for an unknown word propagates to the caller.
    """
    first_vec, second_vec = (model.wv.get_vector(w) for w in (word0, word1))
    return cosine_similarity(first_vec, second_vec)

def desc_similarity2words(words, desc, model):
    """Score how strongly a description relates to each keyword.

    For every keyword the score is the *maximum* cosine similarity between the
    keyword's vector and the vector of any description token known to the model.

    Parameters
    ----------
    words : iterable of str
        Keywords to score. Duplicates collapse to one row (first-seen order).
    desc : iterable of str
        Description tokens. Tokens for which ``model.wv.get_vector`` raises
        ``KeyError`` are skipped.
    model : object exposing ``model.wv.get_vector(word)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``key_word`` and ``agg_sim``, one row per unique keyword.
        ``agg_sim`` is NaN when no description token could be looked up
        (previously this case crashed inside ``np.max`` on an empty list).

    Raises
    ------
    KeyError
        If a keyword itself has no vector in the model.
    """
    # Look each description token up once instead of once per keyword.
    desc_vecs = []
    for desc_word in desc:
        try:
            desc_vecs.append(model.wv.get_vector(desc_word))
        except KeyError:
            continue

    rows = []
    for word in dict.fromkeys(words):  # unique keywords, original order
        key_vec = model.wv.get_vector(word)
        sims = [cosine_similarity(key_vec, vec) for vec in desc_vecs]
        # np.max raises on an empty sequence, so report "no evidence" as NaN.
        rows.append([word, np.max(sims) if sims else np.nan])

    return pd.DataFrame(rows, columns=['key_word', 'agg_sim'])

def get_closest_words2avg_desc(desc, model, N=10):
    """Return the ``N`` vocabulary entries most similar to a description as a whole.

    Parameters
    ----------
    desc : iterable of str
        Description tokens. Tokens for which ``model.wv.get_vector`` raises
        ``KeyError`` are ignored.
    model : object exposing ``model.wv.get_vector`` and ``model.wv.most_similar``.
    N : int, default 10
        Forwarded as ``topn``.

    Returns
    -------
    list
        The result of ``model.wv.most_similar(positive=<known tokens>, topn=N)``,
        or an empty list when none of the tokens is known to the model
        (an empty ``positive`` list would otherwise make the query fail).
    """
    used_words = []
    for word in desc:
        try:
            model.wv.get_vector(word)  # membership probe only; the vector is not needed
        except KeyError:
            continue
        used_words.append(word)

    if not used_words:
        return []

    return model.wv.most_similar(positive=used_words, topn=N)

In [3]:
# Every keyword a route description is scored against. The same entries are
# repeated further down as the separate lists general_descriptors and
# rock_features, so keep the three lists in sync when editing.
all_words = [
             # general descriptors
             'dynamic',
             'powerful',
             'technical',
             'sustained',
             'pump',
             'body_tension',
             'scary',
             'chossy',
             'fun',
             
             # rock features
             'dihedral',
             'arete',
             'roof',
             'slab',
             'crack'
             ]

# One-off baseline computation, kept commented out because it loops over every
# route: for each route whose cleaned description has more than 10 tokens it
# takes the per-keyword max similarity, then averages those over all routes and
# caches the result in avg_word_sims.csv. Un-comment to regenerate the cache.
# NOTE(review): to_csv is called without index=False, so the cached file
# presumably carries an extra unnamed index column when read back.
# all_word_sims = []
# for route_ID in df.route_ID:
#     
#     desc_words = get_route_desc_words(route_ID, df, bigram_mod, trigram_mod)
#     
#     if len(desc_words) > 10:
#         sims = desc_similarity2words(all_words, desc_words, model)
#         all_word_sims.append(sims.agg_sim.values)
#     
# avg_word_sims = np.average(np.array(all_word_sims), axis=0)
# avg_word_sims = pd.DataFrame({'key_word': all_words, 'avg_agg_sim': avg_word_sims})
# avg_word_sims.to_csv('avg_word_sims.csv')

# Cached corpus-wide baseline; the code below uses its key_word and avg_agg_sim columns.
avg_word_sims = pd.read_csv('avg_word_sims.csv')

In [45]:
def route_keyword_distance_deviations(route_ID, avg_distances, df, model, words):
    """Return how far one route's keyword similarities sit from the corpus baseline.

    Parameters
    ----------
    route_ID : route identifier, forwarded to ``get_route_desc_words``.
    avg_distances : DataFrame with ``key_word`` and ``avg_agg_sim`` columns.
    df : route table, forwarded to ``get_route_desc_words``.
    model : embedding model, forwarded to ``desc_similarity2words``.
    words : keywords to score.

    Returns
    -------
    pandas.DataFrame
        A fresh frame with columns ``key_word`` and ``deviation_from_avg``.
        The deviation (route similarity minus baseline) is shifted so that the
        smallest value in the frame is exactly 0.1, i.e. values are only
        comparable within a single route.

    Notes
    -----
    ``bigram_mod`` and ``trigram_mod`` are not parameters; they are read from
    module scope and must already be loaded. Keywords missing from
    ``avg_distances`` drop out in the merge on ``key_word``.
    """
    tokens = get_route_desc_words(route_ID, df, bigram_mod, trigram_mod)
    merged = desc_similarity2words(words, tokens, model).merge(avg_distances, on='key_word')

    raw_deviation = merged.agg_sim - merged.avg_agg_sim
    merged['deviation_from_avg'] = raw_deviation - raw_deviation.min() + 0.1

    return merged[['key_word', 'deviation_from_avg']].copy()

# Keyword group describing climbing style / character; these are the keys
# that label_map (defined in a later cell) translates into display labels.
general_descriptors = [
    'dynamic',
    'powerful',
    'technical',
    'sustained',
    'pump',
    'body_tension',
    'scary',
    'chossy',
    'fun']

# Keyword group describing rock features.
rock_features = [
    'dihedral',
    'arete',
    'roof',
    'slab',
    'crack']
    
# Demo: rock-feature profile of route 106812289; the cell displays the returned frame.
route_keyword_distance_deviations(106812289, avg_word_sims, df, model, rock_features)

Unnamed: 0,key_word,deviation_from_avg
0,dihedral,0.25785
1,arete,0.284088
2,roof,0.646592
3,slab,0.63701
4,crack,0.1


In [55]:
def _grade_text(value):
    """Normalise one grade cell to a string, mapping missing values to ''."""
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return ''
    return str(value) if value else ''


def _route_legend_label(route_rows):
    """Build the legend label for a route from the first row of ``route_rows``.

    Yields ``'<name> (<YDS>/<Vermin>)'`` when both grades are present,
    ``'<name> (<grade>)'`` when only one is, and just ``<name>`` when neither is.
    """
    name = route_rows.route_name.values[0]
    raw_grades = (route_rows.nopm_YDS.values[0], route_rows.nopm_Vermin.values[0])
    grades = [text for text in map(_grade_text, raw_grades) if text]
    if not grades:
        return name
    joined = '/'.join(grades)
    return f'{name} ({joined})'


def multi_route_radar_plot(route_IDs, df, model, comp_words, label_map):
    """Overlay the keyword profiles of several routes on one radar chart.

    Parameters
    ----------
    route_IDs : iterable
        Route identifiers; each is converted with ``str()`` before lookup.
    df : DataFrame with ``route_ID``, ``route_name``, ``nopm_YDS`` and
        ``nopm_Vermin`` columns (plus whatever the description lookup needs).
    model : embedding model forwarded to ``route_keyword_distance_deviations``.
    comp_words : keywords that form the radar axes.
    label_map : dict mapping raw keywords to display labels; keywords without
        an entry are shown unchanged.

    Returns
    -------
    plotly.graph_objects.Figure
        One ``Scatterpolar`` trace per route.

    Notes
    -----
    The baseline is re-read from ``avg_word_sims.csv`` on every call.
    Legend labels fix an inverted condition in the earlier version, which
    showed no grade when both grades existed and ``"(/)"`` when none did.
    """
    fig = go.Figure()
    avg_word_sims = pd.read_csv('avg_word_sims.csv')

    for rID in [str(rID) for rID in route_IDs]:

        sims = route_keyword_distance_deviations(rID, avg_word_sims, df, model, comp_words)
        rounded = [np.round(x, 2) for x in sims.deviation_from_avg]
        # Swap raw keywords for display labels; unmapped keywords stay as they are.
        sims['key_word'] = sims.key_word.map(lambda word: label_map.get(word, word))

        rdf = df[df.route_ID == rID]
        sims['route_name'] = rdf.route_name.values[0]

        fig.add_trace(
            go.Scatterpolar(
                r=sims.deviation_from_avg,
                fill='toself',
                theta=sims.key_word,
                customdata=np.c_[sims.route_name, sims.key_word, rounded],
                hoverlabel=dict(font_size=12, font_family='Arial', bgcolor='white'),
                hovertemplate=
                    '<b>%{customdata[0]}</b><br>' +
                    '%{customdata[1]} = ' +
                    '%{customdata[2]}' +
                    '<extra></extra>',
                name=_route_legend_label(rdf),
            )
        )

    fig.update_layout(margin=go.layout.Margin(l=1, r=1, b=25, t=25, pad=5))
    fig.update_layout(polar=dict(radialaxis=dict(showticklabels=False, ticks='', linewidth=0)))
    fig.update_layout(template='gridon')

    return fig


def route_profile_plot(route_ID, df, model, comp_words, label_map):
    """Draw a polar bar chart of a single route's keyword profile.

    Parameters
    ----------
    route_ID : route identifier; converted with ``str()`` before lookup.
    df : DataFrame with ``route_ID``, ``route_name``, ``nopm_YDS`` and
        ``nopm_Vermin`` columns (plus whatever the description lookup needs).
    model : embedding model forwarded to ``route_keyword_distance_deviations``.
    comp_words : keywords that form the angular axis.
    label_map : dict mapping raw keywords to display labels; keywords without
        an entry are shown unchanged.

    Returns
    -------
    plotly.graph_objects.Figure
        A single ``Barpolar`` trace, titled with the route name in bold.

    Notes
    -----
    Unlike ``multi_route_radar_plot`` this reads the baseline from the
    module-level ``avg_word_sims`` variable, which must already be defined.
    """
    route_ID = str(route_ID)
    sims = route_keyword_distance_deviations(route_ID, avg_word_sims, df, model, comp_words)
    rounded = [np.round(x, 2) for x in sims.deviation_from_avg]
    # Swap raw keywords for display labels; unmapped keywords stay as they are.
    sims['key_word'] = sims.key_word.map(lambda word: label_map.get(word, word))

    rdf = df[df.route_ID == route_ID]
    name = rdf.route_name.values[0]
    sims['route_name'] = name

    fig = go.Figure()
    fig.add_trace(
        go.Barpolar(
            r=sims.deviation_from_avg,
            theta=sims.key_word,
            customdata=np.c_[sims.key_word, rounded],
            hoverlabel=dict(font_size=12, font_family='Arial', bgcolor='white'),
            hovertemplate=
                '%{customdata[0]} = ' +
                '%{customdata[1]}' +
                '<extra></extra>',
            name=_route_legend_label(rdf),
            marker=dict(color=sims.deviation_from_avg, colorscale='viridis')
        )
    )

    fig.update_layout(margin=go.layout.Margin(l=1, r=1, b=25, t=25, pad=5), template='gridon')
    fig.update_layout(polar=dict(radialaxis=dict(showticklabels=False, ticks='', linewidth=0)))
    fig.update_layout(title={
        'text': f'<b>{name}</b>',  # bold tag is now closed
        'y':0.98,
        'x':0.40,
        'xanchor': 'center',
        'yanchor': 'top'})

    return fig


In [59]:
# Handy route IDs for trying the plots (route_ID, route name):
# 114776484 No Excuse
# 106811748 Choke Cherry Eyes
# 113170173 Whipped Cream
# 107381798 Deepwater Horizon
# 105762645 Childhood's End
# 105749158 Sonic Youth
# 105756778 The Scenic Cruise
# 105758647 Fission aka Ken T'ank
# 105749797 Y2K
# 112095476 Backdoor Man
# 106812289 Double Stout
# 105749530 Skimbleshanks
# 106980056 The Rodeo
# 107892801 Chuckles
# 105753859 Beer Run

# Routes for the multi-route radar plot (only used by the commented-out call below).
IDs = [114776484, 106812289, 113170173]

# Raw keyword -> label shown on the chart. Only the general descriptors have
# entries here; there are none for the rock_features keywords.
label_map = {
    'dynamic': 'Dynamic',
    'powerful': 'Powerful',
    'technical': 'Technical',
    'sustained': 'Sustained',
    'pump': 'Pumpy',
    'body_tension': 'Body Tension',
    'scary': 'Scary',
    'chossy': 'Chossy',
    'fun': 'Fun',
}

# Alternative view: overlay the routes listed in IDs on one radar chart.
# multi_route_radar_plot(IDs, df, model, general_descriptors, label_map)
# The returned figure is the cell's last expression, so the notebook renders it.
route_profile_plot(105753859, df, model, general_descriptors, label_map)

