In [67]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [68]:
# NOTE: read_pickle unpickles arbitrary objects, so only point it at trusted files.
climb = pd.read_pickle('../database/test_data/_climb')
# Column 0 of the headerless file holds the vocabulary terms. Read every entry
# as a plain string and switch off pandas' default NA parsing, so that terms
# such as "null" or "nan" stay words instead of becoming float NaN entries.
vocab = pd.read_csv(
    '../database/vocab.txt', header=None, dtype=str, keep_default_na=False
    )[0].tolist()

In [69]:
climb.shape

(11378, 71)

In [70]:
# Description text of every climb, in row order; these are the documents fed to the vectorizer.
text_segments = climb['description'].tolist()

In [71]:
# Vectorize the descriptions as L2-normalised TF-IDF over the fixed
# unigram/bigram vocabulary (accents stripped, lower-cased).
tfidf = TfidfVectorizer(
    vocabulary=vocab,
    ngram_range=(1, 2),
    strip_accents='unicode',
    lowercase=True,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    norm='l2',
)
X = tfidf.fit_transform(text_segments)

# Project the sparse vocabulary space onto a lower-rank dense representation.
svd = TruncatedSVD(n_components=256, random_state=42)
low_rank_X = svd.fit_transform(X)

# Store one dense row vector per climb.
climb['dense_tfidf'] = list(low_rank_X)

In [72]:
# climb['dense_tfidf'][0]

In [74]:
def get_parent_datum(climb, col, depth=2):
    """ Lookup information from areas with specified depth

    Parents have depth 2 because a row's own href is the last entry of its
    'hierarchy', so the ancestor at `depth` is `hierarchy[-depth]`.

    Args:
        climb: DataFrame indexed by href, with 'href' and 'hierarchy' columns
        col: name of the column to read from the ancestor's row
        depth: distance of the ancestor from the end of the hierarchy

    Returns:
        Series on climb.index holding the ancestor's value of `col`; NaN where
        the hierarchy is shorter than `depth` or the ancestor is not a row of
        `climb`.
    """
    collect = []
    for cmb in climb['href']:
        hierarchy = climb.loc[cmb, 'hierarchy']
        pdatum = float('NaN')
        if len(hierarchy) >= depth:
            # Read the one ancestor row by label rather than selecting every
            # hierarchy row and indexing the result by position.
            ancestor_href = hierarchy[-depth]
            if ancestor_href in climb.index:
                pdatum = climb.loc[ancestor_href, col]
        collect.append(pdatum)
    return pd.Series(collect, index=climb.index)

In [90]:
# Attach to every climb the dense vector of its parent area (default depth=2); NaN where there is none.
climb['parent_dense_tfidf'] = get_parent_datum(climb, 'dense_tfidf')

In [91]:
def combine_dense_matrix(climb, route_description_weight = 1.618):
    """ Add dense representations of route and area descriptions together

    For each row the parent-area vector is added to the route's own vector
    scaled by `route_description_weight` (applied exactly once), and the sum
    is L2-normalised.

    Args:
        climb: DataFrame with 'dense_tfidf' and 'parent_dense_tfidf' columns
            whose cells are either vectors or a float NaN placeholder
        route_description_weight: multiplier for the route's own vector

    Returns:
        list with one entry per row, in row order:
        - the normalised combined vector when a parent vector exists
          (left unnormalised if its length is zero),
        - the route vector unchanged when only the parent is missing,
        - NaN when both are missing.
    """
    collect = []
    for area, route in zip(climb['parent_dense_tfidf'], climb['dense_tfidf']):
        if isinstance(area, float):
            # A float here is the NaN placeholder: no parent vector to blend in.
            collect.append(float('NaN') if isinstance(route, float) else route)
            continue
        added = area + route * route_description_weight
        # L2 normalize again; np.linalg.norm avoids depending on an import
        # that only happens in a later cell.
        length = np.linalg.norm(added, 2)
        # A zero-length sum is kept as is instead of dividing by zero.
        collect.append(added / length if length > 0 else added)
    return collect

In [95]:
# Blend each climb's own vector with its parent area's vector, using the default route weight.
climb['combined_dense_tfidf'] = combine_dense_matrix(climb)

In [9]:
# Repeats the fit from the vectorization cell: refits svd on X and overwrites low_rank_X.
low_rank_X = svd.fit_transform(X)

In [10]:
low_rank_X.shape

(11378, 256)

In [11]:
sum(svd.explained_variance_ratio_)

0.51413706598069409

In [12]:
sum(svd.explained_variance_ratio_)

0.51413706598069409

In [32]:
len(low_rank_X[0])

256

In [13]:
dirty_query = 'dean a big one juggy crimpy sunny'

In [14]:
def sanitize(dirty_query, vocab):
    """ Preprocess query string

    Lower-cases the query, splits it on whitespace and keeps the unigrams and
    adjacent-word bigrams that are present in `vocab`.

    Args:
        dirty_query: raw query text
        vocab: collection of known terms (unigrams and space-joined bigrams)

    Returns:
        list of in-vocabulary tokens: unigrams in query order followed by
        bigrams in query order; repeated tokens are kept.
    """
    # split() with no argument collapses runs of spaces, tabs and newlines,
    # so no empty tokens are produced and bigrams always join real words.
    unigrams = dirty_query.lower().split()
    bigrams = [first + ' ' + second for first, second in zip(unigrams, unigrams[1:])]
    # Hash-based membership instead of scanning a list once per token.
    known = vocab if isinstance(vocab, (set, frozenset, dict)) else set(vocab)
    return [token for token in unigrams + bigrams if token in known]

In [15]:
tokens = sanitize(dirty_query, vocab)

In [16]:
from scipy.linalg import norm
from collections import Counter

# Term frequency of every in-vocabulary query token.
counts = dict(Counter(tokens))

# Map each vocabulary term to its column once (first occurrence, as
# list.index would return) instead of scanning the list per query term.
vocab_position = {}
for position, term in enumerate(vocab):
    vocab_position.setdefault(term, position)

text_vector = np.zeros((1, len(vocab)))
for word, Tf in counts.items():
    # NOTE: the inverse-document-frequency factor is still a placeholder (1),
    # so the score is the plain term frequency.
    tfidf_score = float(Tf) * 1 #idf_lookup[word]
    # add score to this word's position in the array
    text_vector[0, vocab_position[word]] = tfidf_score
# L2 normalize; a query with no in-vocabulary token stays all zeros
# instead of being divided by a zero norm.
query_norm = norm(text_vector, 2)
if query_norm > 0:
    text_vector = text_vector / query_norm
# cast as sparse
# sparse_query = csr_matrix(text_vector)
# return sparse_query


In [17]:
text_vector

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [18]:
text_vector.shape

(1, 2746)

In [19]:
text_vector[text_vector > 0]

array([ 0.5,  0.5,  0.5,  0.5])

In [20]:
# Component matrix of the fitted SVD; its shape is displayed in the last cell.
comp = svd.components_

In [21]:
# Project the (1, vocabulary-size) query vector onto the SVD components.
projected = np.dot(text_vector, comp.T)

In [22]:
# Dot product of the projected query with every climb's dense vector: one score per climb.
scores = np.dot(projected, low_rank_X.T)

In [23]:
scores.shape

(1, 11378)

In [26]:
comp.shape

(256, 2746)