In [30]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [102]:
# Load the scraped climb table and the vocabulary list from disk.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
climb = pd.read_pickle('scraping/test_data/_climb')
# vocab.txt is read without a header row; column 0 is taken as the term list.
vocab = pd.read_csv('scraping/vocab.txt', header=None)[0].tolist()

In [103]:
# Check the dimensions (rows, columns) of the loaded climb table.
climb.shape

(11378, 71)

In [104]:
# Pull the 'description' column out as a plain Python list, one entry per row.
text_segments = climb['description'].tolist()

In [105]:
# Tokenize and TF-IDF-vectorize the descriptions over the fixed vocabulary,
# counting unigrams and bigrams. No lemmatizer is passed to the vectorizer.
tfidf = TfidfVectorizer(
    vocabulary=vocab,
    ngram_range=(1, 2),
    lowercase=True,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    norm='l2',
)
X = tfidf.fit_transform(text_segments)

In [106]:
# Truncated SVD reducer keeping 800 components; random_state is fixed so
# repeated fits give the same result.
svd = TruncatedSVD(n_components=800, random_state=42)

In [107]:
# Fit the SVD on the TF-IDF matrix and project every document into the
# reduced space in one step.
low_rank_X = svd.fit_transform(X)

In [108]:
# Expect (number of documents, number of SVD components).
low_rank_X.shape

(11378, 800)

In [109]:
# Total fraction of variance captured by the retained components.
sum(svd.explained_variance_ratio_)

0.78521762198907041

In [101]:
# NOTE(review): duplicate of the preceding cell with an earlier execution
# count; its recorded value presumably comes from a previous fit with
# different settings - consider deleting this cell.
sum(svd.explained_variance_ratio_)

0.74190299820667194

In [23]:
# Example free-text query used to exercise the query pipeline below.
dirty_query = 'dean a big one juggy crimpy sunny'

In [24]:
def sanitize(dirty_query, vocab):
    """Turn a raw query string into the in-vocabulary tokens it contains.

    The query is trimmed and lower-cased, split on whitespace into unigrams,
    and each pair of adjacent unigrams is joined with a single space to form
    a bigram. Only tokens present in ``vocab`` are kept.

    Parameters
    ----------
    dirty_query : str
        Raw query text.
    vocab : collection of str
        Known vocabulary terms (unigrams and space-joined bigrams).

    Returns
    -------
    list of str
        Matching unigrams in query order, followed by matching bigrams in
        query order. Empty when nothing matches.
    """
    # split() with no argument collapses runs of whitespace (including tabs
    # and newlines), so "big  one" still yields the bigram "big one" rather
    # than empty-string tokens and bigrams with stray spaces.
    unigrams = dirty_query.strip().lower().split()
    bigrams = [first + ' ' + second for first, second in zip(unigrams, unigrams[1:])]
    return [token for token in unigrams + bigrams if token in vocab]

In [28]:
# Keep only the query unigrams/bigrams that appear in the vocabulary.
tokens = sanitize(dirty_query, vocab)

In [31]:
from scipy.linalg import norm
from collections import Counter

# Build a (1, len(vocab)) term-weight vector for the query tokens.
counts = dict(Counter(tokens))

text_vector = np.zeros((1, len(vocab)))
for word, Tf in counts.items():
    # multiply term frequency by inverse document frequency
    # TODO: the idf factor is still a placeholder of 1 (no idf lookup exists yet)
    tfidf_score = float(Tf) * 1 #idf_lookup[word]
    # add score to this word's position in the array
    text_vector[0,vocab.index(word)] = tfidf_score
# L2 normalize. When no query token is in the vocabulary the norm is zero;
# skip the division then, otherwise the whole vector becomes NaN.
query_norm = norm(text_vector, 2)
if query_norm > 0:
    text_vector = text_vector / query_norm
# cast as sparse
# sparse_query = csr_matrix(text_vector)
# return sparse_query


In [32]:
# Inspect the normalised query vector.
text_vector

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [33]:
# One row, one column per vocabulary term.
text_vector.shape

(1, 2746)

In [36]:
# Show only the positive weights, i.e. the entries set for matched query tokens.
text_vector[text_vector > 0]

array([ 0.5,  0.5,  0.5,  0.5])

In [74]:
# Component matrix of the fitted SVD: one row per component, one column per
# vocabulary feature.
comp = svd.components_

In [77]:
# Project the query into the SVD space:
# (1, n_features) x (n_features, n_components) -> (1, n_components).
projected = np.dot(text_vector, comp.T)

In [80]:
# Dot-product score of the projected query against every document's
# low-rank vector. NOTE(review): neither side is re-normalised here, so
# these are raw dot products rather than cosine similarities - confirm intent.
scores = np.dot(projected, low_rank_X.T)

In [81]:
# One score per document.
scores.shape

(1, 11378)

In [69]:
# Stack the query vector on itself to get a two-row batch of queries.
text_vector2 = np.vstack((text_vector, text_vector))

In [70]:
# Project the two-row query batch with a matrix product. `*` on ndarrays is
# element-wise and cannot broadcast (2, n_features) against
# (n_features, n_components).
np.dot(text_vector2, comp.T).shape

(2, 100)

In [71]:
# NOTE(review): the recorded output (100, 2746) disagrees with the
# n_components=800 used when the SVD is constructed; it looks stale from an
# earlier fit - re-run the notebook top to bottom to refresh it.
comp.shape

(100, 2746)