In [1]:
import gensim
from gensim.models import Word2Vec
import pandas as pd

In [2]:
df = pd.read_parquet("../data/sandiego_reviews.parquet")


In [3]:
corpus = [text.lower().split() for text in df['text']]


In [4]:
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=2, workers=4)


In [5]:
print(model.wv.most_similar("taco")) 


[('tacos', 0.6948161721229553), ('taco.', 0.6561151742935181), ('tacos.', 0.6001890897750854), ('burrito', 0.5994494557380676), ('taco,', 0.5960773825645447), ('vibration', 0.5529083013534546), ('taco!', 0.5506736040115356), ('mar', 0.5394843220710754), ('butcher', 0.5368792414665222), ('friscos', 0.5367324352264404)]


In [6]:
model.save("../data/review_embedding.w2v")


In [10]:
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
import numpy as np

In [11]:
pivot_df = df.pivot_table(index='user_id', columns='place_name', values='rating').fillna(0)
R = pivot_df.values
user_ids = list(pivot_df.index)
place_names = list(pivot_df.columns)

In [12]:
R_mean = np.mean(R, axis=1)
R_demeaned = R - R_mean.reshape(-1, 1)

In [13]:
U, sigma, Vt = svds(csr_matrix(R_demeaned), k=50)
sigma = np.diag(sigma)

In [14]:
# 4. Save the Matrices
np.save("../data/U.npy", U)
np.save("../data/sigma.npy", sigma)
np.save("../data/Vt.npy", Vt)
np.save("../data/user_ids.npy", user_ids)
np.save("../data/place_names.npy", place_names)

In [16]:
import implicit
import scipy.sparse as sparse

In [17]:
users = df['user_id'].astype("category")
items = df['place_name'].astype("category")

In [18]:
item_user_data = sparse.csr_matrix((np.ones(len(df)), (items.cat.codes, users.cat.codes)))


In [19]:
model = implicit.bpr.BayesianPersonalizedRanking(factors=50)
model.fit(item_user_data)

  0%|          | 0/100 [00:00<?, ?it/s]

In [20]:
np.save("../data/bpr_item_factors.npy", model.item_factors)
np.save("../data/bpr_user_factors.npy", model.user_factors)