In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import string
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
from gensim.models import Word2Vec



In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yellow_flash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
model = Word2Vec.load("word2vec.model")

In [4]:
business_file = "final_business_CA.gzip"
reviews_file = "final_review_CA.gzip"
users_file = "final_data_user_yelp.gzip"

In [5]:
### Do data preprocessing containing dictionary of user, businesses and star ratings
reviews_df = pd.read_pickle(reviews_file)

In [6]:
users_review_df = reviews_df.groupby('user_id')[['text', 'date']].agg(list).reset_index()
business_review_df = reviews_df.groupby('business_id')[['text', 'date']].agg(list).reset_index()

In [7]:
def text_preprocess(review):
    punctuation = set(string.punctuation)
    stemmer = PorterStemmer()
    stop_W = stopwords.words("english")
    r = ''.join([c for c in review.lower() if (not c in punctuation)])
    word_list = []
    for w in r.split():
        w = stemmer.stem(w)
        if (w not in stop_W) and w.isalpha() and (w in model.wv.key_to_index):
            word_list.append(w)
    return word_list

In [11]:
def getSimMatrix(reviews_list):
    sim_matrix = []
    for review in reviews_list:
        tokens = text_preprocess(review)
        sim = []
        for t in tokens:
            sim.append(model.wv[t])
        if sim:
            sim_array = np.mean(np.array(sim), axis=0)
            sim_matrix.append(sim_array)
        else:
            continue
    if sim_matrix:
        return np.mean(np.array(sim_matrix), axis=0)
    else:
        return np.zeros((50,))

In [12]:
user_sim_df = {'user_id': [], 'review_w2v': []}
business_sim_df = {'business_id': [], 'review_w2v': []}

In [13]:
for _, df in tqdm(users_review_df.iterrows()):
    user_id = df['user_id']
    user_corpus = df['text']
    user_w2v = getSimMatrix(user_corpus)
    user_sim_df['user_id'].append(user_id)
    user_sim_df['review_w2v'].append(user_w2v)

106529it [04:03, 437.77it/s]


In [14]:
for _, df in tqdm(business_review_df.iterrows()):
    business_id = df['business_id']
    business_corpus = df['text']
    business_w2v = getSimMatrix(business_corpus)
    business_sim_df['business_id'].append(business_id)
    business_sim_df['review_w2v'].append(business_w2v)

0it [00:00, ?it/s]

1161it [03:57,  4.90it/s]


In [15]:
user_sim_df = pd.DataFrame(user_sim_df)
business_sim_df = pd.DataFrame(business_sim_df)

In [16]:
user_sim_df.to_pickle("user_w2v_summary.gzip")
business_sim_df.to_pickle("business_w2v_summary.gzip")

In [186]:
w2v_df = {'user_id': [], 'business_id': [], 'w2v_hours_matrix': [], 'ratings': []}

In [188]:
for idx, df in tqdm(reviews_df.iterrows()):
    user_id = df['user_id']
    business_id = df['business_id']
    user_w2v = np.array(user_sim_df[user_sim_df['user_id'] == user_id]['review_w2v'].tolist()[0])
    business_w2v = np.array(business_sim_df[business_sim_df['business_id'] == business_id]['review_w2v'].tolist()[0])
    
    similarity_matrix = (user_w2v @ business_w2v.T) / (np.linalg.norm(user_w2v) * np.linalg.norm(business_w2v))
    user_review_hours = [(pd.Timestamp.now() - timestamp) / pd.Timedelta(days=1) for timestamp in
                         users_review_df[users_review_df['user_id'] == user_id]['date'].tolist()[0]]
    business_review_hours = [(pd.Timestamp.now() - timestamp) / pd.Timedelta(days=1) for timestamp in
                            business_review_df[business_review_df['business_id'] == business_id]['date'].tolist()[0]]
    hours_array = np.transpose([np.tile(user_review_hours, len(business_review_hours)),
                                np.repeat(business_review_hours, len(user_review_hours))]).reshape((len(user_review_hours), len(business_review_hours), 2))
    hours_array = 1 / (1 - np.exp(-0.5 * np.log10(hours_array.reshape(-1, 2))))
    hours_array = hours_array[:, 0] * hours_array[:, 1]
    w2v_feature = similarity_matrix.reshape(-1,) * hours_array
    w2v_feature = np.array(sorted(w2v_feature, key=lambda x: np.abs(x))[-10:])
    final_w2v_feature = np.zeros((10,))
    final_w2v_feature[:min(10, w2v_feature.shape[0]),] = w2v_feature
    w2v_df["user_id"].append(user_id)
    w2v_df["business_id"].append(business_id)
    w2v_df["w2v_hours_matrix"].append(final_w2v_feature)
    w2v_df['ratings'].append(df['stars'])

  similarity_matrix = (user_w2v @ business_w2v.T) / (np.linalg.norm(user_w2v) * np.linalg.norm(business_w2v))
  similarity_matrix = (user_w2v @ business_w2v.T) / (np.linalg.norm(user_w2v) * np.linalg.norm(business_w2v))
211748it [1:44:48, 33.67it/s]


In [190]:
w2v_df = pd.DataFrame(w2v_df)

In [175]:
w2v_df[:50000].to_pickle("final_w2v1.gzip")

In [176]:
w2v_df[50000:100000].to_pickle("final_w2v2.gzip")

In [177]:
w2v_df[100000:150000].to_pickle("final_w2v3.gzip")

In [178]:
w2v_df[150000:].to_pickle("final_w2v4.gzip")

In [183]:
w2v_df[2099:2101].w2v_hours_matrix.tolist()

[array([0.03524676, 0.03560305, 0.03573325, 0.03595728, 0.03678525,
        0.03690095, 0.03740086, 0.0374249 , 0.03798524, 0.03809069]),
 array([0.0499155 , 0.05004904, 0.05013211, 0.05112213, 0.05139826,
        0.05142818, 0.05224272, 0.05244588, 0.05287784, 0.05312572])]

In [184]:
len(w2v_df)

211748

In [191]:
w2v_df.to_pickle("final_w2v_features2.gzip")

In [192]:
len(w2v_df)

211748

In [206]:
w2v_df.iloc[100000]['w2v_hours_matrix'].tolist()

[0.06879029730030624,
 0.07007109978941706,
 0.07328369822735556,
 0.07372292681640114,
 0.07434763562707646,
 0.07570043775457312,
 0.07810570014459159,
 0.0861519620202005,
 0.08724020138603725,
 0.09981738713681183]