In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import string
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec



In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yellow_flash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
business_file = "final_business_CA.gzip"
reviews_file = "final_review_CA.gzip"
users_file = "final_data_user_yelp.gzip"

In [4]:
reviews_df = pd.read_pickle(reviews_file)

In [7]:
train_df, test_df = train_test_split(reviews_df, train_size=0.8)

In [36]:
def text_preprocess(review):
    punctuation = set(string.punctuation)
    stemmer = PorterStemmer()
    stop_W = stopwords.words("english")
    r = ''.join([c for c in review.lower() if (not c in punctuation)])
    word_list = []
    for w in r.split():
        w = stemmer.stem(w)
        if (w not in stop_W) and w.isalpha():
            word_list.append(w)
    return word_list

In [37]:
review_list = []
for _, df in tqdm(train_df.iterrows()):
    review_list.append(text_preprocess(df['text']))

169398it [02:43, 1036.66it/s]


In [44]:
model = Word2Vec(review_list,
                 min_count=5, # Words/items with fewer instances are discarded
                 vector_size=10, # Model dimensionality
                 window=3, # Window size
                 sg=1,
                 workers=6,
                 epochs=5,
                 compute_loss=True) # Skip-gram model

In [45]:
model.wv.similar_by_word("pizza")


[('sub', 0.9723308086395264),
 ('calzon', 0.960164487361908),
 ('hamburg', 0.9471043944358826),
 ('tristrami', 0.9453326463699341),
 ('gyro', 0.9428593516349792),
 ('burger', 0.9404435157775879),
 ('sammi', 0.9395216107368469),
 ('karma', 0.9375873804092407),
 ('firehous', 0.9361119270324707),
 ('guru', 0.9341705441474915)]

In [46]:
model.get_latest_training_loss()

39131900.0

In [80]:
model2 = Word2Vec(review_list,
                 min_count=5, # Words/items with fewer instances are discarded
                 vector_size=50, # Model dimensionality
                 window=3, # Window size
                 sg=1,
                 workers=6,
                 epochs=30,
                 compute_loss=True) # Skip-gram model

In [81]:
model2.get_latest_training_loss()

70137488.0

In [50]:
39131900.0 - 68367312.0

-29235412.0

In [87]:
model2.wv.similar_by_word("pizza")


[('pepperoni', 0.8807622790336609),
 ('margherita', 0.8415851593017578),
 ('calzon', 0.8408629298210144),
 ('rusti', 0.8264104723930359),
 ('giovanni', 0.8056952953338623),
 ('domino', 0.7669126391410828),
 ('patxi', 0.761646568775177),
 ('diavola', 0.7613298892974854),
 ('gino', 0.755900502204895),
 ('umbra', 0.7541335225105286)]

In [86]:
model2.save("word2vec.model")