# Implementing word2vec in gensim

In [1]:
#!pip install gensim
#!pip install python-Levenshtein

In [2]:
import gensim
import pandas as pd
import tensorflow as tf

# Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is stored as a JSON file and can be read using pandas.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
df= pd.read_json("/content/drive/MyDrive/Colab Notebooks/Cell_Phones_and_Accessories_5.json", lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [5]:
df.shape

(194439, 9)

In [6]:
df.reviewText[0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

Now we do pre processing for doing word2vec i.e. removing stop words like a, and ,the, it, was etc 

And converting all texts to lowercase, removing trailing spaces, punctuation marks

So for this we will use gemsin 

In [7]:
# example
gensim.utils.simple_preprocess("They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again")

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [8]:
review_text=df.reviewText.apply(gensim.utils.simple_preprocess)
review_text.head()

0    [they, look, good, and, stick, good, just, don...
1    [these, stickers, work, like, the, review, say...
2    [these, are, awesome, and, make, my, phone, lo...
3    [item, arrived, in, great, time, and, was, in,...
4    [awesome, stays, on, and, looks, great, can, b...
Name: reviewText, dtype: object


## Training the Word2Vec Model
Train the model for reviews. Use a window of size 10 i.e. 10 words before the present word and 10 words ahead. A sentence with at least 2 words should only be considered, configure this using min_count parameter.

Workers define how many CPU threads to be used.

In [9]:
model = gensim.models.Word2Vec(
    window=10,  
    min_count=2, 
    workers=4 
    
)

In [10]:
# building vocabulary
model.build_vocab(review_text,progress_per=1000)

In [11]:
# by deafult models have epoch =5
model.epochs

5

In [12]:
model.corpus_count #basically telling rows in our data to be considered for training

194439

In [13]:
# training model
model.train(review_text, total_examples= model.corpus_count, epochs=8)

(98409319, 134190360)

In [14]:
# Saving model 
model.save("word2vec-amazon-cell-accessories-reviews-short.model")

## Loading and Retraining

You can load the model and also you can train it again o with new data or continue to train it on old data if the model is saved.

In [15]:
#model = Word2Vec.load("word2vec.model")
#model.train([["hello", "world"]], total_examples=1, epochs=1)

## Finding Similar Words and Similarity between words

https://radimrehurek.com/gensim/models/word2vec.html

In [16]:
model.wv.most_similar("bad")

[('terrible', 0.6711597442626953),
 ('shabby', 0.6547224521636963),
 ('horrible', 0.5940072536468506),
 ('good', 0.580632209777832),
 ('funny', 0.5547348260879517),
 ('sad', 0.5392102003097534),
 ('awful', 0.5389664173126221),
 ('okay', 0.5284633636474609),
 ('disappointing', 0.5262618064880371),
 ('weird', 0.522144079208374)]

Not perfect but still powerful

In [17]:
model.wv.similarity(w1="cheap",w2="inexpensive") #calculate cosine similarity

0.5522058

In [18]:
model.wv.similarity(w1="great",w2="product")

-0.046115596

In [19]:
model.wv.similarity(w1="great",w2="awsome")

0.670774

In [20]:
model.wv.similarity(w1="great",w2="great")

1.0