## Dependencies

In [3]:
# Uncomment on first run to install the dependencies into the kernel's environment.
# %pip install gensim
# %pip install python-Levenshtein

In [4]:
import gensim
import pandas as pd

In [5]:
# The dataset stores one JSON record per line, hence lines=True.
df = pd.read_json(
    "Cell_Phones_and_Accessories_5.json",
    lines=True,
)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [6]:
# Raw review strings, before any preprocessing.
df.reviewText.values

array(["They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again",
       'These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)',
       'These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!',
       ...,
       "This is a great cable, just as good as the more expensive Apple one. My husband and my daughter both lost their original cables for their iPhones and they went the cheap route, buying a cheap replacement cable, I believe it messed up their battery memory because coincidentally both of them have battery hold/charge issues but my son and I do not. I used this cable to charge my husbands phone and it charged it from 3% to 100% in a

## Simple processing and Tokenization

NLP pipelines usually start with some basic text preprocessing, such as converting all words to lower case, trimming whitespace, and removing punctuation. We do the same here.

In [7]:
# Turn every review into a list of lowercase tokens; punctuation and
# very short tokens are dropped by gensim's simple_preprocess.
review_text = df["reviewText"].apply(
    gensim.utils.simple_preprocess
)

In [8]:
# Tokens of the first review (index label 0).
review_text.loc[0]

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

## Train the Word2Vec model

### Model Initialization

In [10]:
# window:    maximum distance between the current and the predicted word
# min_count: ignore words that occur fewer than this many times in the corpus
# workers:   number of worker threads used for training
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)

### Build Vocabulary

In [11]:
# Scan the tokenized reviews once to collect the vocabulary;
# progress_per controls how often progress is reported while scanning.
model.build_vocab(
    review_text,
    progress_per=1000,
)

### Train the Model

In [13]:
# Number of documents (reviews) seen while building the vocabulary.
model.corpus_count

194439

In [14]:
# Sanity check: the row count should match model.corpus_count.
df.shape

(194439, 9)

In [15]:
# Number of training passes over the corpus (not set explicitly at model creation).
model.epochs

5

In [16]:
# Train on the tokenized reviews, reusing the document count gathered by
# build_vocab and the epoch count configured on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(61506042, 83868975)

## Save the Model

In [17]:
# Persist the trained model next to the notebook so it can be reloaded without retraining.
model.save("./word2vec_amazon_review.model")

## Loading the Model

In [None]:
# Uncomment to reload the previously saved model instead of retraining it.
# model = gensim.models.Word2Vec.load("word2vec_amazon_review.model")

## Finding Similar Words

In [27]:
# Ten nearest neighbours of "awful" in the learned embedding space.
model.wv.most_similar("awful", topn=10)

[('horrible', 0.8393529057502747),
 ('terrible', 0.8313641548156738),
 ('atrocious', 0.6274282932281494),
 ('amazing', 0.5943576097488403),
 ('poor', 0.5812827348709106),
 ('crappy', 0.581173837184906),
 ('alright', 0.5682294964790344),
 ('lousy', 0.53962641954422),
 ('ok', 0.5381813049316406),
 ('unbelievable', 0.5364510416984558)]

In [25]:
# Similarity score between the vectors for "good" and "bad".
model.wv.similarity("good", "bad")

0.58091694

In [26]:
# Similarity score between the vectors for "good" and "great".
model.wv.similarity("good", "great")

0.79002094