In [2]:
import gensim
import pandas as pd

## Load and Explore dataset

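The dataset is a subset of Amazon product reviews from the Sports and Outdoors category. It is stored as a JSON Lines file (one review per line) and can be read with pandas. The download linked below is gzip-compressed, so decompress it first: the load cell reads the plain `.json` file.
Link: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz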

In [4]:
# The file holds one JSON-encoded review per line, hence lines=True.
reviews_path = "reviews_Sports_and_Outdoors_5.json"
df = pd.read_json(reviews_path, lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [5]:
# (number of rows, number of columns) of the loaded reviews frame.
df.shape

(296337, 9)

## Simple Preprocessing

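Use Gensim's `simple_preprocess` function (in `gensim.utils`) to preprocess the dataset. It lowercases the text, strips punctuation and numbers, splits it into tokens, and discards tokens that are very short (by default fewer than 2 characters, which is why "I" disappears) or very long (by default more than 15 characters). Note that it does **not** remove stop words: words such as 'and' or 'or' are still present in the output below.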

In [6]:
# Turn every raw review string into a list of lowercase tokens.
tokenize = gensim.utils.simple_preprocess
review_text = df["reviewText"].apply(tokenize)
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

In [9]:
# Raw text of the first review, for comparison with its tokenised form.
df.loc[0, "reviewText"]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

## Training the model
Use a context window of size 10 and `workers=4`, which sets how many worker threads are used for training. Words that occur fewer than 2 times are ignored (`min_count=2`).

In [10]:
# Hyper-parameters for the Word2Vec model; the model is only configured here,
# the vocabulary is built and training is run in later cells.
w2v_params = {
    "window": 10,    # size of the context window
    "min_count": 2,  # minimum word frequency to consider
    "workers": 4,    # number of worker threads used to train the model
}
model = gensim.models.Word2Vec(**w2v_params)

In [11]:
# Scan the tokenised reviews to build the model's vocabulary before training.
# progress_per sets how often progress is reported during the scan.
model.build_vocab(review_text,progress_per=1000)

In [12]:
# Train on the same tokenised reviews, reusing the example count and the
# number of epochs already stored on the model (corpus_count / epochs).
# The call's return value is a pair of word counts, shown as the cell output.
model.train(review_text,total_examples=model.corpus_count,epochs=model.epochs)

(91341382, 121496535)

In [13]:
# Ten nearest neighbours of "awful" in the learned embedding space,
# returned as (word, similarity) pairs.
model.wv.most_similar(positive=["awful"], topn=10)

[('terrible', 0.6990038156509399),
 ('horrible', 0.6618235111236572),
 ('overpowering', 0.6190665364265442),
 ('ugly', 0.6158244609832764),
 ('horrendous', 0.6052393317222595),
 ('unpleasant', 0.5990537405014038),
 ('authentic', 0.5955678820610046),
 ('funny', 0.584703803062439),
 ('anomaly', 0.5706218481063843),
 ('icky', 0.5643571615219116)]

In [14]:
# Similarity score between the learned vectors for two near-synonyms.
model.wv.similarity("good","great")

0.77878237

In [15]:
# Similarity score for a pair of words that are related but not synonyms;
# compare with the score for "good"/"great".
model.wv.similarity("slow","steady")

0.33882117