## Implement word2vec in gensim

In [1]:
! pip install gensim
! pip install python-Levenshtein

Collecting gensim
  Downloading gensim-4.3.0-cp38-cp38-win_amd64.whl (24.0 MB)
Collecting Cython==0.29.32
  Downloading Cython-0.29.32-py2.py3-none-any.whl (986 kB)
Collecting FuzzyTM>=0.4.0
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting scipy>=1.7.0
  Downloading scipy-1.10.0-cp38-cp38-win_amd64.whl (42.2 MB)
Collecting pyfume
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
Collecting fst-pso
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
Collecting miniful
  Downloading miniful-0.0.6.tar.gz (2.8 kB)
Collecting simpful
  Downloading simpful-2.9.0-py3-none-any.whl (30 kB)
Building wheels for collected packages: fst-pso, miniful
  Building wheel for fst-pso (setup.py): started
  Building wheel for fst-pso (setup.py): finished with status 'done'
  Created wheel for fst-pso: filename=fst_pso-1.8.1-py3-none-any.whl size=20441 sha256=b4faad93c117032fb12cce09e877f850c155bf81d4d2d1f6257a890136475d99
  Stored in directory: c:\users\alanj\appdata\local\pip\cache\wheels\6a\65\c

In [2]:
import gensim
import pandas as pd


### Reading and Exploring the Dataset
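The dataset we are using here, `Ecommerce_data.csv`, contains e-commerce product descriptions (the `Text` column) together with their category (the `label` column, e.g. Household, Electronics, Clothing & Accessories). The data is stored as a CSV file and can be read using pandas.

Note: the following link points to a different dataset — a subset of Amazon reviews from the Cell Phones & Accessories category, stored as a JSON file — which the code in this notebook does not load: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz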

In [12]:
# Read the CSV file into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Ecommerce_data.csv")
df.head(n=5)

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [13]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
df.shape

(24000, 2)

### Simple Preprocessing & Tokenization

The first thing to do for any data science task is to clean the data. For NLP, we apply various processing steps such as converting all words to lower case, trimming spaces, and removing punctuation. This is something we will do here too.

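Additionally, we could remove stop words like 'and', 'or', 'is', 'the', 'a', 'an' and convert words to their root forms, e.g. 'running' to 'run'. Note that `gensim.utils.simple_preprocess`, used below, does neither: stop words such as 'the' and 'is' remain in its output.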

In [15]:
# Tokenize every entry of the 'Text' column with gensim's simple_preprocess;
# the result is a Series holding one list of tokens per row.
processed_text = df["Text"].map(gensim.utils.simple_preprocess)

In [18]:
# Show the token list produced for the row labelled 0.
processed_text.loc[0]

['urban',
 'ladder',
 'eisner',
 'low',
 'back',
 'study',
 'office',
 'computer',
 'chair',
 'black',
 'study',
 'in',
 'simple',
 'the',
 'eisner',
 'study',
 'chair',
 'has',
 'firm',
 'foam',
 'cushion',
 'which',
 'makes',
 'long',
 'hours',
 'at',
 'your',
 'desk',
 'comfortable',
 'the',
 'flexible',
 'meshed',
 'back',
 'is',
 'designed',
 'for',
 'air',
 'circulation',
 'and',
 'support',
 'when',
 'you',
 'lean',
 'back',
 'the',
 'curved',
 'arms',
 'provide',
 'ergonomic',
 'forearm',
 'support',
 'adjust',
 'the',
 'height',
 'using',
 'the',
 'gas',
 'lift',
 'to',
 'find',
 'that',
 'comfortable',
 'position',
 'and',
 'the',
 'nylon',
 'castors',
 'make',
 'it',
 'easy',
 'to',
 'move',
 'around',
 'your',
 'space',
 'chrome',
 'legs',
 'refer',
 'to',
 'the',
 'images',
 'for',
 'dimension',
 'details',
 'any',
 'assembly',
 'required',
 'will',
 'be',
 'done',
 'by',
 'the',
 'ul',
 'team',
 'at',
 'the',
 'time',
 'of',
 'delivery',
 'indoor',
 'use',
 'only']

### Training the Word2Vec Model
Train the model on the tokenized text. Use a window of size 10, i.e. up to 10 words before the current word and 10 words after it. Only words that occur at least 2 times in the corpus are kept in the vocabulary; configure this using the `min_count` parameter.

`workers` defines how many CPU threads are used for training.

#### Initialize the model

In [19]:
# Create the Word2Vec model without a corpus; the vocabulary is built and the
# model is trained in the following cells.
model = gensim.models.Word2Vec(
    window=10,    # context window size around the current word
    min_count=2,  # ignore words whose total frequency is lower than 2
    workers=4,    # number of worker threads used for training
)

#### Build Vocabulary

In [21]:
# Build the model's vocabulary from the tokenized texts.
# progress_per presumably controls how often progress is logged — confirm against the gensim docs.
model.build_vocab(processed_text, progress_per=1000)

#### Train the Word2Vec Model

In [22]:
# Train on the tokenized texts, reusing the corpus size recorded on the model
# and the model's own epoch setting.
# The displayed tuple is presumably (effective words trained, raw words processed) — confirm against the gensim docs.
model.train(processed_text, total_examples=model.corpus_count, epochs=model.epochs)

(10339222, 12358775)

### Save the Model
Save the model so that it can be reused in other applications

In [23]:
# Persist the trained model to a file in the current working directory.
# NOTE(review): the file name refers to Amazon cell-accessories reviews, but the model
# in this notebook is trained on Ecommerce_data.csv — consider renaming the file.
model.save("./word2vec-amazon-cell-accessories-reviews-short.model")

### Finding Similar Words and Similarity between words

In [25]:
# List the words whose vectors are most similar to 'electronics', with their similarity scores.
model.wv.most_similar('electronics')

[('chargers', 0.7402549386024475),
 ('iphones', 0.7249327898025513),
 ('camcorders', 0.7193127870559692),
 ('equipment', 0.7189837694168091),
 ('amps', 0.7177767157554626),
 ('ipods', 0.7110496759414673),
 ('consoles', 0.7068403363227844),
 ('gadgets', 0.6885213851928711),
 ('jackly', 0.6818183064460754),
 ('mobiles', 0.6738800406455994)]

In [26]:
# List the words whose vectors are most similar to 'household', with their similarity scores.
model.wv.most_similar('household')

[('decorating', 0.6579737663269043),
 ('beautifying', 0.6225135922431946),
 ('baskets', 0.6175703406333923),
 ('organizing', 0.6162499189376831),
 ('dinning', 0.6087204217910767),
 ('basket', 0.595137894153595),
 ('jam', 0.5899482369422913),
 ('bathrooms', 0.5875928997993469),
 ('cabins', 0.5875211358070374),
 ('decorative', 0.5859470367431641)]

In [27]:
# Similarity score between the vectors of 'decorating' and 'organizing'.
model.wv.similarity(w1='decorating', w2='organizing')

0.5732143

In [29]:
# Similarity score between the vectors of 'house' and 'dinning'
# ('dinning' is the spelling that appears in the corpus vocabulary, see the 'household' neighbours).
model.wv.similarity(w1='house', w2='dinning')

0.26103097