# Problem: Creating word embeddings for the `Sports & Outdoors Reviews` dataset

In [1]:
!pip install gensim -q
!pip install python-Levenshtein -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.5/175.5 KB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import gensim
import pandas as pd

import gzip
from pathlib import Path
import shutil
from os import cpu_count

## Loading the Dataset

In [5]:
zip_path = Path("/content/reviews_Sports_and_Outdoors_5.json.gz")
# NOTE(review): the doubled `.json.json` suffix looks unintentional; it is kept
# unchanged so files extracted by earlier runs are still found.
dest_path = Path("/content/reviews_Sports_and_Outdoors_5.json.json")

# Decompress the gzipped dataset once; later runs reuse the extracted file.
if not dest_path.is_file():
    print(f"[INFO] Unzipping dataset `{zip_path}` to `{dest_path}`...")

    # Extract into a temporary sibling file and rename it only on success, so
    # an interrupted extraction never leaves a truncated `dest_path` that the
    # `is_file()` check above would mistake for a complete dataset.
    tmp_path = dest_path.with_name(dest_path.name + ".part")
    try:
        with gzip.open(zip_path, "rb") as zip_ref, open(tmp_path, "wb") as un_zip_ref:
            shutil.copyfileobj(zip_ref, un_zip_ref)
        tmp_path.replace(dest_path)
    finally:
        # Already renamed away on success; removes the partial file on failure.
        if tmp_path.exists():
            tmp_path.unlink()

    print(f"[INFO] Dataset successfully unzipped to `{dest_path}`...")
else:
    print(f"[INFO] Dataset `{dest_path}` already exists...")

[INFO] Unzipping dataset `/content/reviews_Sports_and_Outdoors_5.json.gz` to `/content/reviews_Sports_and_Outdoors_5.json.json`...
[INFO] Dataset succesfully downloaded to `/content/reviews_Sports_and_Outdoors_5.json.json`...


## Understanding the Dataset

In [7]:
# Read the extracted file as line-delimited JSON into a Pandas DataFrame
df = pd.read_json(path_or_buf=dest_path, lines=True)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [8]:
# Shape of the DataFrame as a (rows, columns) tuple
df.shape

(296337, 9)

In [9]:
# Inspect the raw text of a single review: the `reviewText` value at index label 0
df["reviewText"][0]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

In [11]:
# Another raw sample: the `reviewText` value at index label 9
df["reviewText"][9]

'Does everything it says it will do. I would like it so that the &#34;lane&#34; markings were a bit brighter on the ground. This does add a bit of safety to riding in the dark, as long as the motorists pay attention.'

## Data Preprocessing

In [12]:
# Run gensim's `simple_preprocess` over every review so that each row of
# `review_text` holds the token list for that review
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)

# Preview the tokenized Series
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

## Creating the Model

In [14]:
# Configure the Word2Vec model; no corpus is passed here, so the vocabulary is
# built and the model is trained in later, separate steps.
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    # os.cpu_count() returns None when the number of CPUs cannot be determined;
    # fall back to a single worker instead of passing None to gensim.
    workers=cpu_count() or 1,
)

In [18]:
# Build the model's vocabulary from the tokenized reviews.
# NOTE(review): `progress_per` presumably controls how often gensim reports
# progress through logging — confirm against the gensim documentation.
model.build_vocab(review_text, progress_per=1000)



## Training the Model

In [19]:
# Train on the tokenized reviews, taking the example count and the number of
# epochs from the values already stored on the model itself.
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(91341245, 121496535)

## Saving the Model

In [20]:
# Persist the trained model to disk
model.save("/content/word2vec_sports_outdoors_reviews.model")

## Evaluating the Model

In [21]:
# Sanity check: list the vocabulary words most similar to "awful" with their scores
model.wv.most_similar("awful")

[('terrible', 0.7192577123641968),
 ('horrible', 0.7065366506576538),
 ('ugly', 0.6341110467910767),
 ('enormous', 0.5930300354957581),
 ('overwhelming', 0.590324342250824),
 ('authentic', 0.5659762024879456),
 ('crappy', 0.5578809380531311),
 ('funny', 0.5471398234367371),
 ('insane', 0.5460387468338013),
 ('horrendous', 0.5356783270835876)]

In [23]:
# Similarity score between the learned vectors for "good" and "great"
model.wv.similarity("good", "great")

0.777602

In [24]:
# Similarity score between the learned vectors for "slow" and "steady"
model.wv.similarity("slow", "steady")

0.3711779