In [1]:
import pandas as pd
from django.utils.encoding import smart_str
from gensim.models.doc2vec import TaggedDocument
# to use this module follow https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK
from nltk.parse import CoreNLPParser
parser = CoreNLPParser(url='http://localhost:9000')
'''great library to parallelize: 
https://towardsdatascience.com/pandaral-lel-a-simple-and-efficient-tool-to-parallelize-your-pandas-operations-on-all-your-cpus-bb5ff2a409ae
https://github.com/nalepae/pandarallel'''  

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True,shm_size_mb=10000)
# load the data into panda dataframe
pth="/Users/vineeth/"
f_name = "reviews_Musical_Instruments.json"

raw_df = pd.read_json(pth+f_name, lines=True)
raw_df = raw_df.head(20000)
# if you want to load a csv file then do
# raw_df = pd.read_csv(data_file_name,encoding = "ISO-8859-1")
# View information about the data
raw_df.info()

New pandarallel memory created - Size: 10000 MB
Pandarallel will run on 12 workers
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 9 columns):
reviewerID        20000 non-null object
asin              20000 non-null object
reviewerName      19669 non-null object
helpful           20000 non-null object
reviewText        20000 non-null object
overall           20000 non-null int64
summary           20000 non-null object
unixReviewTime    20000 non-null int64
reviewTime        20000 non-null object
dtypes: int64(2), object(7)
memory usage: 1.4+ MB


In [2]:
raw_df.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1YS9MDZP93857,6428320,John Taylor,"[0, 0]",The portfolio is fine except for the fact that...,3,Parts missing,1394496000,"03 11, 2014"
1,A3TS466QBAWB9D,14072149,Silver Pencil,"[0, 0]",If you are a serious violin student on a budge...,5,"Perform it with a friend, today!",1370476800,"06 6, 2013"
2,A3BUDYITWUSIS7,41291905,joyce gabriel cornett,"[0, 0]",This is and excellent edition and perfectly tr...,5,Vivalldi's Four Seasons,1381708800,"10 14, 2013"


In [3]:
# import natural language toolkit
import nltk
# download the punkt tokenizer
nltk.download('punkt')
# Load the punkt tokenizer
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
print("The punkt tokenizer is loaded")

The punkt tokenizer is loaded


[nltk_data] Downloading package punkt to /Users/vineeth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import re
# Clean and split sentence into words
def clean_and_split_str(row):
    desc = row['reviewText']
    ptn_n = row['reviewerID']
    strngs = tokenizer.tokenize(desc)
    # for Doc2Vec to work, we need to tag the sentences with ID, in our case ID is the reviewer ID
    strngs = [TaggedDocument(list(parser.tokenize(s.lower())),[ptn_n]) for s in strngs]
    return strngs
#     return [s.split() for s in strngs]

In [5]:
%%time
raw_df['reviewText_a'] = raw_df.parallel_apply(clean_and_split_str,axis=1)
raw_df.drop('reviewText',axis=1,inplace=True)
raw_df.rename({'reviewText_a':'reviewText'}, axis='columns',inplace=True)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1667), Label(value='0 / 1667'))), …

CPU times: user 4.82 s, sys: 1e+03 ms, total: 5.82 s
Wall time: 48.1 s


In [6]:
raw_df.to_pickle('processed_reviews.pkl')

<h2><center>Loading the processed dataframe and training the Doc2Vec Model.</center></h2>

<p><b>Note: Run the following blocks from scratch, if the pre-processed file already exists</b></p>


In [7]:
import pandas as pd
import multiprocessing

In [8]:
df = pd.read_pickle('processed_reviews.pkl')
df.head(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,overall,summary,unixReviewTime,reviewTime,reviewText
0,A1YS9MDZP93857,6428320,John Taylor,"[0, 0]",3,Parts missing,1394496000,"03 11, 2014","[([the, portfolio, is, fine, except, for, the,..."
1,A3TS466QBAWB9D,14072149,Silver Pencil,"[0, 0]",5,"Perform it with a friend, today!",1370476800,"06 6, 2013","[([if, you, are, a, serious, violin, student, ..."
2,A3BUDYITWUSIS7,41291905,joyce gabriel cornett,"[0, 0]",5,Vivalldi's Four Seasons,1381708800,"10 14, 2013","[([this, is, and, excellent, edition, and, per..."
3,A19K10Z0D2NTZK,41913574,TexasCowboy,"[0, 0]",5,Full score: voice and orchestra,1285200000,"09 23, 2010","[([perfect, for, someone, who, is, an, opera, ..."
4,A14X336IB4JD89,201891859,dfjm53,"[0, 1]",1,Unable to determine contents,1350432000,"10 17, 2012","[([how, many, nocturnes, does, it, contain, ?]..."


In [9]:
sentences = []
for r in df['reviewText']:
    sentences += r
print(sentences[0])

TaggedDocument(['the', 'portfolio', 'is', 'fine', 'except', 'for', 'the', 'fact', 'that', 'the', 'last', 'movement', 'of', 'sonata', '#', '6', 'is', 'missing', '.'], ['A1YS9MDZP93857'])


In [10]:
#Dimensionality of the resulting word vectors
num_features = 300

#Minimum word count threshold
min_word_count = 4

#Number of threads to run in parallel
num_workers = multiprocessing.cpu_count()

#Context window length
context_size = 4

#Seed for the RNG, to make the result reproducible
seed = 1

In [11]:
import gensim

doc2vec_model = gensim.models.doc2vec.Doc2Vec(
    seed=seed,
    workers=num_workers, 
    vector_size=num_features, 
    min_count=min_word_count,
    epochs = 3,
    window=context_size)

In [12]:
doc2vec_model.build_vocab(sentences)
print("The vocabword2vec_model.iterulary is built")
print("Word2Vec vocabulary length: ", len(doc2vec_model.wv.vocab))

The vocabword2vec_model.iterulary is built
Word2Vec vocabulary length:  21317


In [13]:
#Start training the model
doc2vec_model.train(documents=sentences,total_examples=doc2vec_model.corpus_count,epochs=doc2vec_model.epochs)
print("Training finished")

Training finished


In [14]:
#Save the model
doc2vec_model.save("reviews_ebeddings.d2v")
print("Model saved")

Model saved


<h1><center>Visualizing the output from doc2vec</center></h1>

<p>Note: Run the following block independently, if the doc2vec model is already trained and the results are stored.</p>

In [2]:
import gensim

In [15]:
# Load our word2vec model
d2v_model = gensim.models.doc2vec.Doc2Vec.load("reviews_ebeddings.d2v")
print("Model loaded")

Model loaded


In [16]:
# the total number of documents and their vectors (iteratable)
len(d2v_model.docvecs)

17531

In [17]:
for i in range(len(d2v_model.docvecs)):
    print(type(d2v_model.docvecs[i]))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.nd

In [18]:
# the number of words 
len(d2v_model.wv.vocab)

21317

In [19]:
d2v_model.most_similar("camera")

  """Entry point for launching an IPython kernel.


[('camcorder', 0.9152639508247375),
 ('computer', 0.914709210395813),
 ('amp', 0.9029631614685059),
 ('plug', 0.8744760751724243),
 ('receiver', 0.8733190298080444),
 ('system', 0.8730030059814453),
 ('pc', 0.8691247701644897),
 ('board', 0.8643085956573486),
 ('pedal', 0.8578853607177734),
 ('needle', 0.8534005880355835)]

In [20]:
# the document tags or the ids of each document for lookup
d2v_model.docvecs.doctags

{'A1YS9MDZP93857': Doctag(offset=0, word_count=24, doc_count=2),
 'A3TS466QBAWB9D': Doctag(offset=1, word_count=107, doc_count=4),
 'A3BUDYITWUSIS7': Doctag(offset=2, word_count=34, doc_count=3),
 'A19K10Z0D2NTZK': Doctag(offset=3, word_count=145, doc_count=4),
 'A14X336IB4JD89': Doctag(offset=4, word_count=27, doc_count=3),
 'A2HR0IL3TC4CKL': Doctag(offset=5, word_count=8742, doc_count=238),
 'A2DHYD72O52WS5': Doctag(offset=6, word_count=162, doc_count=8),
 'A1MUVHT8BONL5K': Doctag(offset=7, word_count=100, doc_count=10),
 'A15GZQZWKG6KZM': Doctag(offset=8, word_count=30, doc_count=3),
 'A16WE7UU0QD33D': Doctag(offset=9, word_count=77, doc_count=6),
 'AXMWZYP2IROMP': Doctag(offset=10, word_count=261, doc_count=7),
 'A6DCKXX4659CR': Doctag(offset=11, word_count=247, doc_count=14),
 'A28YJZCV43ZWQW': Doctag(offset=12, word_count=201, doc_count=3),
 'A2I4CV4PGZCNAF': Doctag(offset=13, word_count=98, doc_count=4),
 'A1M7OWNI93N1Y3': Doctag(offset=14, word_count=189, doc_count=4),
 'A2FEDD