In [2]:
import pandas as pd
# Load the reviews JSON file into a pandas DataFrame.
# NOTE(review): the path points into one user's home directory ("~/Dropbox/...");
# change it before running the notebook on another machine.
data_file_name = "~/Dropbox/DataRepo/AmazonReviews_Musical_Instruments/reviews_Musical_Instruments.json"
# lines=True makes pandas read the file as one JSON object per line
raw_df = pd.read_json(data_file_name, lines=True)
# if you want to load a csv file then do
# raw_df = pd.read_csv(data_file_name,encoding = "ISO-8859-1")
print("Data loaded")

Data loaded


In [3]:
# Summarise the frame: number of rows, column names, non-null counts, dtypes
# and approximate memory usage
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500176 entries, 0 to 500175
Data columns (total 9 columns):
reviewerID        500176 non-null object
asin              500176 non-null object
reviewerName      497590 non-null object
helpful           500176 non-null object
reviewText        500176 non-null object
overall           500176 non-null int64
summary           500176 non-null object
unixReviewTime    500176 non-null int64
reviewTime        500176 non-null object
dtypes: int64(2), object(7)
memory usage: 34.3+ MB


In [4]:
# Glue every review text, each followed by one space, into a single long
# string and report how many characters that string holds.
raw_corpus = "".join(review + " " for review in raw_df['reviewText'])
print("Raw Corpus contains {0:,} characters".format(len(raw_corpus)))

Raw Corpus contains 241,653,315 characters


We will use the Punkt tokenizer as the first pre-processing step. Punkt is a sentence tokenizer: it detects sentence boundaries in a text and splits the text into individual sentences. More details can be found here: https://subscription.packtpub.com/book/application_development/9781782167853/1/ch01lvl1sec12/training-a-sentence-tokenizer

In [5]:
# import natural language toolkit
import nltk
# Download the 'punkt' package through the NLTK downloader; the files are
# stored in the user's nltk_data directory so the tokenizer can be loaded later.
nltk.download('punkt')
print("The punkt tokenizer is downloaded")

The punkt tokenizer is downloaded


[nltk_data] Downloading package punkt to /home/vineeth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Load the English punkt sentence tokenizer from the downloaded NLTK data.
# NOTE(review): this resource is a pickle file; recent NLTK releases reportedly
# replaced it with the 'punkt_tab' resource — confirm against the installed
# NLTK version before upgrading.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
print("The punkt tokenizer is loaded")
# Split the whole corpus string into a list of raw sentence strings
raw_sentences = tokenizer.tokenize(raw_corpus)
print("We have {0:,} raw sentences".format(len(raw_sentences)))

The punkt tokenizer is loaded
We have 2,492,388 raw sentences


In [7]:
import re, tqdm
# Clean and split sentence into words
# The pattern is compiled once here so that each call (one per sentence) skips
# the re.compile step.
_NON_LETTERS = re.compile("[^A-Za-z]+")


def clean_and_split_str(string):
    """Return the alphabetic words of ``string`` as a list.

    Every run of characters other than the ASCII letters A-Z / a-z (digits,
    punctuation, whitespace, non-ASCII letters) is treated as a separator.

    Parameters
    ----------
    string : str
        Raw sentence text.

    Returns
    -------
    list of str
        The words in their original order and case; an empty list when the
        input contains no ASCII letters.
    """
    # str.split() without arguments already ignores leading/trailing
    # whitespace, so no separate strip() is required.
    return _NON_LETTERS.sub(" ", string).split()

In [8]:
# Turn every non-empty raw sentence into its list of cleaned words
sentences = [
    clean_and_split_str(raw_sent)
    for raw_sent in raw_sentences
    if len(raw_sent) > 0
]
print("We have {0:,} clean sentences".format(len(sentences)))

We have 2,492,388 clean sentences


In [9]:
# Spot-check one cleaned sentence (index 10): it should be a list of word tokens
sentences[10]

['Aida',
 'was',
 'first',
 'performed',
 'at',
 'the',
 'Khedivial',
 'Opera',
 'House',
 'in',
 'Cairo',
 'on',
 'December',
 'conducted',
 'by',
 'Giovanni',
 'Bottesini',
 'Overview',
 'Aida',
 'an',
 'Ethiopian',
 'princess',
 'is',
 'captured',
 'and',
 'brought',
 'into',
 'slavery',
 'in',
 'Egypt']

In [10]:
# Show a raw sentence next to its cleaned, tokenised form (index 20 in both lists)
print(raw_sentences[20])
print()
print(sentences[20])

.the music slowed at times for structural rigour.. .

['the', 'music', 'slowed', 'at', 'times', 'for', 'structural', 'rigour']


In [12]:
# Total number of word tokens over all cleaned sentences
token_count = sum(map(len, sentences))
print("The dataset corpus contains {0:,} tokens".format(token_count))

The dataset corpus contains 44,962,028 tokens


In [13]:
import multiprocessing

# Hyper-parameters passed to the Doc2Vec constructor in a later cell.

# Dimensionality of the resulting vectors (passed as vector_size)
num_features = 300

# Minimum word count threshold (passed as min_count)
min_word_count = 4

# Number of worker threads to run in parallel: one per CPU reported by the OS
num_workers = multiprocessing.cpu_count()

# Context window length (passed as window)
context_size = 4

# Seed for the RNG.
# NOTE(review): the gensim documentation states that a fully reproducible run
# additionally needs a single worker thread (workers=1); with
# num_workers = cpu_count() results may still vary between runs — confirm.
seed = 1

In [15]:
import gensim

# Create the (still untrained) Doc2Vec model from the hyper-parameters
# defined in the configuration cell.
doc2vec_model = gensim.models.doc2vec.Doc2Vec(
    seed=seed,
    workers=num_workers,
    vector_size=num_features,
    min_count=min_word_count,
    epochs=10,  # stored on the model; the training cell reads it back as doc2vec_model.epochs
    window=context_size,
)

In [16]:
# For Doc2Vec to work, every sentence must be wrapped in a TaggedDocument; the
# tag used here is the sentence's index in the list.
# Entries that are already TaggedDocument instances are kept unchanged, so
# re-running this cell does not wrap them a second time.
sentences = [
    s if isinstance(s, gensim.models.doc2vec.TaggedDocument)
    else gensim.models.doc2vec.TaggedDocument(s, [i])
    for i, s in enumerate(sentences)
]

In [17]:
# Spot-check the first tagged document: a words list plus its tags list
sentences[0]

TaggedDocument(words=['The', 'portfolio', 'is', 'fine', 'except', 'for', 'the', 'fact', 'that', 'the', 'last', 'movement', 'of', 'sonata', 'is', 'missing'], tags=[0])

In [18]:
# Build the model's vocabulary from the tagged sentences (must happen before training)
doc2vec_model.build_vocab(sentences)
print("The vocabulary is built")
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 reportedly
# exposes the same mapping as `wv.key_to_index` — confirm against the installed version.
print("Word2Vec vocabulary length: ", len(doc2vec_model.wv.vocab))

The vocabword2vec_model.iterulary is built
Word2Vec vocabulary length:  80326


In [20]:
# Train on the tagged sentences, taking the example count and the number of
# epochs from the model's own corpus_count and epochs attributes.
doc2vec_model.train(
    documents=sentences,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)
print("Training finished")

Training finished


In [23]:
# Save the model under a relative file name, i.e. into the current working directory
doc2vec_model.save("reviews_Musical_Instruments.d2v")
print("Model saved")

Model saved
