## Doc2Vec

In [1]:
#!pip install gensim==3.8.3

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### (A) Load the data

In [3]:
import pandas as pd
import numpy as np
path = '/content/gdrive/My Drive/data/gensim'
pd.set_option('display.max_colwidth', -1)
data = pd.read_csv(path + "/ag_news_train.csv")
#train = train[train['Class Index']==3]

  pd.set_option('display.max_colwidth', -1)


In [4]:
train = list(data['Description'])
train_length = len(train)
print(train_length)

120000


In [5]:
from gensim.parsing.preprocessing import preprocess_string
train_tokenized = []
for doc in train:
   k = preprocess_string(doc)
   train_tokenized.append(k)

In [6]:
train_tokenized[0:3]

[['reuter',
  'short',
  'seller',
  'wall',
  'street',
  'dwindl',
  'band',
  'ultra',
  'cynic',
  'see',
  'green'],
 ['reuter',
  'privat',
  'invest',
  'firm',
  'carlyl',
  'group',
  'reput',
  'make',
  'time',
  'occasion',
  'controversi',
  'plai',
  'defens',
  'industri',
  'quietli',
  'place',
  'bet',
  'market'],
 ['reuter',
  'soar',
  'crude',
  'price',
  'plu',
  'worri',
  'economi',
  'outlook',
  'earn',
  'expect',
  'hang',
  'stock',
  'market',
  'week',
  'depth',
  'summer',
  'doldrum']]

- Remember to re-use the code from Chapter 2. Do not re-invent the wheel.

In [7]:
# Gensim Libraries
import gensim
import pprint as pp
import random
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### (B) Tag the data

In [8]:
# Print out the first three to see
train_tagged_data = [TaggedDocument(words=w, tags=[str(i)]) for i,w in enumerate(train_tokenized[0:3])]
pp.pprint(train_tagged_data)

[TaggedDocument(words=['reuter', 'short', 'seller', 'wall', 'street', 'dwindl', 'band', 'ultra', 'cynic', 'see', 'green'], tags=['0']),
 TaggedDocument(words=['reuter', 'privat', 'invest', 'firm', 'carlyl', 'group', 'reput', 'make', 'time', 'occasion', 'controversi', 'plai', 'defens', 'industri', 'quietli', 'place', 'bet', 'market'], tags=['1']),
 TaggedDocument(words=['reuter', 'soar', 'crude', 'price', 'plu', 'worri', 'economi', 'outlook', 'earn', 'expect', 'hang', 'stock', 'market', 'week', 'depth', 'summer', 'doldrum'], tags=['2'])]


In [9]:
# Apply to all records
train_tagged_data = [TaggedDocument(words=w, tags=[str(i)]) for i,w in enumerate(train_tokenized)]

### (C) Modeling

In [10]:
# The default min_count in gensim's Word2Vec is set to 5.
# If there is no word in the vocab with frequency greater than 4, the vocab will be empty. We will get an error message in the .build_vocab() step.
# Because our dataset is small, we will set 'min_count=1'.
model = gensim.models.doc2vec.Doc2Vec(vector_size=32, min_count=2, epochs=100)
model.build_vocab(train_tagged_data)

In [11]:
model.train(train_tagged_data, total_examples=model.corpus_count, epochs=100)

###(D) Save

In [15]:
# The model
model.save(path + "/doc2vec.model")

In [17]:
# The training data
import pickle
train_file = path + '/ag_news_train.pkl'
train_tokenized_file = path + '/ag_news_train_tokenized.pkl'

with open(train_file, "wb") as fp:
   pickle.dump(train, fp)
with open(train_tokenized_file, "wb") as fp:
   pickle.dump(train, fp)

#### (E) Production -- an existing article

In [19]:
import pandas as pd
import numpy as np
import pprint as pp
from gensim.models.doc2vec import Doc2Vec
from gensim.test.utils import get_tmpfile
fname = get_tmpfile(path + "/doc2vec.model")
model = Doc2Vec.load(fname)

In [22]:
import pickle
train_file = path + '/ag_news_train.pkl'
train_tokenized_file = path + '/ag_news_train_tokenized.pkl'

with open(train_file, "rb") as fp:
   train = pickle.load(fp)
with open(train_tokenized_file, "rb") as fp:
   train_tokenized = pickle.load(fp)

#### (E.2) Production -- a new article

In [23]:
take_one = 79
print(train[take_one])
similar_doc = model.docvecs.most_similar(take_one)
print('The top similar ones are:')
pp.pprint(similar_doc)

Reuters - A group of technology companies\including Texas Instruments Inc. (TXN.N), STMicroelectronics\(STM.PA) and Broadcom Corp. (BRCM.O), on Thursday said they\will propose a new wireless networking standard up to 10 times\the speed of the current generation.
The top similar ones are:
[('1153', 0.9342728853225708),
 ('57599', 0.7566302418708801),
 ('51164', 0.7441697120666504),
 ('7069', 0.7431583404541016),
 ('72914', 0.7323065400123596),
 ('117732', 0.726141095161438),
 ('109640', 0.7258638739585876),
 ('22875', 0.7141897678375244),
 ('93659', 0.6943594813346863),
 ('64110', 0.6815279722213745)]


  similar_doc = model.docvecs.most_similar(take_one)


In [24]:
for i in range(len(similar_doc)):
  id = similar_doc[i][0]
  print('Id:',id, 'the news is:', train[int(id)])

Id: 1153 the news is:  LOS ANGELES (Reuters) - A group of technology companies  including Texas Instruments Inc. &lt;A HREF="http://www.reuters.co.uk/financeQuoteLookup.jhtml?ticker=TXN.N qtype=sym infotype=info qcat=news"&gt;TXN.N&lt;/A&gt;, STMicroelectronics  &lt;A HREF="http://www.reuters.co.uk/financeQuoteLookup.jhtml?ticker=STM.PA qtype=sym infotype=info qcat=news"&gt;STM.PA&lt;/A&gt; and Broadcom Corp. &lt;A HREF="http://www.reuters.co.uk/financeQuoteLookup.jhtml?ticker=BRCM.O qtype=sym infotype=info qcat=news"&gt;BRCM.O&lt;/A&gt;, on Thursday said they  will propose a new wireless networking standard up to 10 times  the speed of the current generation.
Id: 57599 the news is: Reuters - The decision by British authorities\to suspend the license of a major influenza vaccine\manufacturer just before the vaccine was to be shipped shows a\need for better communication, acting FDA chief Dr. Lester\Crawford said on Monday.
Id: 51164 the news is: New version of the Efficeon processor us

In [25]:
from gensim.parsing.preprocessing import preprocess_string
doc = "Crude prices inflation the economy outlook earnings"
doc_tokenized = preprocess_string(doc)
doc_tokenized

['crude', 'price', 'inflat', 'economi', 'outlook', 'earn']

In [26]:
inferred_vector = model.infer_vector(doc_tokenized)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

  sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))


In [27]:
see = sims[0:10]
see

[('20917', 0.8012675046920776),
 ('20971', 0.7802791595458984),
 ('20633', 0.7754188179969788),
 ('77354', 0.769629180431366),
 ('39952', 0.7659934759140015),
 ('20560', 0.7553290724754333),
 ('109368', 0.7392926812171936),
 ('72103', 0.7335115671157837),
 ('53000', 0.7330428957939148),
 ('87562', 0.73142009973526)]

In [28]:
for i in range(len(see)):
  id = see[i][0]
  print('Id:',id, 'the news is:', train[int(id)])

Id: 20917 the news is: Oil prices eased further on Tuesday, as the head of OPEC cartel said that markets were well-supplied with crude and traders continued to take profits from this year #39;s 40 percent rally.
Id: 20971 the news is:  NEW YORK (Reuters) - U.S. oil prices eased on Tuesday as  the head of the OPEC cartel said markets were well-supplied  with crude and traders continued to take profits from this  year's 40 percent rally.
Id: 20633 the news is:  LONDON (Reuters) - Oil prices eased further on Tuesday, as  the head of the OPEC cartel said markets were well-supplied  with crude and traders continued to take profits from this  year's 40 percent rally.
Id: 77354 the news is:  NEW YORK (Reuters) - ChevronTexaco Corp.&lt;A HREF="http://www.investor.reuters.com/FullQuote.aspx?ticker=CVX.N target=/stocks/quickinfo/fullquote"&gt;CVX.N&lt;/A&gt;, the No. 2  U.S. oil company, on Friday reported a 62 percent rise in  quarterly profit from record oil prices and gains from asset  sales,