# 1. Basic Settings

In [50]:
import gensim
import os
import logging

# Log settings
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# 2. Training - Import file list

In [3]:
from os import listdir
from os.path import isfile, join
docList = []
docList = [f for f in listdir("D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Science-related Texts\\") if f.endswith('.txt')]


In [51]:
len(docList)

10258

In [68]:
# Note that this imports ALL FILES in dir
class SentenceIterator(object):

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield gensim.utils.simple_preprocess(line)

# a memory - friendly iterator
sentence_interator = SentenceIterator('D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Science-related Texts')

# Get doc labels
- Doc labels are *titles* of each text
- titles are at the forefront of a text, with the delimiters %&% front to back on the title

In [36]:
import re

directory = "D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Science-related Texts\\"
pattern = re.compile(r"%&%.*%&%") # to get title of text

titleList = []

for fileName in docList:
    
    with open(directory + fileName, encoding='utf-8') as f:
        text = f.read()
    
    matched = pattern.match(text)
    title = matched.group().strip("%&%")
    titleList.append(title)
    
    # print(title)

Load file data

In [81]:
texts = []

for file in os.listdir(directory):
    texts.append( str( open( os.path.join(directory, file), encoding='utf-8').readlines() ))

# Labeled Sentence interator

In [78]:
# import gensim
# LabeledSentence = gensim.models.doc2vec.LabeledSentence
# preprocess = gensim.utils.simple_preprocess


class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        LabeledSentence = gensim.models.doc2vec.LabeledSentence
        preprocess = gensim.utils.simple_preprocess
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield LabeledSentence(words=preprocess(doc),tags=[self.labels_list[idx]])

# Training

In [112]:
sentences = LabeledLineSentence(texts, titleList)

In [113]:
# This is too slow to run
model = gensim.models.Doc2Vec(size=300, window=10, min_count=5, workers=11,alpha=0.025, min_alpha=0.025) # use fixed learning rate
model.build_vocab(sentences)
for epoch in range(10):
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.002 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no deca
    model.train(it, total_examples=model.corpus_count, epochs=model.iter)

2017-11-04 21:52:49,322 : INFO : collecting all words and their counts
2017-11-04 21:52:49,327 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2017-11-04 21:53:30,102 : INFO : PROGRESS: at example #10000, processed 23403653 words (573990/s), 199240 word types, 9206 tags
2017-11-04 21:53:30,589 : INFO : collected 199240 word types and 9206 unique tags from a corpus of 10258 examples and 23683065 words
2017-11-04 21:53:30,590 : INFO : Loading a fresh vocabulary
2017-11-04 21:53:31,731 : INFO : min_count=5 retains 72438 unique words (36% of original 199240, drops 126802)
2017-11-04 21:53:31,732 : INFO : min_count=5 leaves 23475162 word corpus (99% of original 23683065, drops 207903)
2017-11-04 21:53:32,046 : INFO : deleting the raw counts dictionary of 199240 items
2017-11-04 21:53:32,054 : INFO : sample=0.001 downsamples 32 most-common words
2017-11-04 21:53:32,056 : INFO : downsampling leaves estimated 18884695 word corpus (80.4% of prior 23475162)
2017-1

2017-11-04 21:54:43,087 : INFO : PROGRESS: at 20.58% examples, 281168 words/s, in_qsize 20, out_qsize 1
2017-11-04 21:54:44,095 : INFO : PROGRESS: at 20.83% examples, 281313 words/s, in_qsize 19, out_qsize 0
2017-11-04 21:54:45,106 : INFO : PROGRESS: at 21.11% examples, 281163 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:54:46,131 : INFO : PROGRESS: at 21.33% examples, 281139 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:54:47,179 : INFO : PROGRESS: at 21.61% examples, 281161 words/s, in_qsize 20, out_qsize 1
2017-11-04 21:54:48,180 : INFO : PROGRESS: at 21.83% examples, 281222 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:54:49,199 : INFO : PROGRESS: at 22.12% examples, 281208 words/s, in_qsize 20, out_qsize 1
2017-11-04 21:54:50,205 : INFO : PROGRESS: at 22.40% examples, 281508 words/s, in_qsize 18, out_qsize 0
2017-11-04 21:54:51,209 : INFO : PROGRESS: at 22.67% examples, 281310 words/s, in_qsize 20, out_qsize 1
2017-11-04 21:54:52,223 : INFO : PROGRESS: at 22.89% examples, 2

2017-11-04 21:56:03,637 : INFO : PROGRESS: at 43.92% examples, 281135 words/s, in_qsize 20, out_qsize 1
2017-11-04 21:56:04,643 : INFO : PROGRESS: at 44.11% examples, 281160 words/s, in_qsize 22, out_qsize 0
2017-11-04 21:56:05,652 : INFO : PROGRESS: at 44.30% examples, 281135 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:56:06,661 : INFO : PROGRESS: at 44.46% examples, 281009 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:56:07,676 : INFO : PROGRESS: at 44.68% examples, 280967 words/s, in_qsize 22, out_qsize 0
2017-11-04 21:56:08,687 : INFO : PROGRESS: at 44.90% examples, 280824 words/s, in_qsize 20, out_qsize 1
2017-11-04 21:56:09,722 : INFO : PROGRESS: at 45.15% examples, 280900 words/s, in_qsize 19, out_qsize 2
2017-11-04 21:56:10,751 : INFO : PROGRESS: at 45.39% examples, 280809 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:56:11,789 : INFO : PROGRESS: at 45.62% examples, 280884 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:56:12,821 : INFO : PROGRESS: at 45.82% examples, 2

2017-11-04 21:57:24,162 : INFO : PROGRESS: at 66.78% examples, 280833 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:57:25,198 : INFO : PROGRESS: at 67.04% examples, 280834 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:57:26,227 : INFO : PROGRESS: at 67.23% examples, 280946 words/s, in_qsize 22, out_qsize 0
2017-11-04 21:57:27,244 : INFO : PROGRESS: at 67.42% examples, 280846 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:57:28,256 : INFO : PROGRESS: at 67.65% examples, 280923 words/s, in_qsize 22, out_qsize 0
2017-11-04 21:57:29,282 : INFO : PROGRESS: at 67.89% examples, 280887 words/s, in_qsize 19, out_qsize 2
2017-11-04 21:57:30,287 : INFO : PROGRESS: at 68.09% examples, 280945 words/s, in_qsize 20, out_qsize 1
2017-11-04 21:57:31,333 : INFO : PROGRESS: at 68.43% examples, 280963 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:57:32,344 : INFO : PROGRESS: at 68.76% examples, 280987 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:57:33,365 : INFO : PROGRESS: at 69.13% examples, 2

2017-11-04 21:58:44,604 : INFO : PROGRESS: at 90.37% examples, 280525 words/s, in_qsize 20, out_qsize 1
2017-11-04 21:58:45,622 : INFO : PROGRESS: at 90.70% examples, 280510 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:58:46,672 : INFO : PROGRESS: at 91.02% examples, 280461 words/s, in_qsize 20, out_qsize 1
2017-11-04 21:58:47,693 : INFO : PROGRESS: at 91.40% examples, 280481 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:58:48,696 : INFO : PROGRESS: at 91.74% examples, 280488 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:58:49,715 : INFO : PROGRESS: at 92.08% examples, 280501 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:58:50,747 : INFO : PROGRESS: at 92.38% examples, 280420 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:58:51,760 : INFO : PROGRESS: at 92.71% examples, 280416 words/s, in_qsize 21, out_qsize 0
2017-11-04 21:58:52,771 : INFO : PROGRESS: at 93.05% examples, 280431 words/s, in_qsize 22, out_qsize 0
2017-11-04 21:58:53,783 : INFO : PROGRESS: at 93.39% examples, 2

NameError: name 'it' is not defined

In [114]:
model.wv.vocab

{'american': <gensim.models.keyedvectors.Vocab at 0x2610b63e438>,
 'icon': <gensim.models.keyedvectors.Vocab at 0x2617bc18390>,
 'section': <gensim.models.keyedvectors.Vocab at 0x2610b492668>,
 'at': <gensim.models.keyedvectors.Vocab at 0x2610b503390>,
 'the': <gensim.models.keyedvectors.Vocab at 0x2610bd24940>,
 'museum': <gensim.models.keyedvectors.Vocab at 0x2610bd24b38>,
 'centenary': <gensim.models.keyedvectors.Vocab at 0x2610bd24c50>,
 'of': <gensim.models.keyedvectors.Vocab at 0x2610bd24a20>,
 'her': <gensim.models.keyedvectors.Vocab at 0x2610bd24cc0>,
 'birth': <gensim.models.keyedvectors.Vocab at 0x2610bd24cf8>,
 'margaret': <gensim.models.keyedvectors.Vocab at 0x2610bd24d30>,
 'mead': <gensim.models.keyedvectors.Vocab at 0x2610bd24da0>,
 'is': <gensim.models.keyedvectors.Vocab at 0x2610bd24dd8>,
 'remembered': <gensim.models.keyedvectors.Vocab at 0x2610bd24e10>,
 'as': <gensim.models.keyedvectors.Vocab at 0x2610bd24198>,
 'world': <gensim.models.keyedvectors.Vocab at 0x2610bd

# 4. Save or Load model

In [131]:
# model.save("D:\\Models\\Doc2Vec(Sci_related).bin")
model = gensim.models.Doc2Vec.load("D:\\Models\\Doc2Vec(Sci_related).bin")

2017-11-04 22:06:04,007 : INFO : loading Doc2Vec object from D:\Models\Doc2Vec(Sci_related).bin
2017-11-04 22:06:04,430 : INFO : loading wv recursively from D:\Models\Doc2Vec(Sci_related).bin.wv.* with mmap=None
2017-11-04 22:06:04,432 : INFO : loading syn0 from D:\Models\Doc2Vec(Sci_related).bin.wv.syn0.npy with mmap=None
2017-11-04 22:06:04,480 : INFO : setting ignored attribute syn0norm to None
2017-11-04 22:06:04,482 : INFO : loading docvecs recursively from D:\Models\Doc2Vec(Sci_related).bin.docvecs.* with mmap=None
2017-11-04 22:06:04,484 : INFO : loading syn1neg from D:\Models\Doc2Vec(Sci_related).bin.syn1neg.npy with mmap=None
2017-11-04 22:06:04,541 : INFO : setting ignored attribute cum_table to None
2017-11-04 22:06:04,542 : INFO : loaded D:\Models\Doc2Vec(Sci_related).bin


# 5. Look-up for similar documents

In [132]:
model.similar_by_word("internet")

2017-11-04 22:06:04,760 : INFO : precomputing L2-norms of word weight vectors


[('web', 0.4793151617050171),
 ('facebook', 0.42751970887184143),
 ('broadband', 0.3999687731266022),
 ('network', 0.36445242166519165),
 ('openid', 0.3539382219314575),
 ('mail', 0.33882203698158264),
 ('wireless', 0.3299103081226349),
 ('blogs', 0.3297681212425232),
 ('edi', 0.32874196767807007),
 ('phishing', 0.3247973918914795)]

In [133]:
model.similar_by_word("data")

[('information', 0.4409201145172119),
 ('statistics', 0.3989573121070862),
 ('records', 0.36948490142822266),
 ('samples', 0.36515748500823975),
 ('observations', 0.3583360016345978),
 ('archives', 0.3361548185348511),
 ('measurements', 0.33312302827835083),
 ('readings', 0.3325759768486023),
 ('documents', 0.3289825916290283),
 ('timings', 0.3274507522583008)]

In [134]:
model.similar_by_word("developers")

[('programmers', 0.5002354383468628),
 ('users', 0.4927016496658325),
 ('vendors', 0.46623820066452026),
 ('designers', 0.42613762617111206),
 ('stutterers', 0.41394639015197754),
 ('ecologists', 0.4031423330307007),
 ('managers', 0.398607075214386),
 ('companies', 0.38800886273384094),
 ('manufacturers', 0.38262853026390076),
 ('advocates', 0.3826046884059906)]

In [154]:
vector = model.infer_vector("""data science""".split())
model.docvecs.most_similar([vector]) # similar docs & accuracy

[(' Fever Flow', 0.3851379156112671),
 (' Solar System Mysteries', 0.3793867230415344),
 (' Miniplanet sports a megapeak', 0.37395384907722473),
 (' Studying Biodiversity: Is a New Paradigm Really Needed? ',
  0.3729775547981262),
 (' Planets common as stars in galaxy', 0.372067391872406),
 (' Planck studies the microwave galaxy', 0.3694920241832733),
 (' Data Recovery', 0.3680240511894226),
 (' BENCHED SCIENCE.', 0.3646535873413086),
 (' Exoplanets make pictorial debut', 0.3609537184238434),
 (' When Science Turns Suspect The Office of Research Integrity--a.k.a., the Fraud Squad--is on the case.',
  0.3602791726589203)]

In [124]:
similar_docs = model.docvecs.most_similar([vector])
file_names = [i[0] for i in similar_docs] # slice the first elements
results = []
        
for name in file_names:
    text = open("D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Science-related Texts\\" + name, 'r')
    results.append(text.read())
    text.close()
        
i = 0

for text in results:
    print("Document", i+1, ":", file_names[i], "\t accuracy:", similar_docs[i][1], "\n", (text[0:500]), "\n" ) 
    i += 1

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\COCA & GloWbE\\COCA\\Pre-processed Text\\Science-related Texts\\ A new language will improve online applications Cloud Programming'