In [None]:
import os
import csv
import ast
import math
import spacy
import string
import glob
import multiprocessing
cores = multiprocessing.cpu_count()

import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm
import en_core_web_sm
nlp = en_core_web_sm.load()

import gensim
import gensim.models as g
from gensim.test.utils import get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
def frame_tag_docs(path, limit_per_doc=10000):
    '''
    SECTION: 4.6.1 - Training of Paragraph Embedding Model
    Function: - Modified form of document tagging. Every file matched by `path` is
    split on whitespace into tokens; a document longer than `limit_per_doc` tokens is
    cut into consecutive chunks of at most `limit_per_doc` tokens, and all chunks of
    one file share the same integer tag so that they train a single document vector.
    The method was explained in the discussion:
    https://groups.google.com/g/gensim/c/YPT19ow_4Do/m/NWz56vLeBwAJ?pli=1

    Parameters:
        path (str): glob pattern selecting the text files to tag.
        limit_per_doc (int): maximum number of tokens per TaggedDocument. The default
            of 10000 is the per-text limit discussed in the thread linked above.

    Returns:
        list: TaggedDocument objects in sorted file order. The n-th file (0-based)
        contributes one or more entries, all tagged [n].

    Raises:
        ValueError: if `limit_per_doc` is not a positive number.
    '''
    if limit_per_doc <= 0:
        raise ValueError('limit_per_doc must be a positive integer')

    tagged_documents = []
    doc_count = 0

    # glob returns matches in arbitrary, filesystem-dependent order; sorting keeps the
    # file -> integer tag assignment reproducible between runs and machines.
    for file_path in sorted(glob.glob(path)):
        # A '*' pattern also matches sub-directories, which open() cannot read.
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf8', errors='ignore') as infile:
            doc_content = infile.read().split()

        # max(..., 1) makes an empty file still yield one (empty) tagged document, so
        # tag numbers stay aligned with the files that were read.
        for start in range(0, max(len(doc_content), 1), limit_per_doc):
            chunk = doc_content[start:start + limit_per_doc]
            tagged_documents.append(TaggedDocument(words=chunk, tags=[doc_count]))

        doc_count += 1
        print('Tagging document completed for', doc_count)

    return tagged_documents


### Doc2Vec Hyper-parameters
**dm ({1,0}, optional)** – Defines the training algorithm. If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
* Choice of parameter - (dm = 0)
* Reason: https://www.aclweb.org/anthology/W16-1609.pdf
* The paper reports: "Our findings reveal that dbow, despite being the simpler model, is superior to dmpv."
* dbow favours longer windows for context words than dmpv.
* dmpv also requires more training epochs than dbow.

**vector_size (int, optional)** – Dimensionality of the feature vectors.
* Choice of parameter - vector size = 300
* Reason - https://www.aclweb.org/anthology/W16-1609.pdf

**window (int, optional)** – The maximum distance between the current and predicted word within a sentence.
* Choice of parameter - window = 5
* Example: with window = 4, the context of CENTER_WORD is w-4, w-3, w-2, w-1, w+1, w+2, w+3, w+4 (the center word itself is excluded).

**min_count (int, optional)** – Ignores all words with total frequency lower than this.
* Choice of parameter - min_count = 1
* Reason - https://groups.google.com/forum/#!topic/gensim/xKvUv-yZI2U
* As your dataset & vocabulary grow, words with just a few instances may be less interesting, and the added model size for a larger vocabulary may become a concern, so increasing min_count can make sense. 

**sample (float, optional)** – The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
* Choice of parameter - 1e-05
* Reason - https://groups.google.com/forum/#!topic/gensim/xKvUv-yZI2U
* Larger corpuses may benefit from a more-aggressive sample parameter (smaller value, eg 1e-05 or 1e-06), discarding more of the most-frequent words – and thus perhaps improving influence of less-common words, or freeing time for more passes or expansion of other parameters that would otherwise slow training.

**epochs (int, optional)** – Number of iterations (epochs) over the corpus.
* Choice of parameter - 20
* Reason - https://groups.google.com/forum/#!topic/gensim/xKvUv-yZI2U
* An epochs value of 10-20 is still likely to be a good starting point, perhaps trying more if your own evaluations can confirm improvement.

**hs or negative sampling** 
* Reason - https://stackoverflow.com/questions/46860197/doc2vec-and-word2vec-with-negative-sampling
* Hierarchical-softmax tends to get slower with larger vocabularies (because the average number of nodes involved in each training-example grows); negative-sampling does not (because it's always N+1 nodes). Projects with larger corpuses tend to trend towards preferring negative-sampling.

**hs ({1,0}, optional)** – If 1, hierarchical softmax will be used for model training. If set to 0, and negative is non-zero, negative sampling will be used.
* Reason - https://groups.google.com/forum/#!topic/gensim/9EaJAl95cPw
* They are two alternative options for how to calculate the predictions of the neural-network, and thus also the errors to be back-propagated. They are relevant options for both Word2Vec (both CBOW or Skip-Gram) and Doc2Vec (both DBOW and DM). Neither have any effect on whether `window` is consulted: `negative`-vs-`hs` are ways to construct/interpret the NN output.

**negative (int, optional)** – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
* Choice: negative = 5
* Reason - http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/
* The paper says that selecting 5-20 words works well for smaller datasets, and you can get away with only 2-5 words for large datasets.

In [None]:
def model_build_train(tagged_documents,
                      model_path=r'C:\Users\Shrikanth Singh\Desktop\Thesis-Note-to-Py\doc2vec.model'):
    '''
    Build the vocabulary for, train, and save a PV-DBOW (dm=0) Doc2Vec model.

    Parameters:
        tagged_documents: re-iterable collection of TaggedDocument objects (it is
            traversed once for the vocabulary scan and again for training).
        model_path (str): where the trained model is written. Defaults to the
            location originally hard-coded in this function.

    Returns:
        The trained Doc2Vec model (also saved to `model_path`).
    '''
    # `window` and `sample` are the keyword names Doc2Vec documents. The previous
    # spellings `window_size` / `sampling_threshold` are not Doc2Vec parameters, so
    # the intended sample=1e-5 was never applied (or the call failed outright,
    # depending on the installed gensim version).
    model = Doc2Vec(dm=0, vector_size=300, window=5, sample=1e-5,
                    negative=5, hs=0, min_count=1, workers=cores, epochs=20, alpha=0.025,
                    min_alpha=0.00025, ns_exponent=0.75)

    # The list comprehension only exists to drive the tqdm progress bar during the
    # vocabulary scan.
    model.build_vocab([x for x in tqdm(tagged_documents)])
    model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(model_path)
    print('Model Saved')
    return model

In [None]:
# Entry point: tag every file matched by the glob pattern below, then hand the
# tagged documents to model_build_train, which trains and saves the model.
if __name__ == '__main__':
    # NOTE(review): absolute, machine-specific Windows path; the trailing '*' matches
    # every entry in the folder. Adjust before running on another machine.
    path = r"C:\Users\Shrikanth Singh\Desktop\Thesis-Note-to-Py\sample files for word2vec\*"
    tagged_documents = frame_tag_docs(path)
    model_build_train(tagged_documents)