In [None]:
import os
import nltk
import spacy
import glob
import pandas as pd
import numpy as np
import multiprocessing
nltk.download('punkt')
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from timeit import default_timer as timer
nlp = spacy.load('en', disable=['ner', 'parser'])

In [None]:
def lemmatization(doc):
    '''
    Function: Runs the text through the module-level spaCy pipeline `nlp` and
    rebuilds it from the lemmas of the tokens that are not stop words.

    Parameters:
        doc: the raw text of one document.

    Returns: a single string with the kept lemmas separated by one space
    (an empty string when nothing is kept).
    '''
    kept_lemmas = []
    for token in nlp(doc):
        # stop words are dropped entirely
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)
    
    
def display_closestwords_tsnescatterplot(model, word):
    '''
    Figure 4.6: t-SNE plot for similar words
    Function: Generates a t-Distributed Stochastic Neighbor Embedding graph to view the goodness of the model. The vector of
    `word` and the vectors of its closest words are projected onto a two dimensional plane and drawn as a labelled
    scatter plot.

    Parameters:
        model: a trained word2vec model, or its keyed vectors; anything that offers `similar_by_word(word)` and
               vector lookup by word (directly or through a `.wv` attribute).
        word:  the query word whose neighbourhood is plotted.

    Returns: None. The figure is displayed with plt.show().
    '''

    # A Word2Vec model keeps its vectors on `.wv`; recent gensim releases no
    # longer forward lookups from the model object itself. An object without
    # `.wv` is used as-is.
    vectors = getattr(model, 'wv', model)

    # the query word comes first, followed by its closest words
    word_labels = [word]
    for wrd_score in vectors.similar_by_word(word):
        word_labels.append(wrd_score[0])

    # One row per label, built in a single step. The width follows the model's
    # own vector size instead of being fixed to 100.
    arr = np.array([vectors[label] for label in word_labels], dtype='f')

    # find tsne coords for 2 dimensions
    # t-SNE requires perplexity < number of samples; the library default (30)
    # is larger than the handful of points plotted here.
    perplexity = min(30.0, len(word_labels) - 1)
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    # Pad outwards on both sides so the extreme points stay inside the axes
    # (the lower bound must be min - pad, not min + pad).
    pad = 0.00005
    plt.xlim(x_coords.min() - pad, x_coords.max() + pad)
    plt.ylim(y_coords.min() - pad, y_coords.max() + pad)
    plt.show()
    
    
def word2vec_training(tokens, model_path='Thesis - Dataset and Transformations/word2vec/Word2Vec_100d.model'):
    '''
    SECTION - 4.4.3 - Similar Word Replacement
    Function: Defines the word2vec model with its hyperparameters, builds the vocabulary from the dataset, trains the model
    and finally saves it at the desired location.

    Parameters:
        tokens:     the tokenized corpus, one list of string tokens per document. It is iterated twice (vocabulary
                    building, then training), so it must be re-iterable (e.g. a list, not a generator).
        model_path: file path the trained model is saved to. Missing parent directories are created.

    Returns: None. Progress and the elapsed time are printed.
    '''
    start = timer()
    print('Generating word embeddings...')

    # Create the output directory up front so a bad path fails before the
    # training run rather than after it.
    target_dir = os.path.dirname(model_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    cores = multiprocessing.cpu_count()
    # leave one core free, but never drop to zero workers on a single-core machine
    workers = max(1, cores - 1)
    print('- using {} out of {} CPU cores'.format(workers, cores))

    # Defining the hyperparameters for the word2vec model.
    hyperparameters = dict(min_count=10,
                           window=5,
                           sample=6e-5,
                           alpha=0.03,
                           min_alpha=0.0007,
                           workers=workers)
    try:
        # gensim >= 4.0 names the embedding width `vector_size`
        model = Word2Vec(vector_size=100, **hyperparameters)
    except TypeError:
        # older gensim releases only know the former name `size`
        model = Word2Vec(size=100, **hyperparameters)

    # build the vocabulary table
    print('- building the vocabulary table')
    model.build_vocab(tokens, progress_per=10000)

    # model training
    print('- training the word2vec model')
    model.train(tokens, total_examples=model.corpus_count, epochs=20, report_delay=1)

    model.save(model_path)
    print('- word2vec model saved at {}'.format(model_path))

    end = timer()
    print('- took {:.2f} minutes'.format((end - start) / 60))

In [None]:
if __name__ == '__main__':

    # Gather the training documents in sorted order. glob.glob() itself gives
    # no ordering guarantee, so sorting keeps the corpus order stable between runs.
    # NOTE(review): the pattern treats 'Train Docs.zip' as a directory; glob cannot
    # look inside a zip archive - confirm this is an extracted folder.
    corpus_pattern = r"Thesis - Dataset and Transformations/doc2vec/Train Docs.zip/*"
    # sub-directories can match the pattern too, but they cannot be opened as text
    doc_paths = sorted(path for path in glob.glob(corpus_pattern) if os.path.isfile(path))
    if not doc_paths:
        # fail here with a clear message instead of later inside the word2vec training
        raise FileNotFoundError('No training documents matched {!r}'.format(corpus_pattern))

    tokenized_dataset = []
    for doc_path in doc_paths:
        # undecodable bytes are skipped (errors='ignore') rather than aborting the run
        with open(doc_path, 'r', encoding='utf8', errors='ignore') as infile:
            doc_content = lemmatization(infile.read())
        # one list of lemma tokens per document
        tokenized_dataset.append(doc_content.split())

    word2vec_training(tokenized_dataset)