In [102]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import en_core_web_sm
import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [103]:
df = pd.read_csv('C:\\Users\\rewal\\Downloads\\NLPtask\\Cleaned_Text\\combined.txt')
df.shape

(96383, 1)

In [104]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

dtype: int64

CLEANING

We are lemmatizing and removing the stopwords and non-alphabetic characters for each line of dialogue

In [107]:
nlp = en_core_web_sm.load(disable=['ner', 'parser'])# disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [109]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df)

In [110]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.05 mins


In [111]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna()
df_clean.shape

(1, 1)

In [117]:
from gensim.models.phrases import Phrases, Phraser #bigram detection

In [118]:
sent = [row.split() for row in df_clean['clean']]

In [119]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

2020-04-21 17:27:00,069 : INFO : collecting all words and their counts
2020-04-21 17:27:00,070 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-04-21 17:27:00,142 : INFO : collected 16687 word types from a corpus of 23169 words (unigram + bigrams) and 1 sentences
2020-04-21 17:27:00,144 : INFO : using 16687 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>


In [121]:
bigram = Phraser(phrases)
sentences = bigram[sent] #Transform the corpus based on the bigrams detected

2020-04-21 17:28:24,860 : INFO : source_vocab length 16687
2020-04-21 17:28:25,237 : INFO : Phraser built with 17 phrasegrams


In [122]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

2177

In [123]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['display',
 'machine',
 'datum',
 'mtg',
 'work',
 'valve',
 'implement',
 'system',
 'application',
 'autotrac']

Time to Train the model

In [124]:
import multiprocessing
from gensim.models import Word2Vec

In [125]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer


The parameters:

    min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)

    window = int - The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the left of our target - (2, 10)

    size = int - Dimensionality of the feature vectors. - (50, 300)

    sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influencial. - (0, 1e-5)

    alpha = float - The initial learning rate - (0.01, 0.05)

    min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00

    negative = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drown. If set to 0, no negative sampling is used. - (5, 20)

    workers = int - Use these many worker threads to train the model (=faster training with multicore machines)



In [126]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

Building the Vocabulary Table:

Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):


In [127]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

2020-04-21 17:31:34,020 : INFO : collecting all words and their counts
2020-04-21 17:31:34,142 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-21 17:31:34,148 : INFO : collected 2177 word types from a corpus of 22199 raw words and 1 sentences
2020-04-21 17:31:34,149 : INFO : Loading a fresh vocabulary
2020-04-21 17:31:34,156 : INFO : effective_min_count=20 retains 273 unique words (12% of original 2177, drops 1904)
2020-04-21 17:31:34,157 : INFO : effective_min_count=20 leaves 14579 word corpus (65% of original 22199, drops 7620)
2020-04-21 17:31:34,163 : INFO : deleting the raw counts dictionary of 2177 items
2020-04-21 17:31:34,165 : INFO : sample=6e-05 downsamples 273 most-common words
2020-04-21 17:31:34,166 : INFO : downsampling leaves estimated 2011 word corpus (13.8% of prior 14579)
2020-04-21 17:31:34,172 : INFO : estimated required memory for 273 words and 300 dimensions: 791700 bytes
2020-04-21 17:31:34,173 : INFO : resetting layer weights


Time to build vocab: 0.0 mins



Training of the model:

Parameters of the training:

    total_examples = int - Count of sentences;
    epochs = int - Number of iterations (epochs) over the corpus - [10, 20, 30]



In [128]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

2020-04-21 17:32:23,310 : INFO : training model with 3 workers on 273 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
2020-04-21 17:32:23,460 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-21 17:32:23,464 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-21 17:32:23,491 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-21 17:32:23,493 : INFO : EPOCH - 1 : training on 22199 raw words (2082 effective words) took 0.2s, 13124 effective words/s
2020-04-21 17:32:23,608 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-21 17:32:23,610 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-21 17:32:23,628 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-21 17:32:23,630 : INFO : EPOCH - 2 : training on 22199 raw words (2015 effective words) took 0.1s, 17304 effective words/s
2020-04-21 17:32:23,741 : INFO : worker thr

2020-04-21 17:32:26,697 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-21 17:32:26,707 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-21 17:32:26,722 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-21 17:32:26,724 : INFO : EPOCH - 21 : training on 22199 raw words (2057 effective words) took 0.1s, 15103 effective words/s
2020-04-21 17:32:26,852 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-21 17:32:26,854 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-21 17:32:26,874 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-21 17:32:26,876 : INFO : EPOCH - 22 : training on 22199 raw words (2074 effective words) took 0.1s, 15975 effective words/s
2020-04-21 17:32:27,004 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-21 17:32:27,005 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-

Time to train the model: 0.08 mins


As we do not plan to train the model any further, we are calling init_sims(), which will make the model much more memory-efficient:

In [129]:
w2v_model.init_sims(replace=True)

2020-04-21 17:33:35,170 : INFO : precomputing L2-norms of word weight vectors


In [130]:
w2v_model.wv.most_similar(positive=["autotrac"])

[('turn_automation', 0.9999270439147949),
 ('guidance', 0.999924898147583),
 ('implement', 0.999916136264801),
 ('activation', 0.9999126195907593),
 ('profile', 0.9999101161956787),
 ('guidance_line', 0.9999090433120728),
 ('automation', 0.9999086260795593),
 ('application', 0.9999067783355713),
 ('select', 0.9999051690101624),
 ('tab', 0.9999046325683594)]