In [None]:
# default_exp repr.word2vec.train

# Training Word2Vec and Doc2Vec for SE

> This module contains everything required to train Word2Vec and Doc2Vec models on software engineering (SE) data
>
> Author: @danaderp April 2020

In [None]:
# export
# Imports
import numpy as np
import pandas as pd

from pathlib import Path

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

unable to import 'smart_open.gcs', disabling that module


### ToDo
Enhance the training and testing with [keyedvectors](https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.most_similar)

In [None]:
#hide
#!pip install -U gensim
#!pip install -U scikit-learn
!pip install plotly==4.6.0

Collecting plotly==4.6.0
  Downloading plotly-4.6.0-py2.py3-none-any.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 3.4 MB/s eta 0:00:01
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... [?25ldone
[?25h  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=9530 sha256=694bfbbc95b898eb1f9a10e79ba60e53f5ebb8fa2b905127c7d725d3668a53ca
  Stored in directory: /root/.cache/pip/wheels/ac/cb/8a/b27bf6323e2f4c462dcbf77d70b7c5e7868a7fbe12871770cf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.6.0 retrying-1.3.3


In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# Build a file path for the Word2Vec model through gensim's get_tmpfile helper.
# NOTE(review): the Word2Vec cell further down saves to the literal
# "word2vec_libest.model" rather than to this variable — confirm which
# location is the intended one.
path = get_tmpfile("word2vec_libest.model")

In [None]:
# Load the preprocessed libest artifacts; column '0' holds the text that the
# following cell tokenises.
artifacts = pd.read_csv(
    filepath_or_buffer='/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-all].csv',
)

In [None]:
# Split every entry of column '0' on whitespace to get one token list per document.
texts = [document.split() for document in artifacts['0']]

In [None]:
# Display the tokenised corpus (a list of token lists).
# NOTE(review): this renders the entire corpus; a slice such as texts[:1]
# would keep the stored output small.
texts

[['unit',
  'test',
  'user',
  'stori',
  'server',
  'simpl',
  'enrol',
  'august',
  'copyright',
  'cisco',
  'system',
  'inc',
  'right',
  'reserv',
  'includ',
  'stdio',
  'ifndef',
  'win',
  'includ',
  'unistd',
  'endif',
  'includ',
  'est',
  'includ',
  'curl',
  'curl',
  'includ',
  'curl',
  'util',
  'includ',
  'test',
  'util',
  'includ',
  'server',
  'includ',
  'openssl',
  'ssl',
  'ifdef',
  'cunit',
  'includ',
  'cunit',
  'basic',
  'includ',
  'cunit',
  'autom',
  'endif',
  'ifndef',
  'win',
  'static',
  'char',
  'test',
  'outfil',
  'filenam',
  'max',
  'test',
  'hdr',
  'defin',
  'cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'explicit_cert',
  'us903',
  'cert',
  'pem',
  'defin',
  'us903_explicit_key',
  'us903',
  'key',
  'pem',
  'defin',
  'us903_cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'us903_trusted_cert',
  'trustedcert',
  'crt',
  'defin',
  'est',
  'privat',
  'estservercertandkey',
  'pem',
  'els',
  'static'

In [None]:
# Show gensim's bundled toy corpus (imported from gensim.test.utils) as a
# reference for the list-of-token-lists shape; the training cell below uses
# `texts`, not this corpus.
common_texts 

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [None]:
# Fit Word2Vec embeddings on the tokenised libest corpus.
model = Word2Vec(
    sentences=texts,
    size=100,      # dimensionality of the word vectors
    window=5,      # context window size
    min_count=1,   # keep every word, even those seen only once
    workers=4,     # number of training threads
)
# Persist the trained model next to the notebook (relative path).
model.save("word2vec_libest.model")

# Training Doc2Vec for SE

Le and Mikolov introduced the Doc2Vec algorithm in 2014; it usually outperforms simple averaging of Word2Vec vectors [(link)](https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py).

PV-DM is analogous to Word2Vec CBOW. The doc-vectors are obtained by training a neural network on the synthetic task of predicting a center word based on an average of both context word-vectors and the full document’s doc-vector.

PV-DBOW is analogous to Word2Vec SG. The doc-vectors are obtained by training a neural network on the synthetic task of predicting a target word just from the full document’s doc-vector. (It is also common to combine this with skip-gram testing, using both the doc-vector and nearby word-vectors to predict a single target word, but only one at a time.)

In [None]:
import math as m
import gensim
import collections
import random as r

In [None]:
import logging

# Emit INFO-level records (timestamp : level : message) so that gensim's
# progress messages appear in the cell outputs.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
# Build a file path for the Doc2Vec model through gensim's get_tmpfile helper.
# NOTE(review): the final cell saves to the relative literal
# "test_data/models/doc2vec_libest.model" rather than to this variable —
# confirm which location is the intended one.
path = get_tmpfile("test_data/models/doc2vec_libest.model")

In [None]:
# Reload the preprocessed libest artifacts for the Doc2Vec section; column '0'
# holds the text that the following cell tokenises.
artifacts = pd.read_csv(
    filepath_or_buffer='/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-all].csv',
)

In [None]:
# Split every entry of column '0' on whitespace to get one token list per
# document, then display the result.
texts = [document.split() for document in artifacts['0']]
texts

[['unit',
  'test',
  'user',
  'stori',
  'server',
  'simpl',
  'enrol',
  'august',
  'copyright',
  'cisco',
  'system',
  'inc',
  'right',
  'reserv',
  'includ',
  'stdio',
  'ifndef',
  'win',
  'includ',
  'unistd',
  'endif',
  'includ',
  'est',
  'includ',
  'curl',
  'curl',
  'includ',
  'curl',
  'util',
  'includ',
  'test',
  'util',
  'includ',
  'server',
  'includ',
  'openssl',
  'ssl',
  'ifdef',
  'cunit',
  'includ',
  'cunit',
  'basic',
  'includ',
  'cunit',
  'autom',
  'endif',
  'ifndef',
  'win',
  'static',
  'char',
  'test',
  'outfil',
  'filenam',
  'max',
  'test',
  'hdr',
  'defin',
  'cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'explicit_cert',
  'us903',
  'cert',
  'pem',
  'defin',
  'us903_explicit_key',
  'us903',
  'key',
  'pem',
  'defin',
  'us903_cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'us903_trusted_cert',
  'trustedcert',
  'crt',
  'defin',
  'est',
  'privat',
  'estservercertandkey',
  'pem',
  'els',
  'static'

In [None]:
# Size of the held-out split: 10% of the documents, rounded down.
s = int(len(texts) * 0.1)

In [None]:
# Hold out the first `s` documents as the test split. These stay plain token
# lists (no TaggedDocument wrapper); slicing already yields a new list, so the
# former enumerate()/index round-trip was unnecessary.
test_corpus = texts[:s]

In [None]:
# Peek at the first held-out document to check the split.
test_corpus[:1]

[['unit',
  'test',
  'user',
  'stori',
  'server',
  'simpl',
  'enrol',
  'august',
  'copyright',
  'cisco',
  'system',
  'inc',
  'right',
  'reserv',
  'includ',
  'stdio',
  'ifndef',
  'win',
  'includ',
  'unistd',
  'endif',
  'includ',
  'est',
  'includ',
  'curl',
  'curl',
  'includ',
  'curl',
  'util',
  'includ',
  'test',
  'util',
  'includ',
  'server',
  'includ',
  'openssl',
  'ssl',
  'ifdef',
  'cunit',
  'includ',
  'cunit',
  'basic',
  'includ',
  'cunit',
  'autom',
  'endif',
  'ifndef',
  'win',
  'static',
  'char',
  'test',
  'outfil',
  'filenam',
  'max',
  'test',
  'hdr',
  'defin',
  'cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'explicit_cert',
  'us903',
  'cert',
  'pem',
  'defin',
  'us903_explicit_key',
  'us903',
  'key',
  'pem',
  'defin',
  'us903_cacert',
  'est',
  'cacert',
  'crt',
  'defin',
  'us903_trusted_cert',
  'trustedcert',
  'crt',
  'defin',
  'est',
  'privat',
  'estservercertandkey',
  'pem',
  'els',
  'static'

In [None]:
# Wrap every remaining document in a TaggedDocument whose single tag is the
# document's position within the training split (starting at 0).
train_corpus = [
    gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(texts[s:])
]

In [None]:
# Peek at the first tagged training document to check the wrapping.
train_corpus[:1]

[TaggedDocument(words=['unit', 'test', 'user', 'stori', 'proxi', 'cacert', 'novemb', 'copyright', 'cisco', 'system', 'inc', 'right', 'reserv', 'proxi', 'mode', 'primarili', 'server', 'mode', 'process', 'certain', 'request', 'client', 'pass', 'long', 'upstream', 'server', 'use', 'client', 'mode', 'function', 'case', 'get', 'cacert', 'proxi', 'mode', 'function', 'almost', 'ident', 'server', 'mode', 'cert', 'respons', 'chain', 'pass', 'sent', 'repli', 'get', 'cacert', 'request', 'downstream', 'client', 'test', 'code', 'taken', 'larg', 'server', 'get', 'cacert', 'includ', 'stdio', 'ifndef', 'win', 'includ', 'unistd', 'endif', 'includ', 'est', 'includ', 'curl', 'curl', 'includ', 'curl', 'util', 'includ', 'test', 'util', 'includ', 'openssl', 'ssl', 'includ', 'server', 'includ', 'proxi', 'ifdef', 'cunit', 'includ', 'cunit', 'basic', 'includ', 'cunit', 'autom', 'endif', 'defin', 'pkcs', 'req', 'miichj', 'ccaw4caqaw', 'qtel', 'mcmga1ueax', 'mccm', 'igj5ignsa', 'wvud', 'cbpbi', 'zw1v', 'ihn0', '

In [None]:
# Configure the Doc2Vec model; no data is passed here, so the vocabulary is
# built and training is run explicitly in the following cells.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,  # dimensionality of the document vectors
    min_count=2,     # infrequent words are ignored (fewer than 2 occurrences)
    epochs=50,       # number of passes over the training corpus
)



In [None]:
# Scan the training corpus once to collect word counts and document tags;
# words below the model's min_count are dropped at this stage.
model.build_vocab(train_corpus) #Building the vocabulary

2020-05-02 02:19:47,684 : INFO : collecting all words and their counts
2020-05-02 02:19:47,686 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-05-02 02:19:47,702 : INFO : collected 6359 word types and 79 unique tags from a corpus of 79 examples and 71978 words
2020-05-02 02:19:47,703 : INFO : Loading a fresh vocabulary
2020-05-02 02:19:47,712 : INFO : effective_min_count=2 retains 4160 unique words (65% of original 6359, drops 2199)
2020-05-02 02:19:47,714 : INFO : effective_min_count=2 leaves 69779 word corpus (96% of original 71978, drops 2199)
2020-05-02 02:19:47,725 : INFO : deleting the raw counts dictionary of 6359 items
2020-05-02 02:19:47,727 : INFO : sample=0.001 downsamples 57 most-common words
2020-05-02 02:19:47,727 : INFO : downsampling leaves estimated 57528 word corpus (82.4% of prior 69779)
2020-05-02 02:19:47,736 : INFO : estimated required memory for 4160 words and 50 dimensions: 3759800 bytes
2020-05-02 02:19:47,737 : INFO : reset

In [None]:
# Inspect the vocabulary retained after building it.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; presumably it must
# become `wv.key_to_index` on gensim >= 4 — confirm the pinned version.
model.wv.vocab

{'unit': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00d68>,
 'test': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00828>,
 'user': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00c18>,
 'stori': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00f28>,
 'proxi': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00e80>,
 'cacert': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00b00>,
 'novemb': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00ba8>,
 'copyright': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00da0>,
 'cisco': <gensim.models.keyedvectors.Vocab at 0x7fd7feb007b8>,
 'system': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00dd8>,
 'inc': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00e48>,
 'right': <gensim.models.keyedvectors.Vocab at 0x7fd7feb009e8>,
 'reserv': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00f60>,
 'mode': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00fd0>,
 'server': <gensim.models.keyedvectors.Vocab at 0x7fd7feb00860>,
 'process': <gensim.models.keyedvecto

In [None]:
# Sanity check before training: number of documents counted by build_vocab
# and the configured number of epochs.
print(model.corpus_count,model.epochs)

79 50


In [None]:
# Training the model: reuse the document count gathered by build_vocab and
# the epoch count configured on the model itself.
model.train(
    train_corpus,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

2020-05-02 02:24:15,189 : INFO : training model with 3 workers on 4160 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-05-02 02:24:15,247 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-02 02:24:15,251 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-02 02:24:15,254 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-05-02 02:24:15,255 : INFO : EPOCH - 1 : training on 71978 raw words (57585 effective words) took 0.1s, 999982 effective words/s
2020-05-02 02:24:15,302 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-02 02:24:15,304 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-02 02:24:15,308 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-05-02 02:24:15,309 : INFO : EPOCH - 2 : training on 71978 raw words (57503 effective words) took 0.1s, 1131623 effective words/s
2020-05-02 02:24:15,356 : INFO : worker

2020-05-02 02:24:16,334 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-02 02:24:16,336 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-02 02:24:16,341 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-05-02 02:24:16,341 : INFO : EPOCH - 21 : training on 71978 raw words (57631 effective words) took 0.0s, 1209954 effective words/s
2020-05-02 02:24:16,384 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-02 02:24:16,387 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-02 02:24:16,391 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-05-02 02:24:16,392 : INFO : EPOCH - 22 : training on 71978 raw words (57575 effective words) took 0.0s, 1188105 effective words/s
2020-05-02 02:24:16,440 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-02 02:24:16,442 : INFO : worker thread finished; awaiting finish of 1 more threads
20

2020-05-02 02:24:17,431 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-02 02:24:17,432 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-05-02 02:24:17,433 : INFO : EPOCH - 41 : training on 71978 raw words (57602 effective words) took 0.1s, 1147310 effective words/s
2020-05-02 02:24:17,475 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-02 02:24:17,482 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-02 02:24:17,484 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-05-02 02:24:17,484 : INFO : EPOCH - 42 : training on 71978 raw words (57639 effective words) took 0.0s, 1169380 effective words/s
2020-05-02 02:24:17,526 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-05-02 02:24:17,530 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-05-02 02:24:17,536 : INFO : worker thread finished; awaiting finish of 0 more threads
20

In [None]:
# Infer a document vector for an unseen token list and display it.
# NOTE(review): these English tokens are presumably mostly absent from the
# libest vocabulary, so this is only a smoke test of infer_vector — confirm.
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
vector

[-3.8799790e-01 -1.0759743e-01 -8.5099328e-01 -1.6261047e-01
 -4.6120614e-01 -4.1150653e-01 -4.8968711e-01 -1.5730537e-01
 -6.8020940e-01  3.0621865e-01  5.4173124e-01  2.5272253e-01
 -5.0421804e-01  2.3174997e-01 -9.8899305e-03 -6.3132226e-01
  1.8082513e-01 -9.5351160e-02  3.0821612e-01 -1.1720957e-02
  5.0217199e-01 -3.4657633e-01 -3.1743407e-02 -4.2242369e-01
  2.9581904e-01 -3.9127624e-01 -3.9710104e-01 -2.1949653e-01
 -7.2293822e-04 -6.3064796e-01  2.5287136e-01 -2.2371596e-01
 -1.9622575e-02 -6.2112415e-01  2.9001772e-01 -1.8906260e-01
 -1.1596942e-01 -2.4680687e-01  6.5607691e-01  7.7220105e-02
 -4.2044306e-01  2.7088323e-01  2.2468549e-01 -4.5313707e-01
  6.5325224e-01  6.1260074e-01 -2.9224026e-01 -3.3492854e-01
 -4.5888122e-02  4.1145048e-01]


In [None]:
# Persist the trained Doc2Vec model to a path relative to the working directory.
# NOTE(review): presumably test_data/models/ must already exist; the `path`
# variable computed earlier in this section is not used here — confirm intent.
model.save("test_data/models/doc2vec_libest.model")

2020-05-04 19:54:03,940 : INFO : saving Doc2Vec object under test_data/models/doc2vec_libest.model, separately None
2020-05-04 19:54:03,971 : INFO : saved test_data/models/doc2vec_libest.model
