# Introduction

This notebook trains a new word embedding on the Firefox dataset (bug reports, test cases, and features) as an extension of the available GloVe-based word embedding, which was trained on the Common Crawl dataset.

# Load Libraries


In [8]:
from mod_finder_util import mod_finder_util
mod_finder_util.add_modules_origin_search_path()

import pandas as pd
import numpy as np

from gensim.models import Word2Vec

from modules.utils import aux_functions
from modules.utils import firefox_dataset_p2 as fd
from modules.utils import tokenizers as tok

# Load the three corpora used to train the embedding. Per the saved output,
# each reader reports the shape of the frame it loaded.
bugreports = fd.OrigDatasets.read_orig_bugreports_df()  # original bug reports; `br_desc` is used below
testcases = fd.Datasets.read_testcases_df()             # test cases; `tc_desc` is used below
features = fd.Datasets.read_features_df()               # features; `feat_desc` is used below

OrigBugReports.shape: (35336, 18)
TestCases.shape: (195, 12)
Features.shape: (19, 8)


# Load Sentences

In [3]:
# Tokenize every description into a list of tokens; gensim's Word2Vec consumes
# the corpus as a list of token lists (one inner list per document).
# Presumably the tokenizer applies Porter stemming (per its class name) - TODO confirm.
tokenizer = tok.PorterStemmerBased_Tokenizer()

# Corpus order is preserved: bug reports first, then test cases, then features.
corpus_columns = (bugreports.br_desc, testcases.tc_desc, features.feat_desc)

# Invoke the tokenizer as a callable instead of calling `__call__` explicitly,
# and build the list in a single pass rather than three copy-pasted loops.
all_sentences = [tokenizer(doc) for column in corpus_columns for doc in column]

# Training Model with Gensim Word2Vec

In [4]:
# Training hyperparameters, gathered in one place so they are easy to tune.
# NOTE: these are the gensim 3.x keyword names; gensim 4 renamed
# `size` -> `vector_size` and `iter` -> `epochs`.
w2v_params = dict(
    min_count=3,   # discard tokens occurring fewer than 3 times
    size=300,      # dimensionality of the word vectors
    workers=2,     # number of worker threads used for training
    window=5,      # maximum distance between a word and its context words
    iter=30,       # number of passes (epochs) over the corpus
)

# Passing the corpus to the constructor builds the vocabulary and trains the model.
model = Word2Vec(all_sentences, **w2v_params)

In [5]:
# Number of distinct tokens kept in the trained vocabulary
# (tokens below the `min_count` threshold were dropped during training).
len(model.wv.vocab)

10678

In [6]:
# Sanity check: nearest neighbours of a query token by cosine similarity.
# Query through the KeyedVectors (`model.wv`): `most_similar` on the Word2Vec
# model itself is deprecated in gensim 3.x (it emits a DeprecationWarning) and
# was removed in gensim 4.
# 'awesom' is presumably the stemmed form of "awesome", matching the tokenized corpus.
model.wv.most_similar('awesom')

  """Entry point for launching an IPython kernel.


[('locat', 0.5706654787063599),
 ('address', 0.47330430150032043),
 ('search', 0.46685847640037537),
 ('adress', 0.45979756116867065),
 ('url', 0.4409191906452179),
 ('awesomebar', 0.40817999839782715),
 ('nav', 0.4074421525001526),
 ('titl', 0.3626036047935486),
 ('unifi', 0.3446365296840668),
 ('hite', 0.34205934405326843)]

# Save Trained Model

In [17]:
# Destination of the exported vectors, relative to the notebook's working directory.
# The same path (minus the '.txt' suffix) is reused below to locate the spaCy model.
cust_model_path = '../data/mozilla_firefox_v2/firefoxDataset/wv_embeddings/cust_wv_model.txt'
# Export only the word vectors (not the full trainable model) in word2vec format;
# with gensim's default `binary=False` this is a plain-text file.
model.wv.save_word2vec_format(cust_model_path)

# Test Loading Model with SpaCy

In [19]:
import spacy

# The spaCy model location is the exported vectors path without its '.txt' suffix.
# NOTE(review): `spacy.load` needs an already-built spaCy model at this location, and
# no cell in this notebook creates it - presumably it was generated outside the
# notebook from the exported vectors file. Confirm and document that step.
spacy_model_dir = cust_model_path.replace('.txt', '')
nlp = spacy.load(spacy_model_dir)

In [21]:
# Number of entries in the loaded spaCy vocabulary.
# NOTE(review): the saved outputs show 11592 here versus 10678 tokens in the gensim
# vocabulary - presumably spaCy's vocab holds entries beyond the imported vectors; verify.
nlp.vocab.length

11592