In [None]:
import spacy
import langdetect
import glob
nlp = spacy.load('en')
from nltk.tokenize import sent_tokenize
import pandas as pd
import numpy as np
import re
from gensim.models.phrases import Phrases,Phraser
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words('english')))
import random
from modules import utils

In [None]:
crawled = pd.read_csv("./data/crawl_data(7379 pages - employment-social-development).csv").drop_duplicates(subset=["text"], keep="first")
crawled["lang"] = crawled.text.apply(lambda x: langdetect.detect(str(x)))
crawled = crawled[crawled["lang"] == "en"].text;len(crawled)

In [None]:
crawled.head()

In [None]:
corpus = crawled.str.cat(sep=" ")

In [None]:
corpus[:4000]

In [None]:
corpus_clean = re.sub('[^a-zA-Z0-9\s\.]+', '', corpus).lower()
corpus_clean = re.sub("\s{2,}"," ",corpus_clean)

In [None]:
corpus_clean[:800]

In [None]:
sents = sent_tokenize(corpus_clean)
sents = list(set(sents))
sents = [re.sub('[^a-zA-Z0-9\s]+', '', sent) for sent in sents];len(sents)

In [None]:
print(random.choice(sents))

In [None]:
sents_stream = [sent.split() for sent in sents];sents_stream[1]

## Generate N-Grams From Co-Occuring Terms

"default" scoring: <i>from “Efficient Estimaton of Word Representations in Vector Space” by
Mikolov, et. al.: (count(worda followed by wordb) - min_count) * N / (count(worda) * count(wordb)) > threshold`, where N is the total vocabulary size.</i>


"npmi" scoring: <i>normalized pointwise mutual information, from “Normalized (Pointwise) Mutual
Information in Colocation Extraction” by Gerlof Bouma: ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / - ln(prop(worda followed by wordb) where prop(n) is the count of n / the count of everything in the entire corpus.</i>

In [None]:
def generate_n_gram_transformers(stream,n_gram = 3,scoring="default",min_count=5,threshold=10,common_terms=None):
    streams = [stream]    
    grams = [stream]
    for n in range(1,n_gram):
        gram = Phraser(Phrases(streams[-1],scoring=scoring,min_count=min_count,threshold=threshold,common_terms=common_terms))
        streams.append(list(gram[streams[-1]]))
        grams.append(gram)
        
    return grams
        

In [None]:
_,to_bigrams,to_trigrams,to_quadgrams = generate_n_gram_transformers(sents_stream,n_gram=4,
                                                   scoring="default",min_count=30,
                                                   threshold=10,common_terms=stopwords)

In [None]:
ex_sent_stream = "employment and social development canada service canada is serving canadians to have better lives".split()

In [None]:
to_bigrams[ex_sent_stream]

In [None]:
to_trigrams[to_bigrams[ex_sent_stream]]

In [None]:
to_quadgrams[to_trigrams[to_bigrams[ex_sent_stream]]]

In [None]:
quad_stream = list(to_quadgrams[to_trigrams[to_bigrams[sents_stream]]])
tri_stream = list(to_trigrams[to_bigrams[sents_stream]])
quad_sents = [' '.join(sent) for sent in quad_stream]
tri_sents = [' '.join(sent) for sent in tri_stream]

In [None]:
random.choice(quad_stream)

## Word2Vec

In [None]:
from gensim.models.word2vec import Word2Vec

In [None]:
WORD2VEC_EMBEDDING_DIM = 50

In [None]:
model = Word2Vec(quad_stream, size=WORD2VEC_EMBEDDING_DIM, window=12, min_count=30, workers=4,iter=30)

In [None]:
model.wv.most_similar("requirement")

In [82]:
model.wv.save_word2vec_format("./data/word2vec_esdc.vec")

In [86]:
pretrained_embeddings = utils.load_embedding_model("/Users/WASSIMATHIMNI/data/embeddings/glove/glove.6B.100d.txt")

In [88]:
len(model.wv.vocab.keys()),len(pretrained_embeddings)

(10545, 400000)

## Sidecar Approach - Concatenate Corpus Specific Trained Embeddings to General Pretrained Embeddings 

In [89]:
PRETRAINED_DIM = 100

In [90]:
CONCAT_DIM = PRETRAINED_DIM+WORD2VEC_EMBEDDING_DIM
concat_embeddings = {}
for key,vec in pretrained_embeddings.items():
    if key in model.wv.vocab:
        concat_embeddings[key] = np.hstack((vec,model.wv[key]))
    else:
        concat_embeddings[key] = np.hstack((vec,np.zeros(WORD2VEC_EMBEDDING_DIM)))
for key in model.wv.vocab.keys():
    if key not in concat_embeddings:
        concat_embeddings[key] = np.hstack((np.zeros(PRETRAINED_DIM),model.wv[key]))

In [91]:
"hockey" in concat_embeddings,"service_canada" in concat_embeddings,

(True, True)

In [93]:
concat_embeddings["dog"]

array([ 0.30817  ,  0.30938  ,  0.52803  , -0.92543  , -0.73671  ,
        0.63475  ,  0.44197  ,  0.10262  , -0.09142  , -0.56607  ,
       -0.5327   ,  0.2013   ,  0.7704   , -0.13983  ,  0.13727  ,
        1.1128   ,  0.89301  , -0.17869  , -0.0019722,  0.57289  ,
        0.59479  ,  0.50428  , -0.28991  , -1.3491   ,  0.42756  ,
        1.2748   , -1.1613   , -0.41084  ,  0.042804 ,  0.54866  ,
        0.18897  ,  0.3759   ,  0.58035  ,  0.66975  ,  0.81156  ,
        0.93864  , -0.51005  , -0.070079 ,  0.82819  , -0.35346  ,
        0.21086  , -0.24412  , -0.16554  , -0.78358  , -0.48482  ,
        0.38968  , -0.86356  , -0.016391 ,  0.31984  , -0.49246  ,
       -0.069363 ,  0.018869 , -0.098286 ,  1.3126   , -0.12116  ,
       -1.2399   , -0.091429 ,  0.35294  ,  0.64645  ,  0.089642 ,
        0.70294  ,  1.1244   ,  0.38639  ,  0.52084  ,  0.98787  ,
        0.79952  , -0.34625  ,  0.14095  ,  0.80167  ,  0.20987  ,
       -0.86007  , -0.15308  ,  0.074523 ,  0.40816  ,  0.0192

In [94]:
from scipy.spatial import distance

In [132]:
def retrieve_closest_embeddings(word,embeddings,num_results=10):
    keys =  [key for key,_ in embeddings.items()]
    embs =  [emb for _,emb in embeddings.items()]
    

    distances = distance.cdist(embs,[embeddings[word]])
    
    closest_idx = sorted(range(len(distances)),key=lambda k : distances[k])
    
    results = [(keys[idx],distances[idx][0]) for idx in closest_idx[:num_results]]
    return results[1:]

In [133]:
word = "security"

On our trained model

In [134]:
model.wv.most_similar(word)

[('safeguarding', 0.6965300440788269),
 ('holdings', 0.6321510076522827),
 ('privacy_and_security', 0.6195051074028015),
 ('sensitive', 0.6101369261741638),
 ('privacy', 0.5911946296691895),
 ('retention', 0.5894044637680054),
 ('integrity', 0.5829138159751892),
 ('protocols', 0.5738561153411865),
 ('management', 0.5725846886634827),
 ('protection', 0.572011411190033)]

On the pretrained model

In [135]:
retrieve_closest_embeddings(word,pretrained_embeddings)

[('officials', 4.5959739965486675),
 ('military', 4.618754130871209),
 ('personnel', 4.637935129688064),
 ('enforcement', 4.697978849367469),
 ('civilian', 4.737160836506255),
 ('control', 4.789275243116566),
 ('special', 4.791420784860835),
 ('administration', 4.818851646703397),
 ('government', 4.873565346693941)]

on concatenated model

In [136]:
retrieve_closest_embeddings(word,concat_embeddings)

[('safeguarding', 14.322703175552572),
 ('privacy_and_security', 15.323617480407206),
 ('sensitive', 15.462598341415253),
 ('holdings', 15.501257884537067),
 ('safeguard', 15.570697252166168),
 ('social_insurance_register', 15.85231032336952),
 ('risk_management', 15.921606275469456),
 ('monitors', 16.0446182458727),
 ('stewardship', 16.081221737471804)]

# Save Model

In [115]:
with open("/Users/WASSIMATHIMNI/data/embeddings/glove/esdc_glove_150d.vec",'w',encoding="utf-8") as f:
    for key,emb in concat_embeddings.items():
        f.write('{} {}\n'.format(str(key),' '.join([str(num) for num in emb])))