In [1]:
import spacy
import langdetect
import glob
nlp = spacy.load('en')
from nltk.tokenize import sent_tokenize
import pandas as pd
import numpy as np
import re
from gensim.models.phrases import Phrases,Phraser
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words('english')))
import random
from modules import utils

In [2]:
crawled = pd.read_csv("./data/crawl_data(7379 pages - employment-social-development).csv").drop_duplicates(subset=["text"], keep="first")
crawled["lang"] = crawled.text.apply(lambda x: langdetect.detect(str(x)))
crawled = crawled[crawled["lang"] == "en"].text;len(crawled)

2941

In [3]:
crawled.head()

0    Employment  and Social Development Canada (ESD...
1    The Government of Canada is prepared to suppor...
2    The Government of Canada launches funding oppo...
3    Minister Duclos releases report on nationwide ...
4    Hire a temporary worker through the Temporary ...
Name: text, dtype: object

In [4]:
corpus = crawled.str.cat(sep=" ")

In [5]:
corpus[:4000]

"Employment  and Social Development Canada (ESDC) works to improve the standard of living  and quality of life for all Canadians. We do this by promoting a labour force  that is highly skilled. We also promote an efficient and inclusive labour  market. The Government of Canada is ready to support workers and their families who are affected by the April 24, 2017, U.S. decision to impose duties on Canadian softwood lumber products. Tab 1: Enabling Accessibility Fund: mid-sized projects Tab 2: Help design the new Canada Service Corps program Tab 3: See what Canadians had to say about reducing poverty Help design the new Canada Service Corps program See what Canadians had to say about reducing poverty Benefits, Canada EI Commission, Wage Earners Protection Program, and economic regions. Payment dates for recurring Government of Canada benefit payments. Job opportunities, work permits, Social Insurance Number, criminal record checks and security clearances. Canada Pension Plan, Old Age Secu

In [9]:
corpus_clean = re.sub('[^a-zA-Z0-9\s\.]+', '', corpus).lower()
corpus_clean = re.sub("\s{2,}"," ",corpus_clean)

In [10]:
corpus_clean[:800]

'employment and social development canada esdc works to improve the standard of living and quality of life for all canadians. we do this by promoting a labour force that is highly skilled. we also promote an efficient and inclusive labour market. the government of canada is ready to support workers and their families who are affected by the april 24 2017 u.s. decision to impose duties on canadian softwood lumber products. tab 1 enabling accessibility fund midsized projects tab 2 help design the new canada service corps program tab 3 see what canadians had to say about reducing poverty help design the new canada service corps program see what canadians had to say about reducing poverty benefits canada ei commission wage earners protection program and economic regions. payment dates for recur'

In [11]:
sents = sent_tokenize(corpus_clean)
sents = list(set(sents))
sents = [re.sub('[^a-zA-Z0-9\s]+', '', sent) for sent in sents];len(sents)

148188

In [21]:
print(random.choice(sents))

canadians with disabilities often require disabilityspecific supports such as access to rehabilitation aids and devices


In [22]:
sents_stream = [sent.split() for sent in sents];sents_stream[1]

['start',
 'the',
 'conversations',
 'by',
 'sharing',
 'how',
 'you',
 'feel',
 'and',
 'why',
 'you',
 'think',
 'discussions',
 'are',
 'importantask',
 'how',
 'the',
 'person',
 'feels',
 'about',
 'his',
 'or',
 'her',
 'situation',
 'and',
 'the',
 'future',
 'for',
 'the',
 'first',
 'conversation',
 'set',
 'a',
 'time',
 'and',
 'place',
 'that',
 'is',
 'quiet',
 'private',
 'and',
 'comfortable',
 'ask',
 'yourself',
 'if',
 'complications',
 'arise',
 'what',
 'would',
 'we',
 'require',
 'and',
 'what',
 'would',
 'we',
 'need',
 'to',
 'know',
 'know',
 'the',
 'persons',
 'values',
 'and',
 'wishes']

## Generate N-Grams From Co-Occuring Terms

"default" scoring: <i>from “Efficient Estimaton of Word Representations in Vector Space” by
Mikolov, et. al.: (count(worda followed by wordb) - min_count) * N / (count(worda) * count(wordb)) > threshold`, where N is the total vocabulary size.</i>


"npmi" scoring: <i>normalized pointwise mutual information, from “Normalized (Pointwise) Mutual
Information in Colocation Extraction” by Gerlof Bouma: ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / - ln(prop(worda followed by wordb) where prop(n) is the count of n / the count of everything in the entire corpus.</i>

In [23]:
def generate_n_gram_transformers(stream,n_gram = 3,scoring="default",min_count=5,threshold=10,common_terms=None):
    streams = [stream]    
    grams = [stream]
    for n in range(1,n_gram):
        gram = Phraser(Phrases(streams[-1],scoring=scoring,min_count=min_count,threshold=threshold,common_terms=common_terms))
        streams.append(list(gram[streams[-1]]))
        grams.append(gram)
        
    return grams
        

In [24]:
_,to_bigrams,to_trigrams,to_quadgrams = generate_n_gram_transformers(sents_stream,n_gram=4,
                                                   scoring="default",min_count=30,
                                                   threshold=10,common_terms=stopwords)

In [50]:
ex_sent_stream = "employment and social development canada service canada is serving canadians to have better lives".split()

In [51]:
to_bigrams[ex_sent_stream]

['employment_and_social',
 'development',
 'canada',
 'service_canada',
 'is',
 'serving',
 'canadians',
 'to',
 'have',
 'better',
 'lives']

In [52]:
to_trigrams[to_bigrams[ex_sent_stream]]

['employment_and_social_development',
 'canada',
 'service_canada',
 'is',
 'serving',
 'canadians',
 'to',
 'have',
 'better',
 'lives']

In [53]:
to_quadgrams[to_trigrams[to_bigrams[ex_sent_stream]]]

['employment_and_social_development_canada',
 'service_canada',
 'is',
 'serving',
 'canadians',
 'to',
 'have',
 'better',
 'lives']

In [49]:
quad_stream = list(to_quadgrams[to_trigrams[to_bigrams[sents_stream]]])
tri_stream = list(to_trigrams[to_bigrams[sents_stream]])
quad_sents = [' '.join(sent) for sent in quad_stream]
tri_sents = [' '.join(sent) for sent in tri_stream]

In [59]:
random.choice(quad_stream)

['in',
 'our',
 'consultations',
 'with',
 'small_businesses',
 'the',
 'commitment',
 'demonstrated',
 'by',
 'one',
 'is',
 'particularly',
 'inspiring']

## Word2Vec

In [60]:
from gensim.models.word2vec import Word2Vec

In [61]:
WORD2VEC_EMBEDDING_DIM = 50

In [73]:
model = Word2Vec(quad_stream, size=WORD2VEC_EMBEDDING_DIM, window=12, min_count=30, workers=4,iter=30)

In [81]:
model.wv.most_similar("easier")

[('easy', 0.7573837041854858),
 ('harder', 0.6652645468711853),
 ('difficult', 0.632043719291687),
 ('impossible', 0.5586131811141968),
 ('choices', 0.5579407215118408),
 ('hard', 0.5527390241622925),
 ('dont', 0.5335586667060852),
 ('successful_transition', 0.5261822938919067),
 ('better', 0.5187982320785522),
 ('every_effort', 0.5164369344711304)]

In [82]:
model.wv.save_word2vec_format("./data/word2vec_esdc.vec")

In [86]:
pretrained_embeddings = utils.load_embedding_model("/Users/WASSIMATHIMNI/data/embeddings/glove/glove.6B.100d.txt")

In [88]:
len(model.wv.vocab.keys()),len(pretrained_embeddings)

(10545, 400000)

## Sidecar Approach - Concatenate Corpus Specific Trained Embeddings to General Pretrained Embeddings 

In [89]:
PRETRAINED_DIM = 100

In [90]:
CONCAT_DIM = PRETRAINED_DIM+WORD2VEC_EMBEDDING_DIM
concat_embeddings = {}
for key,vec in pretrained_embeddings.items():
    if key in model.wv.vocab:
        concat_embeddings[key] = np.hstack((vec,model.wv[key]))
    else:
        concat_embeddings[key] = np.hstack((vec,np.zeros(WORD2VEC_EMBEDDING_DIM)))
for key in model.wv.vocab.keys():
    if key not in concat_embeddings:
        concat_embeddings[key] = np.hstack((np.zeros(PRETRAINED_DIM),model.wv[key]))

In [91]:
"hockey" in concat_embeddings,"service_canada" in concat_embeddings,

(True, True)

In [93]:
concat_embeddings["dog"]

array([ 0.30817  ,  0.30938  ,  0.52803  , -0.92543  , -0.73671  ,
        0.63475  ,  0.44197  ,  0.10262  , -0.09142  , -0.56607  ,
       -0.5327   ,  0.2013   ,  0.7704   , -0.13983  ,  0.13727  ,
        1.1128   ,  0.89301  , -0.17869  , -0.0019722,  0.57289  ,
        0.59479  ,  0.50428  , -0.28991  , -1.3491   ,  0.42756  ,
        1.2748   , -1.1613   , -0.41084  ,  0.042804 ,  0.54866  ,
        0.18897  ,  0.3759   ,  0.58035  ,  0.66975  ,  0.81156  ,
        0.93864  , -0.51005  , -0.070079 ,  0.82819  , -0.35346  ,
        0.21086  , -0.24412  , -0.16554  , -0.78358  , -0.48482  ,
        0.38968  , -0.86356  , -0.016391 ,  0.31984  , -0.49246  ,
       -0.069363 ,  0.018869 , -0.098286 ,  1.3126   , -0.12116  ,
       -1.2399   , -0.091429 ,  0.35294  ,  0.64645  ,  0.089642 ,
        0.70294  ,  1.1244   ,  0.38639  ,  0.52084  ,  0.98787  ,
        0.79952  , -0.34625  ,  0.14095  ,  0.80167  ,  0.20987  ,
       -0.86007  , -0.15308  ,  0.074523 ,  0.40816  ,  0.0192

In [94]:
from scipy.spatial import distance

In [132]:
def retrieve_closest_embeddings(word,embeddings,num_results=10):
    keys =  [key for key,_ in embeddings.items()]
    embs =  [emb for _,emb in embeddings.items()]
    

    distances = distance.cdist(embs,[embeddings[word]])
    
    closest_idx = sorted(range(len(distances)),key=lambda k : distances[k])
    
    results = [(keys[idx],distances[idx][0]) for idx in closest_idx[:num_results]]
    return results[1:]

In [133]:
word = "security"

On our trained model

In [134]:
model.wv.most_similar(word)

[('safeguarding', 0.6965300440788269),
 ('holdings', 0.6321510076522827),
 ('privacy_and_security', 0.6195051074028015),
 ('sensitive', 0.6101369261741638),
 ('privacy', 0.5911946296691895),
 ('retention', 0.5894044637680054),
 ('integrity', 0.5829138159751892),
 ('protocols', 0.5738561153411865),
 ('management', 0.5725846886634827),
 ('protection', 0.572011411190033)]

On the pretrained model

In [135]:
retrieve_closest_embeddings(word,pretrained_embeddings)

[('officials', 4.5959739965486675),
 ('military', 4.618754130871209),
 ('personnel', 4.637935129688064),
 ('enforcement', 4.697978849367469),
 ('civilian', 4.737160836506255),
 ('control', 4.789275243116566),
 ('special', 4.791420784860835),
 ('administration', 4.818851646703397),
 ('government', 4.873565346693941)]

on concatenated model

In [136]:
retrieve_closest_embeddings(word,concat_embeddings)

[('safeguarding', 14.322703175552572),
 ('privacy_and_security', 15.323617480407206),
 ('sensitive', 15.462598341415253),
 ('holdings', 15.501257884537067),
 ('safeguard', 15.570697252166168),
 ('social_insurance_register', 15.85231032336952),
 ('risk_management', 15.921606275469456),
 ('monitors', 16.0446182458727),
 ('stewardship', 16.081221737471804)]

# Save Model

In [115]:
with open("/Users/WASSIMATHIMNI/data/embeddings/glove/esdc_glove_150d.vec",'w',encoding="utf-8") as f:
    for key,emb in concat_embeddings.items():
        f.write('{} {}\n'.format(str(key),' '.join([str(num) for num in emb])))