In [1]:
import spacy
import langdetect
import glob
nlp = spacy.load('en')
from nltk.tokenize import sent_tokenize
import pandas as pd
import numpy as np
import re
from gensim.models.phrases import Phrases,Phraser
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words('english')))
import random

In [2]:
crawled = pd.read_csv("data/crawl_data(7379 pages - employment-social-development).csv").drop_duplicates(subset=["text"], keep="first")
crawled["lang"] = crawled.text.apply(lambda x: langdetect.detect(str(x)))
crawled = crawled[crawled["lang"] == "en"].text;len(crawled)

2941

In [3]:
crawled.tail()

7370    Status report on Transformational and Major Cr...
7373    Examples for independent workers or profession...
7374    User Fees, Regulatory Charges and External Fee...
7377    Consolidated Financial Statements (Unaudited) ...
7378    Annex to the Statement of management responsib...
Name: text, dtype: object

In [4]:
program_files = glob.glob('data/program_descriptions/*.csv')
programs = pd.concat([pd.read_csv(file) for file in program_files]).reset_index()[["Program","Description"]]
len(programs)

42

In [5]:
programs.tail()

Unnamed: 0,Program,Description
37,Canada Child Benefit,The Canada child benefit (CCB) is a tax-free m...
38,Early Learning and Child Care,Early learning and child care needs across the...
39,Indigenous Early Learning and Child Care,The Government of Canada believes that all Can...
40,Canadian poverty reduction,The Government of Canada is committed to devel...
41,Social Innovation and Social Finance,Social innovation is about developing new solu...


In [6]:
crawled_corpus = crawled.str.cat(sep=" ")
programs_corpus = programs.Description.str.cat(sep=" ")
corpus = crawled_corpus + programs_corpus

In [7]:
corpus[:4000]

"Employment  and Social Development Canada (ESDC) works to improve the standard of living  and quality of life for all Canadians. We do this by promoting a labour force  that is highly skilled. We also promote an efficient and inclusive labour  market. The Government of Canada is ready to support workers and their families who are affected by the April 24, 2017, U.S. decision to impose duties on Canadian softwood lumber products. Tab 1: Enabling Accessibility Fund: mid-sized projects Tab 2: Help design the new Canada Service Corps program Tab 3: See what Canadians had to say about reducing poverty Help design the new Canada Service Corps program See what Canadians had to say about reducing poverty Benefits, Canada EI Commission, Wage Earners Protection Program, and economic regions. Payment dates for recurring Government of Canada benefit payments. Job opportunities, work permits, Social Insurance Number, criminal record checks and security clearances. Canada Pension Plan, Old Age Secu

In [8]:
corpus_clean = re.sub('[^a-zA-Z0-9\s\.]+', '', corpus).lower()

In [9]:
corpus_clean[:800]

'employment  and social development canada esdc works to improve the standard of living  and quality of life for all canadians. we do this by promoting a labour force  that is highly skilled. we also promote an efficient and inclusive labour  market. the government of canada is ready to support workers and their families who are affected by the april 24 2017 u.s. decision to impose duties on canadian softwood lumber products. tab 1 enabling accessibility fund midsized projects tab 2 help design the new canada service corps program tab 3 see what canadians had to say about reducing poverty help design the new canada service corps program see what canadians had to say about reducing poverty benefits canada ei commission wage earners protection program and economic regions. payment dates for r'

In [10]:
sents = sent_tokenize(corpus_clean)
sents = list(set(sents))
sents = [re.sub('[^a-zA-Z0-9\s]+', '', sent) for sent in sents];len(sents)

152047

In [1438]:
sents[100]

'hardship includes but is not confined to circumstances of personal destitution emergency or disaster'

In [12]:
#sents_clean = [re.sub('[^a-zA-Z0-9\s]+', '', sent).lower() for sent in sents];sents_clean[0]

In [13]:
sents_stream = [sent.split() for sent in sents];sents_stream[1]

['its',
 'important',
 'to',
 'note',
 'that',
 'of',
 'the',
 'online',
 'survey',
 'respondents',
 '1005',
 'identified',
 'as',
 'female',
 'and',
 'only',
 '200',
 'identified',
 'as',
 'male']

"default" scoring: <i>from “Efficient Estimaton of Word Representations in Vector Space” by
Mikolov, et. al.: (count(worda followed by wordb) - min_count) * N / (count(worda) * count(wordb)) > threshold`, where N is the total vocabulary size.</i>


"npmi" scoring: <i>normalized pointwise mutual information, from “Normalized (Pointwise) Mutual
Information in Colocation Extraction” by Gerlof Bouma: ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / - ln(prop(worda followed by wordb) where prop(n) is the count of n / the count of everything in the entire corpus.</i>

In [14]:
def generate_n_gram_transformers(stream,n_gram = 3,scoring="default",min_count=5,threshold=10,common_terms=None):
    streams = [stream]    
    grams = [stream]
    for n in range(1,n_gram):
        gram = Phraser(Phrases(streams[-1],scoring=scoring,min_count=min_count,threshold=threshold,common_terms=common_terms))
        streams.append(list(gram[streams[-1]]))
        grams.append(gram)
        
    return grams
        

In [15]:
_,to_bigrams,to_trigrams,to_quadgrams = generate_n_gram_transformers(sents_stream,n_gram=4,
                                                   scoring="default",min_count=30,
                                                   threshold=10,common_terms=stopwords)

In [16]:
quad_stream = list(to_quadgrams[to_trigrams[to_bigrams[sents_stream]]])
tri_stream = list(to_trigrams[to_bigrams[sents_stream]])
quad_sents = [' '.join(sent) for sent in quad_stream]
tri_sents = [' '.join(sent) for sent in tri_stream]

In [197]:
quad_stream[random.randint(0,len(quad_stream))]

['call_centre',
 'accessibility',
 'is',
 'the',
 'primary',
 'frustration',
 'for',
 'canadians',
 'and',
 'needs',
 'to',
 'be',
 'improved']

## Word2Vec

In [26]:
from gensim.models.word2vec import Word2Vec

In [27]:
model = Word2Vec(quad_stream, size=100, window=10, min_count=20, workers=4)

In [1133]:
model.wv.most_similar("provinces")

[('jurisdictions', 0.7630330920219421),
 ('regions', 0.7117319107055664),
 ('atlantic_provinces', 0.6717230081558228),
 ('countries', 0.6685596108436584),
 ('declines', 0.6668083071708679),
 ('territories', 0.6506251692771912),
 ('largest', 0.6280949115753174),
 ('western_provinces', 0.6234663724899292),
 ('sectors', 0.6096717119216919),
 ('industries', 0.6068037152290344)]

In [126]:
model.save("models/word2vec_esdc.vec")