In [1]:
import spacy
import langdetect
import glob
nlp = spacy.load('en')
from nltk.tokenize import sent_tokenize
import pandas as pd
import numpy as np
import re
from gensim.models.phrases import Phrases,Phraser
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words('english')))
import random

"default" scoring: <i>from “Efficient Estimaton of Word Representations in Vector Space” by
Mikolov, et al.: (count(worda followed by wordb) - min_count) * N / (count(worda) * count(wordb)) > threshold, where N is the total vocabulary size.</i>


"npmi" scoring: <i>normalized pointwise mutual information, from “Normalized (Pointwise) Mutual
Information in Collocation Extraction” by Gerlof Bouma: ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / -ln(prop(worda followed by wordb)), where prop(n) is the count of n / the count of everything in the entire corpus.</i>

In [14]:
def generate_n_gram_transformers(stream, n_gram=3, scoring="default", min_count=5,
                                 threshold=10, common_terms=None):
    """Train a chain of gensim phrase models, one per n-gram order above unigrams.

    Every model is fitted on the corpus produced by the previous model, so the
    k-th model merges tokens that have already been through models 1..k-1.

    Args:
        stream: tokenized corpus (an iterable of token lists). It is passed to
            ``Phrases`` and then transformed again, so it is consumed more than
            once.
        n_gram: highest n-gram order to build; ``n_gram - 1`` models are trained.
        scoring, min_count, threshold, common_terms: forwarded unchanged to
            ``Phrases``.

    Returns:
        list: ``stream`` itself as the first element, followed by the trained
        ``Phraser`` objects in training order.
    """
    transformers = [stream]
    current_corpus = stream
    for _ in range(n_gram - 1):
        phrase_model = Phrases(
            current_corpus,
            scoring=scoring,
            min_count=min_count,
            threshold=threshold,
            common_terms=common_terms,
        )
        phraser = Phraser(phrase_model)
        # Materialize the transformed corpus: it is the training input of the next model.
        current_corpus = list(phraser[current_corpus])
        transformers.append(phraser)

    return transformers
        

In [15]:
# Train the bigram, trigram and quadgram phrase models on the sentence stream.
# The first returned element is the input stream itself, hence the throwaway `_`.
_, to_bigrams, to_trigrams, to_quadgrams = generate_n_gram_transformers(
    sents_stream,
    n_gram=4,
    scoring="default",
    min_count=30,
    threshold=10,
    common_terms=stopwords,
)

In [16]:
# Apply the phrase models in sequence: bigrams -> trigrams -> quadgrams.
# The trigram corpus is materialized once and reused as the quadgram input,
# instead of running the bigram and trigram passes a second time.
tri_stream = list(to_trigrams[to_bigrams[sents_stream]])
quad_stream = list(to_quadgrams[tri_stream])

# Re-join the tokens of every sentence into one whitespace-separated string.
tri_sents = [' '.join(sent) for sent in tri_stream]
quad_sents = [' '.join(sent) for sent in quad_stream]

In [197]:
# Show one random sentence. random.randint(0, len(...)) includes the upper bound
# and could index one past the end; random.choice always picks a valid element.
random.choice(quad_stream)

['call_centre',
 'accessibility',
 'is',
 'the',
 'primary',
 'frustration',
 'for',
 'canadians',
 'and',
 'needs',
 'to',
 'be',
 'improved']

## Extract Key Words Using Part of Speech Tagging and Dependency Parsing

In [1655]:
def extract_important_childrens(childrens):
    """Collect the texts of all NOUN / PROPN tokens in the given subtrees.

    Walks every token in ``childrens`` and, recursively, every token reachable
    through ``.children`` (depth first, parent before its descendants).

    Args:
        childrens: iterable of token-like objects exposing ``pos_``, ``text``
            and ``children``.

    Returns:
        list[str]: texts of the matching tokens in traversal order (duplicates
        kept); an empty list when ``childrens`` is empty.
    """
    collected = []
    for node in list(childrens):
        if node.pos_ == "NOUN" or node.pos_ == "PROPN":
            collected.append(node.text)
        # Descend regardless of the node's own tag: a skipped node may still
        # have noun descendants.
        collected.extend(extract_important_childrens(node.children))
    return collected
    
               
def extract_important_words(sent):
    """Return the key words of a sentence using POS tags and the dependency parse.

    The sentence is parsed with the module-level ``nlp`` pipeline. A token is
    kept when its POS tag is NOUN, PROPN or ADJ and its dependency label marks
    it as a subject (nsubj, nsubjpass), an object (obj, dobj, iobj, pobj) or
    the ROOT. For every kept token, the NOUN / PROPN tokens found anywhere in
    its subtree are added too (via ``extract_important_childrens``).

    Args:
        sent: sentence text to analyse.

    Returns:
        list[str]: unique token texts. The order is deterministic: subject
        words first, then object words, then root words, each in order of
        first appearance. (The previous ``list(set(...))`` produced an order
        that could change between interpreter runs.)
    """
    pos_constraints = ("NOUN", "PROPN", "ADJ")
    subject_deps = ("nsubj", "nsubjpass")
    object_deps = ("obj", "dobj", "iobj", "pobj")
    root_deps = ("ROOT",)

    subjects, objects, roots = [], [], []

    for token in nlp(sent):
        if token.pos_ not in pos_constraints:
            continue

        if token.dep_ in subject_deps:
            bucket = subjects
        elif token.dep_ in object_deps:
            bucket = objects
        elif token.dep_ in root_deps:
            bucket = roots
        else:
            continue

        bucket.append(token.text)
        bucket.extend(extract_important_childrens(token.children))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(subjects + objects + roots))

def important_words_to_vec(sent):
    """Embed a sentence as the mean vector of its important words.

    The words come from ``extract_important_words``; if that returns nothing,
    every token of the sentence (as produced by ``nlp``) is used instead.
    Words that are not in ``model`` are ignored.

    NOTE(review): ``model`` is not defined in the visible part of the notebook;
    presumably a word-embedding model with 100-dimensional vectors, matching
    the hard-coded size below - confirm.

    Args:
        sent: sentence text.

    Returns:
        numpy.ndarray: average of the vectors of the known words, or a zero
        vector of length 100 when none of the words is in ``model``.
    """
    candidate_words = extract_important_words(sent)
    if len(candidate_words) == 0:
        # Fallback: no important words found, so vectorize all the words.
        candidate_words = [token.text for token in nlp(sent)]

    accumulated = np.zeros(100)
    known_count = 0
    for word in candidate_words:
        if word not in model:
            continue
        known_count += 1
        accumulated = accumulated + model[word]

    if known_count > 0:
        accumulated = accumulated / known_count
    return accumulated

def print_tokens_info(sent):
    """Print text, POS tag, dependency label and head of every token in ``sent``.

    The sentence is parsed with the module-level ``nlp`` pipeline; one line is
    printed per token.
    """
    for tok in nlp(sent):
        print(tok.text, tok.pos_, tok.dep_, tok.head)
        


In [1688]:
# Pick a random sentence to inspect. random.randrange excludes the upper bound,
# whereas random.randint(0, len(...)) could return len(...) and raise IndexError.
idx = random.randrange(len(quad_sents))
statement = quad_sents[idx]
statement

'lowincome individuals may qualify for additional benefits such as the s are collected to confirm the amount of canada_pension_plan benefits date_of_birth marital_status and date of death'

In [1689]:
# Key words extracted from the sampled statement.
extract_important_words(statement)

['s', 'canada_pension_plan', 'benefits', 'individuals', 'amount', 'death']

In [1657]:
# Per-token POS tag, dependency label and head for the sampled statement.
print_tokens_info(statement)

a DET det tool
selfserve NOUN compound tool
tool NOUN ROOT tool
that ADJ nsubj gives
gives VERB relcl tool
you PRON dative gives
a DET det list
customized ADJ amod list
list NOUN dobj gives
of ADP prep list
federal_and_provincial ADJ amod programs_and_services
or CCONJ cc federal_and_provincial
territorial ADJ conj federal_and_provincial
programs_and_services NOUN pobj of
for ADP prep be
which ADJ pobj for
you PRON nsubj be
may VERB aux be
be VERB relcl programs_and_services
eligible ADJ acomp be


In [1284]:
# Mean embedding of the statement's important words.
important_words_to_vec(statement)

array([-0.42334065, -0.39303927, -1.68439957,  0.75216232,  0.801535  ,
       -0.08767149, -0.37712129,  0.34931072, -0.30723678,  0.00643667,
       -0.83301665,  0.79181074,  0.54243128, -1.0890391 , -0.77291706,
        0.15547569,  0.47596643,  0.8113825 , -0.18547491,  0.91664824,
        0.89538675, -0.17433961, -0.30484614, -0.60799384, -0.42346805,
       -0.39190262, -0.5280691 , -0.17095338,  0.17808194,  0.30573152,
       -0.01844273,  0.12201561,  1.16041741, -0.26933178, -0.49526895,
       -1.91150582,  0.92996629, -0.09747603, -0.1720103 ,  0.37293532,
        0.72227428, -0.78694876, -0.08724567,  0.33885501, -0.41784123,
       -0.16138673, -0.16985127, -1.25440089,  1.01665209,  1.91380348,
        1.36372462,  0.1645832 ,  0.11790471,  1.12049412, -0.59503786,
        0.02672683,  0.30838578,  0.45770224, -0.55125607, -0.56056036,
        0.18201169, -1.49064491, -0.55765053, -0.60381844,  0.0585424 ,
        0.229584  ,  1.3475775 , -0.56016631, -0.03973048,  0.23

## MeanShift+LexRank Authors ALEXxWASSIM to be published

In [1579]:
text = """ESDC delivers a range of programs and services that affect Canadians throughout their lives. The Department provides seniors with basic income security, supports unemployed workers, helps students finance their post-secondary education and assists parents who are raising young children. The Labour Program contributes to social and economic well-being by fostering safe, healthy, fair and inclusive work environments and cooperative workplace relations in the federal jurisdiction. Service Canada helps citizens access ESDC's programs, as well as other Government of Canada programs and services.
In particular, the Department is responsible for delivering over $120 billion in benefits directly to individuals and organizations through such Government of Canada programs and services as Employment Insurance, Old Age Security, the Canada Pension Plan and the Canada Student Loans Program. The Department also provides $1.8 billion in funding to other orders of government, educators and organizations in the voluntary and private sectors.
To fulfill its mission, the Department is responsible for:
developing policies that ensure all can use their talents, skills and resources to participate in learning, work and their community;
delivering programs that help Canadians move through life's transitions, from school to work, from one job to another, from unemployment to employment, from the workforce to retirement;
providing income support to seniors, families with children and Employment Insurance beneficiaries;
fostering inclusive growth by providing opportunity and assistance to Canadians with distinct needs, such as Indigenous people, people with disabilities, homeless people and recent immigrants;
overseeing labour relations, occupational health and safety, labour standards, employment equity and workers' compensation in the federal jurisdiction; and
delivering programs and services on behalf of other departments and agencies, such as passport services delivered on behalf of Immigration, Refugees and Citizenship Canada and services to veterans delivered on behalf of Veterans Affairs Canada.
ESDC assisted millions of Canadians in 2015-2016
There were 78.5 million visits to the Service Canada website.
Over 2 million calls were answered by 1 800 O-Canada agents.
There were 8.7 million in-person visits to Service Canada Centres.
4.6 million passports were issued.
2.95 million applications were processed for Employment Insurance (initial and renewal); 690,000 for the Canada Pension Plan; 775,000 for Old Age Security.
24.7 million payments were issued for Employment Insurance (initial and renewal); 64.4 million for the Canada Pension Plan; 68.5 million for Old Age Security.
18.6 million Employment Insurance enquiries and 3.3 million enquiries related to the Canada Pension Plan and Old Age Security were resolved in the Interactive Voice Response system.
Service Canada Call Centre agents answered 3.4 million Employment Insurance calls, 2.5 million Canada Pension Plan and Old Age Security calls and 500,000 calls related to employer services.
640,000 full-time post-secondary students received federal student financial assistance, which includes students who received a Canada Student Loan, a Canada Student Grant and/or those who benefited from an in-study interest subsidy.
$3.27 billion was withdrawn from Registered Education Savings Plans for 395,027 students to help fund their post-secondary education.
94 percent of labour disputes in federally regulated workplaces were settled without a work stoppage as part of the collective bargaining process.
98.9 percent of initial Wage Earner Protection Program payments and non-payment notifications were issued within the 42-day service standard.
Included in these core roles are responsibilities for the design and delivery of some well-known Government of Canada programs and services:
Old Age Security;
the Canada Pension Plan;
Employment Insurance;
the Canada Student Loans and Grants and Canada Apprentice Loans Program;
the Canada Education Savings Program;
the Wage Earner Protection Program; and
passport services.
Service standards
For 2017-2018, the following are our key service commitments:
80% of EI benefit payments or non-payment notifications issued within 28 days of filing
90% of OAS basic benefits paid within the first month of entitlement
90% of CPP retirement benefits paid within the first month of entitlement
80% of CPP Disability initial application decisions made within 120 calendar days of receipt of a completed application
80% of EI, CPP, OAS and Employer Contact Centre calls answered by an agent within 10 minutes
95% payment accuracy for EI, CPP and OAS
90% of grants and contributions proposals are acknowledged within 21 calendar days of receiving an application package
90% of contribution payments are processed within 28 calendar days of receiving a completed claim package
90% of first installment grant payments processed no later than 15 calendar days after the approved project start date
90% of passports issued on time
Direct benefits to Canadians are part of Canada's social safety net and represent 95 percent of the Department's expenditures.
Through the Labour Program, the Department contributes to the well-being of working Canadians by providing labour relations mediation services, enforcing minimum working conditions, promoting decent work and fostering respect for international labour standards.
Through Service Canada, the Department helps Canadians access departmental programs as well as other Government of Canada programs and services at 589 in-person points of service across the country (555 Service Canada points of service, 2 consolidated offices with a Passport office and 32 stand-alone Passport offices). In addition to in-person services, the organization also serves the needs of Canadians online at Canada.ca, through My Service Canada Account and by telephone through 1 800 O-Canada and its network of call centres."""

In [1580]:
# Split the raw text into sentences, drop exact duplicates, then normalize.
# Every step now reads `documents`; the earlier version read an undefined
# `sents`, so the sent_tokenize result was thrown away.
documents = sent_tokenize(text)
# dict.fromkeys removes duplicates while keeping the original sentence order.
documents = list(dict.fromkeys(documents))
# Keep only letters, digits and whitespace, then lowercase. The raw string
# avoids the invalid "\s" escape in a normal string literal.
documents = [re.sub(r'[^a-zA-Z0-9\s]+', '', sent).lower() for sent in documents]
len(documents)

24

In [1581]:
# One vector per cleaned sentence, in the same order as `documents`.
document_vecs = [important_words_to_vec(sent) for sent in documents]

## From Scratch Approach

def gaussian(d,bw):
    """Gaussian kernel value for distance ``d`` and bandwidth ``bw``.

    Computes ``exp(-0.5 * (d / bw)**2) / (bw * sqrt(2 * pi))``. The
    normalizing constant divides the exponential; it used to sit inside the
    exponent because of a misplaced parenthesis, which distorted the kernel
    width.

    Args:
        d: distance(s), a scalar or a NumPy array.
        bw: kernel bandwidth (standard deviation); must be non-zero.

    Returns:
        Kernel value(s) with the same shape as ``d``.
    """
    return np.exp(-0.5 * (d / bw) ** 2) / (bw * np.sqrt(2 * np.pi))

# (bw == kernel width) should cover 1/3 of your data
def meanshift(data,bw=1,iterations = 5):
    """Shift every point towards the Gaussian-weighted mean of all points.

    Args:
        data: 2-D array-like of shape (n_points, n_features).
        bw: kernel bandwidth passed to ``gaussian``.
        iterations: number of full passes over the points.

    Returns:
        numpy.ndarray: float array with the same shape as ``data`` holding the
        shifted points. ``data`` itself is not modified.
    """
    # Work on a float copy: an integer input would otherwise truncate every update.
    X = np.array(data, dtype=float)
    for _ in range(iterations):
        for i, x in enumerate(X):
            # Euclidean distance from x to every point: sum the squared
            # differences first, then take the root. (Taking the root before
            # summing gives the L1 distance instead.)
            dist = np.sqrt(((x - X) ** 2).sum(axis=1))
            weight = gaussian(dist, bw)
            # Points are updated in place, so later points in the same pass
            # already see the shifted positions of earlier ones.
            X[i] = (np.expand_dims(weight, 1) * X).sum(axis=0) / weight.sum()
    return X

def get_unique_vecs(x):
    """Return the distinct vectors of ``x``, keeping the first occurrence of each.

    Two vectors count as equal when their element tuples are equal.

    Args:
        x: iterable of vectors (each convertible with ``tuple``).

    Returns:
        list: the original vector objects, in order of first appearance.
    """
    first_by_key = {}
    for vec in x:
        # setdefault stores a vector only the first time its tuple is seen.
        first_by_key.setdefault(tuple(vec), vec)
    return list(first_by_key.values())


# NOTE(review): estimate_bandwidth is only imported in the "Sklearn Approach"
# cell further down, so on a fresh kernel this section fails unless that
# import cell has been run first.
bandwidth = estimate_bandwidth(document_vecs, quantile=0.33)

# Run the from-scratch mean shift, then count the distinct points that remain.
mean_shifted = meanshift(document_vecs,bw=bandwidth,iterations=15);
len(get_unique_vecs(mean_shifted))

## Sklearn Approach

In [1582]:
from sklearn.cluster import MeanShift,estimate_bandwidth
import math
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from gensim.summarization.summarizer import summarize

In [1583]:
def group_documents_by_similarity(docs,bandwidth_quantile=0.3):
    """Cluster document vectors with MeanShift and group them by cluster label.

    Args:
        docs: sequence of equal-length document vectors (anything
            ``np.asarray`` turns into a 2-D array).
        bandwidth_quantile: quantile forwarded to ``estimate_bandwidth``.

    Returns:
        tuple ``(labels_to_ids, docs_grouped)``:
        1. ``labels_to_ids``: list indexed by cluster label; each entry is the
           set of positions in ``docs`` assigned to that label.
        2. ``docs_grouped``: list indexed by cluster label; each entry is an
           array holding the vectors of that cluster.

        When the estimated bandwidth is 0 the documents cannot be separated:
        a message is printed and everything is returned as a single group.
    """
    doc_matrix = np.asarray(docs)

    bandwidth = estimate_bandwidth(doc_matrix, quantile=bandwidth_quantile)
    if bandwidth == 0:
        print("Not enough documents to separate them.")
        # MeanShift rejects a zero bandwidth, so fall back to a single cluster
        # instead of failing right after the message.
        return [set(range(len(doc_matrix)))], [doc_matrix]

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
    ms.fit(doc_matrix)
    labels = ms.labels_
    num_centroids = len(ms.cluster_centers_)

    labels_to_ids = [set() for _ in range(num_centroids)]
    for doc_id, label in enumerate(labels):
        labels_to_ids[label].add(doc_id)

    # Boolean-mask the matrix per label. Packing (vector, label) pairs into one
    # array creates a ragged array, which current NumPy versions refuse to build.
    docs_grouped = [doc_matrix[labels == label] for label in range(num_centroids)]
    return labels_to_ids, docs_grouped

In [1590]:
# Cluster the sentence vectors; the displayed value is the number of clusters found.
label_to_ids,docs_grouped = group_documents_by_similarity(document_vecs,bandwidth_quantile=0.3);len(label_to_ids)

5

In [1591]:
# One text per cluster: join the cluster's sentences with ". " and close with ".".
# sorted(ids) keeps the sentences in their original document order rather than
# relying on set iteration order.
doc_parts = [
    ". ".join(documents[doc_id] for doc_id in sorted(ids)) + "."
    for ids in label_to_ids
]

In [1592]:
# Build one bullet point per cluster: the cluster text itself when it has a
# single sentence, otherwise the top sentence returned by gensim's summarizer.
bullet_points = []
# The loop variable is `part`, not `text`, so the source document stored in
# `text` is not overwritten and earlier cells stay re-runnable.
for part in doc_parts:
    num_sentences = len(sent_tokenize(part))
    if num_sentences <= 1:
        bullet_points.append(part)
        continue

    # Ratio that targets roughly one sentence, truncated to two decimals.
    summarize_1_sent_ratio = int(100*float(1/num_sentences))/100
    # Summarize once and reuse the result (the summarizer used to run twice).
    lead_sents = summarize(part, ratio=summarize_1_sent_ratio, split=True)
    if len(lead_sents) > 0:
        bullet_points.append(lead_sents[0])
    # A cluster for which the summarizer returns nothing contributes no bullet.


for point in bullet_points:
    print(point+"\n")

in particular the department is responsible for delivering over 120 billion in benefits directly to individuals and organizations through such government of canada programs and services as employment insurance old age security the canada pension plan and the canada student loans program.

the department provides seniors with basic income security supports unemployed workers helps students finance their postsecondary education and assists parents who are raising young children.

over 2 million calls were answered by 1 800 ocanada agents.

327 billion was withdrawn from registered education savings plans for 395027 students to help fund their postsecondary education.

640000 fulltime postsecondary students received federal student financial assistance which includes students who received a canada student loan a canada student grant andor those who benefited from an instudy interest subsidy.

