In [1]:
def get_document_text(raw_text):
    """ This function takes in raw document text as input which we receive from the API and returns a clean text 
    of the associated document. It cleans up any HTML code in the text, newline characters, and extracts supplemental
    information part of the document.
    
    INPUT: string
    OUTPUT: string
    """
    raw_text = raw_text.replace('\n',' ')
    raw_text = raw_text.replace('*','') # added
    raw_text = raw_text.replace('\r',' ') # added
    supp_info_idx = raw_text.find("SUPPLEMENTARY INFORMATION:")
    summary_idx = raw_text.find("SUMMARY:")
    dates_idx = supp_info_idx = raw_text.find("DATES:")
    suppl_info = raw_text[supp_info_idx+26:] # To leave out the string 'Supplementary Information'
    summary = raw_text[summary_idx:dates_idx]
    # Remove any residual HTML tags in text
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', suppl_info)
    cleansummary = re.sub(cleanr, '', summary)
    return cleantext, cleansummary

def get_keywords(clean_corpus):
    """ This function takes in a clean corpus as input and extracts most important keywords and top 10% of relevant 
    sentences from the text.
    
    INPUT: string
    OUTPUT: List of tuples: [(list_of_keywords,list_of_sentences)]
    """
    
    tagged_tokens = tag_my_text(tokenize_text_sent(clean_corpus))
    grand_list = get_top_np(noun_phrase_finder(tagged_tokens))
    
    return grand_list

def tokenize_text(corpus):
    pattern = r'''(?x)    # set flag to allow verbose regexps
    (([A-Z]\.)+)       # abbreviations, e.g. B.C.
    |(\w+([-']\w+)*)       # words with optional internal hyphens e.g. after-ages or author's
    '''
    tokens = nltk.regexp_tokenize(corpus,pattern)
    all_token = [word.lower() for token in tokens for word in token if word != "" 
                 and word[0] != "'" and word[0] != "-"]
    return all_token

def tokenize_text_sent(corpus):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences    
    return [tokenize_text(sent) for sent in raw_sents]

def tag_my_text(sents):
    return [nltk.pos_tag(sent) for sent in sents]

#Chunk noun phrases in tree 
def noun_phrase_chunker():
    grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
    """
    cp = nltk.RegexpParser(grammar)
    return cp

#Extract only the NP marked phrases from the parse tree, that is the chunk we defined
def noun_phrase_extractor(sentences, chunker):
    res = []
    for sent in sentences:
        tree = chunker.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'NP' : 
                res.append(subtree[0:len(subtree)])
                #res.append(subtree[0])
                #print(subtree)
    return res

def noun_phrase_finder(tagged_text):
    all_proper_noun = noun_phrase_extractor(tagged_text,noun_phrase_chunker()) 
    #does not literally mean proper noun. Chunker only extracts common noun
    noun_phrase_list = []                                                      
    #noun_phrase_string_list =[]
    for noun_phrase in all_proper_noun:
        if len(noun_phrase) > 0: #this means where the size of the phrase is greater than 1
            small_list =[]
            for (word,tag) in noun_phrase:
                small_list.append(word)
            noun_phrase_list.append(small_list)
            #noun_phrase_string_list.append(' '.join(small_list))
    return noun_phrase_list

#get frequency dist of different length in all the noun phrases extracted. 
#Something of the form {1:45,2:23} - how many 1phrased and 2 phrased chunks I have etc.
def get_length_np(nounPhrase):
    np_length={}
    for inner_np in nounPhrase:
        np_length[len(inner_np)] = np_length.get(len(inner_np),0) + 1
    return np_length

#get freq dist obj for noun phrase of different lengths
def find_freq(nested_list,nest_len):
    #from nltk.probability import FreqDist
    fdist_list =[]
    for inner_np in nested_list:
        if len(inner_np) == nest_len:
            fdist_list.append(' '.join(inner_np))
    fdist = FreqDist(fdist_list)
    return fdist

#make a grand list of top occuring noun phrases of different sizes
def get_top_np(np):
    master_common_list=[]
    len_list =get_length_np(np).keys()
    for item in len_list:
        fdist_np = find_freq(np,item)
        top = fdist_np.most_common(15) 
        top_list = []
        for w,c in top:
            if c >= 1: # changed to 1 from 10
#                 print (w)
                top_list.append((w,c))
                #top.remove((w,c))
        if len(top_list) > 0:
            master_common_list.append(top_list)
    return master_common_list

In [3]:
def get_top_sents(corpus,keywords_list):
    sentence_list = get_sentences(corpus)
    indexed_sents = sentence_indexing(sentence_list) # This is so that we can re-order most relevant sentences later
    
    sentence_length_scores = get_sentence_lengths(sentence_list)
    keyphrase_scores = get_keyphrase_scores(corpus,sentence_list, keywords_list)
    
    sent_scores = [s+c for s,c in zip(sentence_length_scores,keyphrase_scores)]
    idx_sent_scores = [(s,c) for s,c in zip(indexed_sents,sent_scores)]
    sorted_sents = sorted(idx_sent_scores,key=lambda sent: sent[1],reverse=True)
    
    # Keep top 10% of the sentences, or top 10 whichever is less
    top_10 = int(len(sorted_sents) * 0.1)
    if top_10 > 10:
        top_10 = 10
    x = sorted_sents[:top_10]
    top_list = [item[0] for item in x]
    sorted_top_list = sorted(top_list,key=lambda sent:sent[1],reverse=False)
    sorted_top_list = [sent[0] for sent in sorted_top_list]
    
    return sorted_top_list
    
def get_sentences(corpus):
    # First, tokenize the corpus into sentences
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus)
    
    return raw_sents

def sentence_indexing(sent_list):
    indexed_sents = []
    for idx,sent in enumerate(sent_list):
        indexed_sents.append((sent,idx))
    return indexed_sents    

def get_sentence_lengths(sent_list):
    sent_length = []
    for s in sent_list:
        sent_length.append(len(s.split(' ')))
    
    return sent_length

def get_keyphrase_scores(corpus,sent_list, keywords):
    #keywords = get_keywords(corpus) # This gives us a list containing unigrams at index 0 and bigrams at index 1,etc
    
    unigrams = [item[0] for item in keywords[0]]
    bigrams = [item[0] for item in keywords[1]]

    unigram_scores = get_unigram_scores(unigrams,sent_list)
    bigram_scores = get_bigram_scores(bigrams,sent_list)

    sent_feature_import = [a+b for a,b in zip(unigram_scores,bigram_scores)]
    
    return sent_feature_import

def get_unigram_scores(unigram_list,sent_list):
    occurence_list = []
    for s in sent_list:
        words = s.split(' ')
        occurence_count = 0
        for w in words:
            if w.lower() in unigram_list or w.lower() in ['complaint','concern','documented','evidence','warn']:
                occurence_count += 1
        occurence_list.append(occurence_count)
        
    return occurence_list

def get_bigram_scores(bigram_list,sent_list):
    occurence_list = []
    for s in sent_list:
        # create bigrams
        token=nltk.word_tokenize(s)
        bigram_phrases = ngrams(token,2)
        occurence_count = 0
        for w in bigram_phrases:
            w = [word.lower() for word in w]
            if ' '.join(w) in bigram_list:
                occurence_count += 1
        occurence_list.append(occurence_count)
        
    return occurence_list

In [4]:
def extract_summary(text):
    clean_text, clean_summary = get_document_text(text)
    keywords = get_keywords(clean_text)
    top = get_top_sents(clean_text,keywords)
    
    return keywords[0],top, clean_summary

In [5]:
# Imports
from pickle import dump, load
import nltk
from nltk import word_tokenize,FreqDist
import re
from nltk.corpus import wordnet as wn
from nltk.util import ngrams

In [6]:
doc_list =load(open("data/Master2_doc_content",'rb'))
len(doc_list)

6

In [7]:
# Start working on one document and associated comments
document = doc_list[0]
document.keys()

dict_keys(['comment_list', 'text'])

In [8]:
# Convert bs4 ResultSet to a list of strings
comments = []
for c in document['comment_list']:
    c = c.replace('\n',' ')
    comments.append(str(c))

In [9]:
len(comments)

1290

In [10]:
# Modified from Brandon Rose:
def tokenize_text(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

def stem_text(text):
    tokens = tokenize_text(text)
    stems = [stemmer.stem(t) for t in tokens]
    return stems

In [11]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

Switched from stemming to just tokenizing due to sentiment analysis needs.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(tokenizer=tokenize_text,
                            stop_words='english',
                            ngram_range=(1,3),
                            min_df=0.2, max_df=0.8,
                            max_features=200000)

In [13]:
tfidf_matrix = tfidf_vec.fit_transform(comments)

### Latent Semantic Analysis

In [14]:
# Adapated from http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#sphx-glr-auto-examples-text-document-clustering-py
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

num_components = 20
svd = TruncatedSVD(num_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

lsa_tfidf_matrix = lsa.fit_transform(tfidf_matrix)

In [15]:
from sklearn.cluster import KMeans

num_clusters = 12
km = KMeans(n_clusters=num_clusters)
km.fit(lsa_tfidf_matrix)
clusters = km.labels_.tolist()

In [16]:
from sklearn.metrics.pairwise import paired_distances

cluster_center_list = []
for c in clusters:
    cluster_center_list.append(km.cluster_centers_[c])

center_distances = paired_distances(lsa_tfidf_matrix, cluster_center_list)

In [17]:
import pandas as pd
comment_clusters = {'comment': comments, 'cluster': clusters, 'dist': center_distances}
comment_frame = pd.DataFrame(comment_clusters, index = [clusters] , columns = ['comment', 'cluster', 'dist'])

In [18]:
comment_frame['cluster'].value_counts()

1     296
0     130
4     126
5     107
3     107
7      97
2      85
8      83
11     76
6      72
10     62
9      49
Name: cluster, dtype: int64

In [19]:
print(comment_frame[comment_frame.cluster==3].max())
print()
print(comment_frame[comment_frame.cluster==3].min())
print()
print(comment_frame[comment_frame.cluster==5].min())
print()
print(comment_frame[comment_frame.cluster==3])

comment    u PublicHealth Pre.ent. Promale. ProŠt. Health...
cluster                                                    3
dist                                                0.850071
dtype: object

comment                                          January ...
cluster                                                    3
dist                                                0.368073
dtype: object

comment                                                  ...
cluster                                                    5
dist                                                0.345464
dtype: object

                                              comment  cluster      dist
3   I strongly encourage all public housing agency...        3  0.460829
3   RE: Docket number is FR 5597-P-02.  Title "Ins...        3  0.585358
3   Thank you for considering than ban of smoking ...        3  0.678159
3   As an employee of a local hospital and a coord...        3  0.472842
3   As an apartment dweller and nonsmoker

Not going to summarize long comments as, while some comments are very long, understanding the comment as a whole is necessary for officials to take it into account when assessing the relevance to the regulation.

In [20]:
print('Most Central Comments by Cluster\n')
for i in range(num_clusters):
    print('Cluster {}\n'.format(i))
    if len(comment_frame[comment_frame.cluster==i].min().comment) < 2000:
        print(comment_frame[comment_frame.cluster==i].min().comment)
    else:
        print('Long Comment, Extracted Summary Below:')
        print(extract_summary(comment_frame[comment_frame.cluster==i].min().comment))
    print()
    print()

Most Central Comments by Cluster

Cluster 0

Long Comment, Extracted Summary Below:
([('smoking', 8), ('http', 7), ('tobacco', 6), ('smoke', 5), ('research', 4), ('exposure', 4), ('health', 4), ('support', 3), ('disease', 3), ('housing', 3), ('prevention', 3), ('cessation', 3), ('asthma', 3), ('directorate', 3), ('control', 2)], ["t, NE         Washington, DC 20002 - 4242       (202) 336 - 5500           Web: www.apa.org   (202) 336 - 6123 TDD     January 19, 2016     The Honorable Julián Castro   Secretary   Department of Housing and Urban Development   451 Seventh Street SW   Washington, DC 20410     Dear Secretary Castro:     The American Psychological Association (APA), the largest association of psychologists in the  U.S., works  to benefit society and improve people's lives through research, education, practice,  and advocacy.", 'We have a l ong - standing commitment   to supporting programs and policies to  improve the health and well - being of vulnerable and underserved popula

In [21]:
comment_frame[comment_frame.cluster==4].min().comment

'                                https://www.facebook.com/groups/WP4HE/                           January 15, 2016       Office of the Assistant Secretary for  Public and Indian Housing   U.S. Department of Housing and Urban Development     RE: Docket Number FR 5597 - P - 02   Instituting Smoke - Free Public Housing     To Whom  It   May Concern:     On behalf of the Westlawn Partnership for a Healthier Environment, I am pleased to pr ovide a  letter of support for the proposed rule to require all Public Housing Authorities to implement  smoke - free policies (HUD Docket No. FR 5597 - P - 02).     The Westlawn Partnership for a Healthier Environment  is a group of community stakeholders  that   has met on a regular basis since 2008 to identify, prioritize, and address environmental  health concerns in the Westlawn community.  Located in northwest Milwaukee, Westlawn is   - subsidized housing development.     Milwauke e County ranks as the worst county in the State of Wisconsin for asth

In [22]:
svd.components_

array([[  1.51909149e-01,   1.01101019e-01,   6.84215862e-02,
          1.41145757e-01,   2.38781606e-01,   4.08733202e-01,
          1.63644264e-01,   9.60627460e-02,   1.47230567e-01,
          1.46792224e-01,   1.32681379e-01,   1.03908592e-01,
          2.91388865e-01,   2.27132528e-01,   1.57364111e-01,
          2.13082346e-01,   1.12697773e-01,   1.01453446e-01,
          4.16420438e-01,   1.17878488e-01,   1.65654929e-01,
          1.11557806e-01,   3.23779989e-01,   9.03155163e-02,
          1.67889410e-01,   8.22990130e-02,   7.47613766e-02],
       [  1.29595712e-01,   2.21272121e-02,  -8.34943412e-02,
          1.04901620e-01,  -1.45060460e-02,  -2.78433951e-01,
         -7.00143683e-02,   1.11508454e-01,   2.77097917e-01,
         -6.96803612e-02,  -2.22224711e-01,  -2.04731570e-01,
         -2.44734923e-01,  -2.20270119e-01,  -6.86751186e-02,
         -2.77975573e-01,   1.38966311e-02,   1.75285978e-02,
          5.54107074e-01,   1.04979433e-01,  -2.94224809e-01,
       

### K Means

In [23]:
from sklearn.cluster import KMeans

num_clusters = 12
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [24]:
from sklearn.metrics.pairwise import paired_distances

cluster_center_list = []
for c in clusters:
    cluster_center_list.append(km.cluster_centers_[c])

center_distances = paired_distances(tfidf_matrix, cluster_center_list)

In [25]:
import pandas as pd
comment_clusters = {'comment': comments, 'cluster': clusters, 'dist': center_distances}
comment_frame = pd.DataFrame(comment_clusters, index = [clusters] , columns = ['comment', 'cluster', 'dist'])

In [26]:
comment_frame['cluster'].value_counts()

1     297
2     141
6     132
4     125
3     102
5      94
8      91
9      88
10     85
11     69
7      41
0      25
Name: cluster, dtype: int64

In [27]:
print(comment_frame[comment_frame.cluster==3].max())
print()
print(comment_frame[comment_frame.cluster==3].min())
print()
print(comment_frame[comment_frame.cluster==5].min())
print()
print(comment_frame[comment_frame.cluster==3])

comment    We would like to see this rule passed
cluster                                        3
dist                                    0.921081
dtype: object

comment            January  13, 201 6  The Honorable Juli...
cluster                                                    3
dist                                                0.434169
dtype: object

comment                                          January ...
cluster                                                    5
dist                                                0.385933
dtype: object

                                              comment  cluster      dist
3   This comment addresses the possibility of limi...        3  0.628716
3   The Cook County Department of Public Health of...        3  0.805541
3   This is a Comment on the Department of Housing...        3  0.566029
3   The U.S. Department of Housing and Urban Devel...        3  0.472694
3   The Martin Housing Authority in Martin, SD is ...        3  0.543663
3   

In [28]:
print('Most Central Comments by Cluster\n')
for i in range(num_clusters):
    print('Cluster {}\n'.format(i))
    print(comment_frame[comment_frame.cluster==i].min().comment)
    print()

Most Central Comments by Cluster

Cluster 0

  January 19, 2016       The Honorable Julián Castro   Secretary   Department of Housing and Urban Development   451 Seventh Street SW   Washington, DC 20410   Re: Instituting Smoke - Free Public H ousing; Docket No. FR 5597 - P - 02     Dear Secretary Castro:     On behalf of the Advocacy Council of the American College of Allergy, Asthma and  Immunology, we are pleased to submit this letter of support for the  proposed rule published in  the  Federal Register   on November 17, 2015 (Docket No. FR - 5597 - P - 02) on instituti ng smoke - free public housing.       Immunology, represents the interests of over 6,000 allergists/immunologists and allied health  professionals.  Its members are at the   front line of caring for individuals with asthma which  disproportionately impacts those living in urban areas, especially children.       Our organization   support s   th e proposed new rule  as we believe it could significantly  improve the  he

In [29]:
# Modified from Brandon Rose and
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
def vocabulary_frame(text):
    tokens = tokenize_text(text)
    stems = stem_text(text)
    return pd.DataFrame({'words': tokens}, index = stems).drop_duplicates()

def extended_vocabulary_frame(texts):
    frames = []
    for t in texts:
        vf = vocabulary_frame(t)
        frames.append(vf)
    extended = pd.concat(frames).drop_duplicates()
    return extended

def km_print_top_words(model, num_clusters, vocab_frame, feature_names, n_top_words):    
    print("Top terms per cluster:\n")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1] 
    for i in range(num_clusters):
        print("Cluster %d Words:" % i, end=' ')
        print(', '.join(vocab_frame.ix[feature_names[ind].split(' ')].values.tolist()[0][0]
                        for ind in order_centroids[i, :n_top_words]))
        print()

In [30]:
feature_names = tfidf_vec.get_feature_names()
comment_vf = extended_vocabulary_frame(comments)
km_print_top_words(km, num_clusters, comment_vf, feature_names, 6)

Top terms per cluster:

Cluster 0 Words: 

TypeError: sequence item 2: expected str instance, float found

### LDA

In [21]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation()
lda.fit(tfidf_matrix)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=10, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [22]:
# Modified from:
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
def lda_print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

In [23]:
lda_print_top_words(lda, feature_names, 20)

Topic 0:
right, 's, smoke-fre, children, smoke, attach, make, health, build, peopl, tobacco, hous, propos rule, secondhand, unit, rule, cigarett, becaus, public, public hous

Topic 1:
smoke, hous, comment, ban, propos, hud, depart, public, rule, allow, use, public hous, propos rule, pleas, free, provid, develop, unit, health, institut

Topic 2:
smoke, health, hous, public, tobacco, secondhand, secondhand smoke, public hous, rule, polici, protect, includ, resid, children, cigarett, use, smoke-fre, hud, propos, live

Topic 3:
pleas, ban, smoke, hous, ani, public hous, smoker, public, make, health, secondhand, rule, unit, smoke-fre, mani, live, resid, polici, apart, provid

Topic 4:
smoke, live, peopl, smoker, apart, right, hous, like, 's, ban, home, build, public, year, becaus, cigarett, rule, health, make, allow

Topic 5:
implement, public, hous, smoke-fre, smoke, public hous, health, peopl, tobacco, support, polici, ban, includ, 's, mani, protect, year, live, make, becaus

Topic 6:
smo