In [1]:
# Imports
from pickle import dump, load
import nltk
from nltk import word_tokenize,FreqDist
import re
from nltk.corpus import wordnet as wn
from nltk.util import ngrams
from sklearn.cluster import KMeans
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import paired_distances
import pandas as pd
import json

In [2]:
#cluster count
def get_document_text(raw_text):
    """ This function takes in raw document text as input which we receive from the API and returns a clean text 
    of the associated document. It cleans up any HTML code in the text, newline characters, and extracts supplemental
    information part of the document.
    
    INPUT: string
    OUTPUT: string
    """
    raw_text = raw_text.replace('\n',' ')
    raw_text = raw_text.replace('*','') # added
    raw_text = raw_text.replace('\r',' ') # added
    # Remove any residual HTML tags in text
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_text)
    return cleantext

def tokenize_text(corpus):
    pattern = r'''(?x)    # set flag to allow verbose regexps
    (([A-Z]\.)+)       # abbreviations, e.g. B.C.
    |(\w+([-']\w+)*)       # words with optional internal hyphens e.g. after-ages or author's
    '''
    tokens = nltk.regexp_tokenize(corpus,pattern)
    all_token = [word.lower() for token in tokens for word in token if word != "" and word[0] != "'" and word[0] != "-"]
    return all_token

def tokenize_text_sent(corpus):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences    
    return [tokenize_text(sent) for sent in raw_sents]

def tag_my_text(sents):
    return [nltk.pos_tag(sent) for sent in sents]

#Chunk noun phrases in tree 
def noun_phrase_chunker():
    grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
    """
    cp = nltk.RegexpParser(grammar)
    return cp

#Extract only the NP marked phrases from the parse tree, that is the chunk we defined
def noun_phrase_extractor(sentences, chunker):
    res = []
    for sent in sentences:
        tree = chunker.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == 'NP' : 
                res.append(subtree[0:len(subtree)])
                #res.append(subtree[0])
                #print(subtree)
    return res

#remove tags and get only the noun phrases , can be adjusted for length
def noun_phrase_finder(tagged_text):
    all_proper_noun = noun_phrase_extractor(tagged_text,noun_phrase_chunker()) 
    #does not literally mean proper noun. Chunker only extracts common noun
    noun_phrase_list = []                                                      
    #noun_phrase_string_list =[]
    for noun_phrase in all_proper_noun:
        if len(noun_phrase) > 0: #this means where the size of the phrase is greater than 1
            small_list =[]
            for (word,tag) in noun_phrase:
                small_list.append(word)
            noun_phrase_list.append(small_list)
            #noun_phrase_string_list.append(' '.join(small_list))
    return noun_phrase_list

#get freq dist obj for noun phrase of different lengths
def find_freq(nested_list,nest_len):
    #from nltk.probability import FreqDist
    fdist_list =[]
    for inner_np in nested_list:
        if len(inner_np) == nest_len:
            fdist_list.append(' '.join(inner_np))
    fdist = FreqDist(fdist_list)
    return fdist

def get_top_unigrams(np):
    unigrams = []
    for item in np:
        if len(item) ==  1:
            unigrams.append(item)
    fdist_uni = find_freq(np,1)
    uni_list = fdist_uni.most_common()
    threshold = 0.3 * len(unigrams)
    top = []
    s = 0
    for word,count in uni_list:
        top.append(word)
        s += count
        if s > threshold:
            break      
    return top

# Lesk algorith for disambiguation in case of multiple synsets of a word
def compare_overlaps_greedy(context, synsets_signatures, pos=None):
    """
    Calculate overlaps between the context sentence and the synset_signature
    and returns the synset with the highest overlap.
    
    :param context: ``context_sentence`` The context sentence where the ambiguous word occurs.
    :param synsets_signatures: ``dictionary`` A list of words that 'signifies' the ambiguous word.
    :param pos: ``pos`` A specified Part-of-Speech (POS).
    :return: ``lesk_sense`` The Synset() object with the highest signature overlaps.
    """
    # if this returns none that means that there is no overlap
    max_overlaps = 0
    lesk_sense = None
    for ss in synsets_signatures:
        if pos and str(ss.pos()) != pos: # Skips different POS.
            continue
        overlaps = set(synsets_signatures[ss]).intersection(context)
        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)  
    return lesk_sense

def lesk(context_sentence, ambiguous_word, pos=None, dictionary=None):
    """
    This function is the implementation of the original Lesk algorithm (1986).
    It requires a dictionary which contains the definition of the different
    sense of each word. See http://goo.gl/8TB15w

        >>> from nltk import word_tokenize
        >>> sent = word_tokenize("I went to the bank to deposit money.")
        >>> word = "bank"
        >>> pos = "n"
        >>> lesk(sent, word, pos)
        Synset('bank.n.07')
    
    :param context_sentence: The context sentence where the ambiguous word occurs.
    :param ambiguous_word: The ambiguous word that requires WSD.
    :param pos: A specified Part-of-Speech (POS).
    :param dictionary: A list of words that 'signifies' the ambiguous word.
    :return: ``lesk_sense`` The Synset() object with the highest signature overlaps.
    """
    if not dictionary:
        dictionary = {}
        for ss in wn.synsets(ambiguous_word):
            dictionary[ss] = ss.definition().split()
    best_sense = compare_overlaps_greedy(context_sentence, dictionary, pos)
    return best_sense
    #return dictionary 

# this function takes in a word and gets the most relevant synset based on context from the text. 
# for exact algorith refer the text above ("what I want to do" markdown)
def get_synset(word,pos_tag_text ,pos):
    if len(wn.synsets(word)) == 1:
        #print("here1")
        return wn.synsets(word)[0]
    else:
        #get all context sentences
        all_sent =[]
        for sent in pos_tag_text:
            for (w,t) in sent:
                if w == word:
                    all_sent.append(sent)
        #call lesk here
        app_syn = lesk(all_sent[len(all_sent)//2], word, pos)
        if app_syn != None:
            #print("here2")
            return app_syn
        else:
            #second lesk trial with another context sentence
            app_syn = lesk(all_sent[len(all_sent)//3], word, pos)
            if app_syn != None:
                #print("here2")
                return app_syn
            else:
                #give up and choose 1st synset from list with matching pos
                #print("here3")
                all_syns = wn.synsets(word)
                for syn in all_syns:
                    #print(syn.pos())
                    if syn.pos() == pos:
                        return syn
    return False

# this functions take all the single and double legth phrases form grand_list and gets sysnset for all them. (1 each)
def get_singles_synset(uni_list,pos_tag_text):
    single_synset =[]
    #get synsets of all singletons
    for singles in uni_list:
        singles_syn = get_synset(singles,pos_tag_text, 'n')
        if singles_syn:
            single_synset.append(singles_syn)    
    return single_synset

#get common parents
def get_lcs(uni_list,pos_tag_text):
    #get all relevant sysnsets
    all_synsets = get_singles_synset(uni_list,pos_tag_text)
    list_of_all_lcs =[]
    for syn in all_synsets:
        for syn2 in all_synsets[all_synsets.index(syn)+1:]:
            lcs = syn.lowest_common_hypernyms(syn2)[0]
            if lcs not in list_of_all_lcs:
                list_of_all_lcs.append(lcs)
    return list_of_all_lcs

# get themes
def get_theme(uni_list,pos_tag_text):
    # get common parent
    parent_sysnset = get_lcs(uni_list,pos_tag_text)
    # filter out absolute top level and get lemma_names
    lemma_names =[]
    for synset in parent_sysnset:
        if synset.min_depth() != 0:
            #print(synset)
            for each_name in synset.lemma_names():
                if each_name not in lemma_names:
                    lemma_names.append(each_name)
                break
    return lemma_names

def get_cluster_count(document):
    text = str(document['text'][0])
    cleantext = get_document_text(text)
    tagged_tokens = tag_my_text(tokenize_text_sent(cleantext))
    np_list = noun_phrase_finder(tagged_tokens)
    top_np = get_top_unigrams(np_list)
    themes = get_theme(top_np,tagged_tokens)
    return len(themes)

In [22]:
#Comment clustering
def process_document(document):
    comments = []
    for c in document['comment_list']:
        c = c.replace('\n',' ')
        if 'attached' not in c or len(c) > 500:
            comments.append(str(c))
    return comments

# Modified from Brandon Rose:
def tokenize_text_cluster(text):
    #tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # changing this to get rid of tokens like 's 
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in tokenize_text(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

def stem_text(text):
    stemmer = SnowballStemmer('english')
    tokens = tokenize_text_cluster(text)
    stems = [stemmer.stem(t) for t in tokens]
    return stems

def vectorize_comments(comments):
    tfidf_vec = TfidfVectorizer(tokenizer=tokenize_text_cluster,
                                stop_words='english',
                                ngram_range=(1,4),
                                min_df=0.1, max_df=0.8,
                                max_features=200000)
    tfidf_matrix = tfidf_vec.fit_transform(comments)
    return tfidf_matrix, tfidf_vec

# Modified from Brandon Rose:
def vocabulary_frame(text):
    tokens = tokenize_text_cluster(text)
    stems = stem_text(text)
    return pd.DataFrame({'words': tokens}, index = stems).drop_duplicates()

def extended_vocabulary_frame(texts):
    frames = []
    for t in texts:
        vf = vocabulary_frame(t)
        frames.append(vf)
    extended = pd.concat(frames).drop_duplicates()
    return extended

# Modified from Brandon Rose and
# http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html
# http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#sphx-glr-auto-examples-text-document-clustering-py
def top_words(model, num_clusters, comments, tfidf_vec, n_top_words):
    feature_names = tfidf_vec.get_feature_names()
    order_centroids = model.cluster_centers_.argsort()[:, ::-1] 
    top_words = []
    for i in range(num_clusters):
        temp_top_words = []
        for j in order_centroids[i, :n_top_words]:
            temp_top_words.append(feature_names[j])
        top_words.append(temp_top_words)
    return top_words

def cluster_comments(document, num_clusters):
    cluster_dict = {}

    comments = process_document(document)
    tfidf_matrix, tfidf_vec = vectorize_comments(comments)
    
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    
    cluster_center_list = []
    for c in clusters:
        cluster_center_list.append(km.cluster_centers_[c])
    center_distances = paired_distances(tfidf_matrix, cluster_center_list)
    
    comment_clusters = {'comment': comments, 'cluster': clusters, 'dist': center_distances}
    comment_frame = pd.DataFrame(comment_clusters, index = [clusters] , columns = ['comment', 'cluster', 'dist'])
    
    central_comments = []
    all_comments = []
    for i in range(num_clusters):
        central_comments.append(comment_frame[comment_frame.cluster==i].min().comment)
        all_comments.append(list(comment_frame[comment_frame.cluster==i]['comment']))
    
    freq_words = top_words(km, num_clusters, comments, tfidf_vec, 6)
    
    cluster_dict['central_comments'] = central_comments
    cluster_dict['all_comments'] = all_comments
    cluster_dict['top_words'] = freq_words
    
    return cluster_dict

In [4]:
doc_list =load(open("data/Master2_doc_content",'rb'))
document = doc_list[0]

In [5]:
cluster_num = get_cluster_count(document)

In [6]:
clust_dict = cluster_comments(document, cluster_num)

In [11]:
clust_dict.keys()

dict_keys(['central_comments', 'all_comments', 'top_words'])

In [27]:
clust_dict['top_words']


[['government', 'live', 'rights', 'smoking', 'homes', 'smoke'],
 ['free', 'smoke free', 'smoke', 'housing', 'public', 'public housing'],
 ['smoking', 'people', 'ban', 'housing', 'smoke', 'public'],
 ['c', 'd', 't', 's', 'e', 'like'],
 ['policy', 'smoking', 'residents', 'housing', 'rule', 'smoke'],
 ['housing', 'smoke-free', 'public', 'rule', 'public housing', 'proposed'],
 ['smoke', 'apartment', 'building', 'smoking', 'apartments', 'smokers'],
 ['smoke', 'tobacco', 'health', 'housing', 'secondhand', 'secondhand smoke'],
 ['smoke', 'hand', 'hand smoke', 'second hand', 'second', 'second hand smoke']]

### BEGIN TESTING

In [23]:
def mash_comments(cluster_dict):
    big_comment = []
    for cluster in cluster_dict["all_comments"]:
        mashed = ""
        for comment in cluster:
            mashed += comment
            mashed += " "
        big_comment.append(mashed)
    return big_comment

In [28]:
mashed_comments = mash_comments(clust_dict)


In [30]:
mashed_comments[0]

'I object to government smoking bans in general and in this specific case on the grounds that government has no legal basis to assume authority to protect a citizen from themself and that no scientific basis exists that smoking injures another citizen. I object to this government overreach. I\'ve read all the comments posted. I\'ve learned much more than I thought I would. One comment made concerns that there is a relationship between the Clean Indoor Air movement, and the much larger Clean Air Movement. In the larger Clean Air Movement, government doesn\'t restrict the end user of an automobile, instead the government legislates that the manufacturer provide products that are better for the environment. Only with the Tobacco Industry we find government agencies trying to restrict and modify a person\'s behavior. I don\'t have any opposition to this regulation if it is amended to include all Smokable Products, as well as the use of Smokable Medical Marijuana, and other things that can 

In [None]:
# TESTING NEW THINGS TESTING NEW THINGS TESTING NEW THINGS TESTING NEW THINGS
# def top_words(model, num_clusters, comments, tfidf_vec, n_top_words):
#     feature_names = tfidf_vec.get_feature_names()
#     comment_vf = extended_vocabulary_frame(comments)
#     order_centroids = model.cluster_centers_.argsort()[:, ::-1] 
#     top_words = []
#     for i in range(num_clusters):
#         temp_top_words = []
#         for j in order_centroids[i, :n_top_words]:
#             temp_top_words.append(feature_names[j])
#         top_words.append(temp_top_words)
#     return top_words

# def cluster_comments(document, num_clusters):
#     cluster_dict = {}

#     comments = process_document(document)
#     tfidf_matrix, tfidf_vec = vectorize_comments(comments)
    
#     km = KMeans(n_clusters=num_clusters)
#     km.fit(tfidf_matrix)
#     clusters = km.labels_.tolist()
    
#     # FOR TESTING
#     return km, tfidf_vec, comments

In [6]:
cluster_num

8

In [7]:
km, tfidf_vec, comments = cluster_comments(document, cluster_num)

In [8]:
TEST = top_words(km, cluster_num, comments, tfidf_vec, 6)

AttributeError: 'str' object has no attribute 'get_feature_names'

In [42]:
for i in TEST:
    print(i)

['smoke', 'health', 'tobacco', 'housing', 'secondhand', 'public']
['smoke', "n't", 'people', "'s", 'smoking', 'apartment']
['policy', 'housing', 'smoking', 'smoke-free', 'residents', 'smoke']
['free', 'smoke free', 'smoke', 'housing', 'public', 'public housing']
['s', 'government', 'smoking', 'smoker', 'e', 'proposal']
['smoke-free', 'housing', 'public', 'public housing', 'smoke-free public housing', 'smoke-free public']
['smoking', 'ban', 'housing', 'public', 'public housing', 'health']
['rule', 'housing', 'proposed rule', 'proposed', 'public', 'hud']
['smoke', 'hand', 'second hand', 'hand smoke', 'second', 'second hand smoke']


In [37]:
tfidf_vec.vocabulary_

{"'s": 0,
 '5597-p-02': 1,
 'able': 2,
 'additional': 3,
 'agencies': 4,
 'air': 5,
 'allow': 6,
 'allowed': 7,
 'american': 8,
 'apartment': 9,
 'apartments': 10,
 'area': 11,
 'areas': 12,
 'asthma': 13,
 'authorities': 14,
 'authority': 15,
 'ban': 16,
 'based': 17,
 'believe': 18,
 'benefits': 19,
 'building': 20,
 'buildings': 21,
 'ca': 22,
 'cancer': 23,
 'cause': 24,
 'cessation': 25,
 'children': 26,
 'cigarette': 27,
 'cigarettes': 28,
 'clean': 29,
 'comment': 30,
 'comments': 31,
 'common': 32,
 'community': 33,
 'concern': 34,
 'control': 35,
 'cost': 36,
 'costs': 37,
 'counsel': 38,
 'create': 39,
 'current': 40,
 'dc': 41,
 'department': 42,
 'department housing': 43,
 'department housing urban': 44,
 'department housing urban development': 45,
 'development': 46,
 'disease': 47,
 'division': 48,
 'docket': 49,
 'docket fr': 50,
 'does': 51,
 'e': 52,
 'effects': 53,
 'elderly': 54,
 'electronic': 55,
 'encourage': 56,
 'enforcement': 57,
 'environment': 58,
 'especiall

### END TESTING

In [46]:
n = get_cluster_count(document)



In [47]:
test = cluster_comments(document, n)

In [48]:
test.keys()

dict_keys(['central_comments', 'all_comments', 'top_words'])

In [49]:
len(test['central_comments'])

9

In [51]:
for i in test['central_comments']:
    print(i)
    print('\n\n\n\n')

    RE: Proposed Rule by HUD, Docket No. FR 5597-P-02, Instituting Smoke-Free Public Housing       I oppose this rule.  All of the reasons for being against this can already be found in all the other comments that object to it and with which I agree.  So I'll only point out two things that might not have been covered yet.     1)  In April 2014 the Hawaii Public Housing Authority (HPHA) rescinded the same kind of ban in its own properties that it had instituted in February 2013 because HPHA determined that "it needed a law to be passed before it can make such drastic changes."  In other words, this kind of action could not be taken by an agency and must go through the state legislature and voted on by lawmakers.       If Hawaii understood this to be the case then what has made the bans in public housing, imposed by local Authorities scattered around the country, legitimate?  Now as importantly, how can HUD's (an agency) rule be lawful?     (see http://khon2.com/2014/04/25/state-halts-no