In [1]:
from pickle import dump, load
import nltk
from nltk import word_tokenize,FreqDist
import re
from nltk.corpus import wordnet as wn

In [6]:
def get_document_text(raw_text):
    """Return the raw API document text with line breaks, asterisks and HTML tags removed.

    Every newline and carriage return is turned into a single space, every
    asterisk is dropped, and each ``<...>`` span (matched non-greedily) is
    deleted without inserting a replacement.

    INPUT: string
    OUTPUT: string
    """
    # The three single-character rewrites are independent, so one translate pass covers them.
    char_map = {ord('\n'): ' ', ord('\r'): ' ', ord('*'): None}
    flattened = raw_text.translate(char_map)
    # Remove any residual HTML tags in text
    return re.sub('<.*?>', '', flattened)

# Non-capturing groups make ``findall`` return one plain string per token.
# With capturing groups it returns a tuple of groups per match, and the inner
# ``([A-Z]\.)`` group leaked the last letter of every abbreviation as a
# spurious extra token (e.g. "U.S." produced both "u.s." and "s.").
_TOKEN_PATTERN = re.compile(r'''(?x)    # set flag to allow verbose regexps
    (?:[A-Z]\.)+            # abbreviations, e.g. B.C.
    | \w+(?:[-']\w+)*       # words with optional internal hyphens e.g. after-ages or author's
    ''')


def tokenize_text(corpus):
    """Split ``corpus`` into lower-cased word tokens.

    A token is either an abbreviation made of capital letters each followed by
    a period (``B.C.``) or a run of word characters that may contain internal
    hyphens or apostrophes (``after-ages``, ``author's``). Everything else
    (whitespace, punctuation) is discarded.

    INPUT: string
    OUTPUT: list of strings, in the order they occur in ``corpus``
    """
    return [token.lower() for token in _TOKEN_PATTERN.findall(corpus)]

def tokenize_text_sent(corpus):
    """Break ``corpus`` into sentences and tokenize each sentence.

    Sentence boundaries come from the tokenizer loaded from
    ``tokenizers/punkt/english.pickle``; each sentence is then passed to
    ``tokenize_text``.

    INPUT: string
    OUTPUT: list with one ``tokenize_text`` result per sentence, in document order
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenized_sentences = []
    for sentence in punkt.tokenize(corpus):
        tokenized_sentences.append(tokenize_text(sentence))
    return tokenized_sentences

def tag_my_text(sents):
    """Part-of-speech tag every tokenized sentence in ``sents``.

    INPUT: iterable of token lists
    OUTPUT: list holding the ``nltk.pos_tag`` result for each sentence, in the same order
    """
    tagged_sentences = []
    for tokens in sents:
        tagged_sentences.append(nltk.pos_tag(tokens))
    return tagged_sentences

# Chunk noun phrases in tree
def noun_phrase_chunker():
    """Build the regexp chunk parser used to find simple noun phrases.

    An NP chunk is an optional determiner or possessive pronoun, followed by
    any number of adjectives (JJ), followed by one singular common noun (NN).

    The possessive alternative accepts both ``PP$`` (the tag the grammar was
    originally written with) and ``PRP$``. The default English ``nltk.pos_tag``
    model labels possessive pronouns ``PRP$`` (Penn Treebank tagset), so with
    ``PP$`` alone phrases such as "her long hair" were never chunked with
    their possessive.

    OUTPUT: an ``nltk.RegexpParser`` for the NP grammar
    """
    grammar = r"""
    NP: {<DT|PP\$|PRP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
    """
    return nltk.RegexpParser(grammar)

# Extract only the NP-labelled subtrees (the chunk our grammar defines) from each parse tree
def noun_phrase_extractor(sentences, chunker):
    """Parse every tagged sentence with ``chunker`` and collect its NP chunks.

    INPUT: ``sentences`` - iterable of tagged sentences accepted by ``chunker.parse``
           ``chunker``   - object whose ``parse`` returns a tree exposing
                           ``subtrees()`` and, on each subtree, ``label()``
    OUTPUT: list with one entry per NP subtree, each entry being the list of
            that subtree's children, in the order the subtrees are visited
    """
    return [
        list(chunk)
        for sent in sentences
        for chunk in chunker.parse(sent).subtrees()
        if chunk.label() == 'NP'
    ]

# Remove the tags and keep only the words of each noun phrase; can be adjusted for length
def noun_phrase_finder(tagged_text):
    """Return the noun phrases of ``tagged_text`` as lists of words.

    The phrases are the NP chunks produced by ``noun_phrase_extractor`` with
    the grammar from ``noun_phrase_chunker`` (common-noun phrases, not proper
    nouns). Empty chunks are skipped; the tag of every (word, tag) pair is
    discarded.

    INPUT: list of POS-tagged sentences
    OUTPUT: list of word lists, one per non-empty noun phrase
    """
    chunks = noun_phrase_extractor(tagged_text, noun_phrase_chunker())
    return [
        [word for (word, _tag) in chunk]
        for chunk in chunks
        if len(chunk) > 0
    ]

# Frequency distribution object for the noun phrases of one given length
def find_freq(nested_list,nest_len):
    """Count how often each noun phrase of exactly ``nest_len`` words occurs.

    INPUT: ``nested_list`` - list of noun phrases, each a list of words
           ``nest_len``    - number of words a phrase must have to be counted
    OUTPUT: ``FreqDist`` keyed by the phrase's words joined with single spaces
    """
    matching_phrases = [
        ' '.join(phrase)
        for phrase in nested_list
        if len(phrase) == nest_len
    ]
    return FreqDist(matching_phrases)

# #make a grand list of top occurring noun phrases of different sizes --- For testing purposes only. Won't be used
# def get_top_np(np):
#     master_common_list=[]
#     len_list =get_length_np(np).keys()
#     for item in len_list:
#         fdist_np = find_freq(np_list,item)
#         master_common_list.append(fdist_np.most_common(15))
#     return master_common_list

def get_top_unigrams(np, coverage=0.3):
    """Return the most frequent one-word noun phrases covering a share of all occurrences.

    One-word phrases are ranked by frequency (via ``find_freq``) and taken in
    that order until their cumulative count exceeds ``coverage`` times the
    total number of one-word phrases. The phrase that crosses the threshold
    is included.

    INPUT: ``np``       - list of noun phrases, each a list of words
           ``coverage`` - fraction of one-word phrase occurrences to cover;
                          defaults to 0.3, the value previously hard-coded
    OUTPUT: list of words, most frequent first; empty when ``np`` holds no
            one-word phrases
    """
    # Only the number of one-word phrases is needed, so count instead of copying them.
    unigram_total = sum(1 for phrase in np if len(phrase) == 1)
    threshold = coverage * unigram_total

    top = []
    covered = 0
    for word, count in find_freq(np, 1).most_common():
        top.append(word)
        covered += count
        if covered > threshold:
            break
    return top

# Lesk algorithm for disambiguation in case of multiple synsets of a word
def compare_overlaps_greedy(context, synsets_signatures, pos=None):
    """
    Pick the synset whose signature shares the most distinct items with the context.

    :param context: ``context_sentence`` The context sentence where the ambiguous word occurs.
    :param synsets_signatures: ``dictionary`` Mapping from each candidate synset to the
        list of words that 'signifies' it.
    :param pos: ``pos`` A specified Part-of-Speech (POS); when given, synsets whose
        ``pos()`` differs are ignored.
    :return: ``lesk_sense`` The Synset() object with the highest signature overlap.
        Ties go to the synset seen first; ``None`` is returned when no candidate
        overlaps the context at all.
    """
    best_sense, best_score = None, 0
    for candidate, signature in synsets_signatures.items():
        if not pos or str(candidate.pos()) == pos:
            score = len(set(signature).intersection(context))
            # Strictly greater: an overlap of zero never wins, and earlier ties are kept.
            if score > best_score:
                best_sense, best_score = candidate, score
    return best_sense

def lesk(context_sentence, ambiguous_word, pos=None, dictionary=None):
    """
    Implementation of the original Lesk algorithm (1986) for word sense
    disambiguation. It needs a dictionary holding the definition of each
    sense of the word. See http://goo.gl/8TB15w

    Typical use: tokenize a sentence such as "I went to the bank to deposit
    money." and call ``lesk(tokens, "bank", "n")``.

    :param context_sentence: The context sentence where the ambiguous word occurs.
    :param ambiguous_word: The ambiguous word that requires WSD.
    :param pos: A specified Part-of-Speech (POS).
    :param dictionary: Mapping from synset to the list of words that 'signifies' it.
        When missing or empty it is built from WordNet: every synset of
        ``ambiguous_word`` mapped to its definition split on whitespace.
    :return: ``lesk_sense`` The Synset() object with the highest signature overlaps,
        as chosen by ``compare_overlaps_greedy``.
    """
    if dictionary:
        signatures = dictionary
    else:
        signatures = {
            sense: sense.definition().split()
            for sense in wn.synsets(ambiguous_word)
        }
    return compare_overlaps_greedy(context_sentence, signatures, pos)

# This function takes in a word and gets the most relevant synset based on context from the text.
def get_synset(word,pos_tag_text ,pos):
    """Choose one WordNet synset for ``word`` using the sentences it occurs in.

    Strategy:
      1. If WordNet knows exactly one synset for the word, return it.
      2. Otherwise run ``lesk`` on the context at the middle of the list of
         sentences containing the word, then on the one a third of the way in.
      3. If both attempts fail, fall back to the first synset whose part of
         speech equals ``pos``.

    The context handed to ``lesk`` is the plain word list of the sentence.
    Passing the (word, tag) tuples, as was done before, can never overlap with
    the definition words, so Lesk always came back empty.

    INPUT: ``word``         - the word to disambiguate
           ``pos_tag_text`` - list of sentences, each a list of (word, tag) pairs
           ``pos``          - WordNet part-of-speech code, e.g. 'n'
    OUTPUT: the chosen Synset, or ``False`` when none could be selected
    """
    candidates = wn.synsets(word)  # looked up once and reused by the fallback
    if len(candidates) == 1:
        return candidates[0]
    if not candidates:
        # Nothing to disambiguate and nothing to fall back on.
        return False

    # Collect the context sentences, one entry per occurrence of the word so
    # the middle / one-third positions pick the same sentences as before.
    contexts = []
    for sent in pos_tag_text:
        words = [token for (token, _tag) in sent]
        contexts.extend(words for token in words if token == word)

    # Guard against a word that never appears in the text (would otherwise
    # raise IndexError on the empty list).
    if contexts:
        for position in (len(contexts) // 2, len(contexts) // 3):
            sense = lesk(contexts[position], word, pos)
            if sense is not None:
                return sense

    # Give up and choose the first synset from the list with a matching pos.
    for candidate in candidates:
        if candidate.pos() == pos:
            return candidate
    return False

# Resolve every single-word phrase to one synset each
def get_singles_synset(uni_list,pos_tag_text):
    """Look up a noun synset for each word in ``uni_list``.

    Each word is passed to ``get_synset`` together with ``pos_tag_text`` and
    the part-of-speech code 'n'. Words for which the lookup returns a falsy
    value are left out.

    INPUT: ``uni_list``     - iterable of words
           ``pos_tag_text`` - POS-tagged text forwarded to ``get_synset``
    OUTPUT: list of the synsets found, in the order of ``uni_list``
    """
    lookups = (get_synset(term, pos_tag_text, 'n') for term in uni_list)
    return [found for found in lookups if found]

# Get common parents
def get_lcs(uni_list,pos_tag_text):
    """Collect the lowest common hypernym of every pair of resolved synsets.

    The synsets come from ``get_singles_synset``. For each pair the first
    entry returned by ``lowest_common_hypernyms`` is kept; duplicates are
    dropped while preserving first-seen order.

    Pairs for which ``lowest_common_hypernyms`` returns an empty list are
    skipped instead of raising IndexError. This presumably happens when the
    two synsets do not share a hypernym hierarchy (e.g. different parts of
    speech); ``get_synset`` can return a non-noun synset when a word has only
    one sense.

    INPUT: ``uni_list``     - iterable of words
           ``pos_tag_text`` - POS-tagged text forwarded to the synset lookup
    OUTPUT: list of distinct synsets
    """
    # get all relevant synsets
    all_synsets = get_singles_synset(uni_list, pos_tag_text)
    list_of_all_lcs = []
    # enumerate gives the position directly; list.index() rescanned the list
    # on every iteration and resolved duplicates to their first occurrence.
    for position, syn in enumerate(all_synsets):
        for syn2 in all_synsets[position + 1:]:
            common = syn.lowest_common_hypernyms(syn2)
            if not common:
                continue
            lcs = common[0]
            if lcs not in list_of_all_lcs:
                list_of_all_lcs.append(lcs)
    return list_of_all_lcs

# Get themes
def get_theme(uni_list,pos_tag_text):
    """Turn the common-parent synsets of the given words into a list of theme names.

    The parents come from ``get_lcs``. A parent whose ``min_depth()`` is 0
    (the absolute top level) is ignored. From every remaining parent only the
    first lemma name is considered, and it is added unless already present.

    INPUT: ``uni_list``     - iterable of words
           ``pos_tag_text`` - POS-tagged text forwarded to ``get_lcs``
    OUTPUT: list of distinct lemma names, in first-seen order
    """
    themes = []
    for parent in get_lcs(uni_list, pos_tag_text):
        # filter out absolute top level
        if parent.min_depth() == 0:
            continue
        # Only the leading lemma name represents the synset.
        leading_name = next(iter(parent.lemma_names()), None)
        if leading_name is not None and leading_name not in themes:
            themes.append(leading_name)
    return themes
    

In [3]:
def get_cluster_count(text):
    """Run the full pipeline on a raw document string and count its themes.

    Steps: clean the text, split and tokenize it into sentences, POS-tag
    them, extract noun phrases, keep the top one-word phrases, and derive
    themes from those words together with the tagged sentences.

    INPUT: string
    OUTPUT: int, the number of themes returned by ``get_theme``
    """
    sentences = tokenize_text_sent(get_document_text(text))
    tagged_sentences = tag_my_text(sentences)
    top_terms = get_top_unigrams(noun_phrase_finder(tagged_sentences))
    return len(get_theme(top_terms, tagged_sentences))

In [16]:
# Test
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
# The context manager closes the file handle, which the bare open() call left open.
with open("data/Master_doc_content", 'rb') as doc_file:
    doc_list = load(doc_file)
# Pick one sample document and take the first entry of its 'text' field as a string.
document = doc_list[6]
document_text = str(document['text'][0])


In [17]:
# Run the whole pipeline on the sample document; the displayed value is the number of themes found.
get_cluster_count(document_text)

1