In [1]:
import io
import itertools
import networkx as nx
import nltk
import editdistance

In [3]:
def setup_envt():
    """Download the NLTK resources used by this notebook.

    Requests the ``punkt`` tokenizer models and the
    ``averaged_perceptron_tagger`` part-of-speech tagger model through
    ``nltk.download`` and prints a confirmation line afterwards.
    """
    # NLTK package identifiers are written with underscores; the
    # space-separated spelling is not a valid id and cannot be downloaded.
    for resource in ('punkt', 'averaged_perceptron_tagger'):
        nltk.download(resource)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Download complete")

In [4]:
def filter_tags(tagged, tags=('NN', 'JJ', 'NNP')):
    """Keep only the tagged tokens whose part-of-speech tag is in ``tags``.

    Args:
        tagged: Iterable of ``(token, tag)`` pairs (anything indexable where
            position 1 holds the tag).
        tags: Collection of tags to keep. Defaults to ``NN``, ``JJ`` and
            ``NNP``. An immutable tuple is used as the default so the default
            can never be mutated between calls.

    Returns:
        A new list with the matching items, in their original order.
    """
    filtered = [item for item in tagged if item[1] in tags]
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("All tokens tagged.")
    return filtered
    

In [6]:
def normalized(tagged):
    """Return the tagged tokens with every '.' removed from the token text.

    Args:
        tagged: Iterable of ``(token, tag)`` pairs.

    Returns:
        A new list of ``(token_without_dots, tag)`` tuples in the same order.
    """
    # Each result must be a single tuple: list.append accepts exactly one
    # argument, so passing token and tag separately raises TypeError.
    return [(item[0].replace('.', ''), item[1]) for item in tagged]

In [7]:
def unique_set(iterable, key=None):
    """Lazily yield the elements of ``iterable`` with duplicates removed.

    The first occurrence of each element is kept and the original order is
    preserved.

    Args:
        iterable: Source of elements. The values used for comparison must be
            hashable because they are stored in a set.
        key: Optional one-argument function. When given, two elements are
            considered duplicates if ``key`` returns equal values for them;
            when ``None``, the elements themselves are compared.

    Yields:
        Each element whose comparison value has not been seen before.
    """
    seen = set()
    for element in iterable:
        marker = element if key is None else key(element)
        # Membership must be tested while iterating: pre-filtering the whole
        # input against a still-empty `seen` set removes nothing.
        if marker not in seen:
            seen.add(marker)
            yield element

In [8]:
def build_graph(nodes):
    """Build a fully connected undirected graph over ``nodes``.

    Every node is added to a ``networkx.Graph`` and every unordered pair of
    nodes is joined by an edge whose ``weight`` attribute is the value
    returned by ``editdistance.eval`` for the two nodes.

    Args:
        nodes: The items to use as graph nodes.

    Returns:
        The populated ``networkx.Graph``.
    """
    graph = nx.Graph()
    graph.add_nodes_from(nodes)
    for left, right in itertools.combinations(nodes, 2):
        distance = editdistance.eval(left, right)
        graph.add_edge(left, right, weight=distance)
    return graph

In [10]:
# Extract keywords and key-phrases from the text of a document.
def extract_keyphrases(text):
    """Extract keywords and two-word key-phrases from ``text``.

    The text is tokenised and POS-tagged with NLTK, reduced by
    ``filter_tags``, cleaned by ``normalized`` and de-duplicated with
    ``unique_set``. The remaining words become the nodes of the graph built
    by ``build_graph`` and are ranked with ``nx.pagerank``. The top
    ``len(words) // 3 + 1`` ranked words are treated as keywords; two
    keywords that are adjacent in the token stream are joined into a single
    space-separated phrase.

    Args:
        text: The document text to analyse.

    Returns:
        A set of strings containing the joined phrases plus every keyword
        that had not yet been used in a phrase when it was encountered.
    """
    word_tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(word_tokens)
    # Token order of the full text is needed later to find adjacent keywords.
    textlist = [x[0] for x in tagged]

    tagged = filter_tags(tagged)
    tagged = normalized(tagged)

    word_set = list(unique_set([x[0] for x in tagged]))

    graph = build_graph(word_set)
    calc_rank = nx.pagerank(graph, weight='weight')

    ranked_words = sorted(calc_rank, key=calc_rank.get, reverse=True)
    # Keep about a third of the candidate words; a set makes the repeated
    # membership tests below cheap.
    keyphrases = set(ranked_words[0:len(word_set) // 3 + 1])

    modified_phrases = set()
    # Keywords that have already been merged into a two-word phrase.
    dealt_with = set()
    # Walk over each token together with the token that follows it.
    for first, second in zip(textlist, textlist[1:]):
        # Both neighbours must be tested explicitly; `first and second in ...`
        # would only test `second`.
        if first in keyphrases and second in keyphrases:
            modified_phrases.add(first + ' ' + second)  # create phrase
            dealt_with.add(first)
            dealt_with.add(second)
        elif first in keyphrases and first not in dealt_with:
            modified_phrases.add(first)

    # The final token never appears as `first`, so handle it here. This also
    # covers a text that consists of a single token.
    if textlist:
        last = textlist[-1]
        if last in keyphrases and last not in dealt_with:
            modified_phrases.add(last)

    return modified_phrases

In [11]:
def extract(text, summary_len, clean_sentences=False, language='English'):
    """Build an extractive summary of ``text`` limited to ``summary_len`` words.

    The text is split into sentences with the NLTK punkt model for
    ``language``; the sentences become the nodes of the graph built by
    ``build_graph`` and are ranked with ``nx.pagerank``. The sentences are
    concatenated from highest to lowest rank and the result is truncated to
    the first ``summary_len`` whitespace-separated words.

    Args:
        text: The document text to summarise.
        summary_len: Maximum number of words in the summary.
        clean_sentences: When true, and at least one kept word contains a
            '.', the summary is cut after the last such word so that it does
            not end in the middle of a sentence.
        language: Name of the punkt model to load. It is lower-cased before
            the resource path is built, because the punkt model files are
            named in lowercase (e.g. ``english.pickle``).

    Returns:
        The summary as a single space-joined string.
    """
    resource = 'tokenizers/punkt/' + language.lower() + '.pickle'
    sent_detector = nltk.data.load(resource)
    sentence_tokens = sent_detector.tokenize(text.strip())
    graph = build_graph(sentence_tokens)

    calculated_page_rank = nx.pagerank(graph, weight='weight')
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True)
    summary_words = ' '.join(sentences).split()[0:summary_len]

    if clean_sentences:
        # Positions of the words that contain a sentence-ending dot.
        dot_indices = [idx for idx, word in enumerate(summary_words) if '.' in word]
        if dot_indices:
            summary_words = summary_words[0:max(dot_indices) + 1]

    return ' '.join(summary_words)

In [12]:
def check_output(summary, key_phrases, fname):
    """Write the key phrases and the summary to two text files.

    The key phrases are written one per line to ``fname``; the summary is
    written to a second file whose name is ``"summary_"`` followed by
    ``fname``. Existing files are overwritten. A status line is printed
    after each file has been written.

    Args:
        summary: Summary text to write.
        key_phrases: Iterable of key-phrase strings.
        fname: Name of the key-phrase file; also the suffix of the summary
            file name.
    """
    # `with` guarantees the handle is closed even if a write raises.
    with open(fname, "w") as kp_file:
        for item in key_phrases:
            kp_file.write(item + "\n")
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("KEYWORDS GENERATED")

    with open("summary_" + fname, "w") as sum_file:
        sum_file.write(summary)
    print("SUMMARY GENERATED")
    

In [13]:
# Disabled sample driver, kept as a bare string literal so that it is not
# executed; evaluating this cell only echoes the text back as its output.
# NOTE(review): before re-enabling it, note that `os` is not among the imports
# in the notebook's visible import cell, and that `extract(text)` omits the
# `summary_len` argument that `extract` requires.
"""
def summarize():
    #sample check for NLP
    articles = os.listdir("articles")
    for article in articles:
        print('Reading articles/' + article)
        article_file = io.open('articles/' + article, 'r')
        text = article_file.read()
        keyphrases = extract_keyphrases(text)
        summary = extract(text)
        check_output(summary, keyphrases, article)
    
"""

'\ndef summarize():\n    #sample check for NLP\n    articles = os.listdir("articles")\n    for article in articles:\n        print(\'Reading articles/\' + article)\n        article_file = io.open(\'articles/\' + article, \'r\')\n        text = article_file.read()\n        keyphrases = extract_keyphrases(text)\n        summary = extract(text)\n        check_output(summary, keyphrases, article)\n    \n'

In [None]:
# An empty package identifier cannot match any NLTK resource, so fetch the
# resources this notebook needs through its own setup helper instead.
setup_envt()