In [4]:
from collections import Counter
import re

import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy.stats import entropy as kl_div
from math import log

from tqdm import tqdm
from elasticsearch import Elasticsearch

import nltk
# Make sure the 'punkt' NLTK data package is installed; the captured output
# below shows the call simply reports "up-to-date" when it is already present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Single-node Elasticsearch client; the node address is hard-coded.
es_node = {'host': '172.18.0.3', 'port': 9200}
es = Elasticsearch([es_node])

In [13]:
# Match-all query asking for up to 301 hits from the 'duc' index.
_text = {
    'size': 301,
    'query': {'match_all': {}},
}

# Keep only the stored document (_source) of each hit.
duc_hits = es.search(index='duc', doc_type='doc', body=_text)['hits']['hits']
data = [hit['_source'] for hit in duc_hits]

In [14]:
# Match-all query asking for up to 10 hits from the 'topicsduc' index.
_topics = {
    'size': 10,
    'query': {'match_all': {}},
}
# Keep only the stored document (_source) of each topic hit.
topic_hits = es.search(index='topicsduc', doc_type='topic', body=_topics)['hits']['hits']
topics = [hit['_source'] for hit in topic_hits]

In [75]:
# Re-run the 'duc' query and list the fields stored on the first hit.
es.search(index='duc', doc_type='doc', body=_text)['hits']['hits'][0]['_source'].keys()

dict_keys(['doc_id', 'doc_text', 'doc_topics', 'doc_topics_pd', 'gold_summary'])

In [17]:
# Splits on every non-word character and keeps the separator as its own token.
regex = re.compile(r"(\W)")


def wc(text):
    """Count the tokens of ``text``.

    Tokens are the pieces produced by splitting on each non-word character
    (the separators themselves are kept as tokens). Empty pieces, single
    spaces and newlines are discarded; other separators such as punctuation
    or tabs are counted.

    Args:
        text: String to tokenize.

    Returns:
        A ``Counter`` mapping token -> number of occurrences.
    """
    return Counter(t for t in regex.split(text) if t and t != ' ' and t != '\n')


def pd(wc):
    """Turn a token-count mapping into a relative-frequency mapping.

    Args:
        wc: Mapping of token -> count (e.g. the ``Counter`` built by ``wc()``).

    Returns:
        A dict with the same keys whose values are ``count / total``. An
        empty mapping yields an empty dict.
    """
    # Compute the denominator once instead of once per key.
    total = sum(wc.values())
    return {k: v / total for k, v in wc.items()}

In [15]:
def klsum(document, summary, L):
    """Greedily grow ``summary`` with sentences taken from ``document``.

    In every round each remaining sentence is tentatively appended to the
    current summary, and the sentence whose resulting token distribution has
    the lowest KL divergence from the token distribution of the full original
    document is kept. Tokens of the document that are missing from a candidate
    summary get a pseudo-probability of 0.001 so the divergence stays finite.

    Relies on the notebook-level helpers ``wc`` (token counts) and ``pd``
    (relative frequencies), and on ``kl_div`` / ``sent_tokenize``.

    Args:
        document: Source text to summarize.
        summary: Initial summary text (``''`` to start from scratch).
        L: Target number of sentences, measured by re-tokenizing the summary.

    Returns:
        The summary, with selected sentences joined by newlines and outer
        whitespace stripped. If the document runs out of sentences before
        ``L`` is reached, the shorter summary is returned instead of raising.
    """
    doc_pd = pd(wc(document))
    px = list(doc_pd.values())

    # Split once and remove picked sentences from this list; re-joining and
    # re-tokenizing the leftover text could re-segment it differently.
    remaining = sent_tokenize(document)

    while remaining and len(sent_tokenize(summary)) < L:
        # -1 keeps the historical fallback (last sentence) if no divergence
        # compares below the sentinel, e.g. because it is NaN.
        _min, _min_id = float('inf'), -1
        for idx, sent in enumerate(remaining):
            # The space stops the summary's last token fusing with the
            # candidate's first one; wc() ignores the space itself.
            new_pd = pd(wc(summary + " " + sent))
            qx = [new_pd.get(k, 0.001) for k in doc_pd]
            kl = kl_div(px, qx)
            if kl < _min:
                _min, _min_id = kl, idx

        summary += "\n" + remaining.pop(_min_id)

    return summary.strip()

In [25]:
# Show how the sentence tokenizer splits the first loaded document.
sent_tokenize(data[0]['doc_text'])

['THE 74-year-old man said it was "like needles or nails being stuck in" when\npolice handcuffed him behind his back as he lay face down.',
 'Then the police\nlifted him by his arms and dragged him from the scene.',
 'Retired Roman\nCatholic Bishop George Lynch of New York was describing his treatment by West\nHartford, Conn., police during a 1989 anti-abortion demonstration.',
 'In mid-June, a lawsuit against the Los Angeles Police Department was\nsettled when police agreed to stop using a martial-arts weapon, nunchakus,\nwhile arresting anti-abortion advocates.',
 'Hundreds have charged that police\nin more than 50 cities have used excessive force in removing demonstrators\nintent on closing down abortion facilities.',
 'The demonstrators, most associated\nwith Operation Rescue, have charged and testified that police tactics used\nduring the past 2 1/2 years in cities such as Denver, Atlanta, Pittsburgh and\nLos Angeles have resulted in serious injury and led to sexual abuse against\

In [18]:
# One-sentence KL summary for every loaded document, keyed by doc_id.
kl_summaries = {doc['doc_id']: klsum(doc['doc_text'], '', 1) for doc in data}

In [19]:
def ldasum(data, summary, topics, L):
    """Extend ``summary`` with the document sentences that score highest on topic words.

    A sentence's score is the sum, over every topic listed in
    ``data['doc_topics']`` and every token of the sentence found in that
    topic's ``top_words``, of the matching entry of ``word_prob``. All listed
    topics count equally; the per-document topic probabilities in
    ``data['doc_topics_pd']`` are not used.

    Args:
        data: Document record with ``'doc_text'`` and ``'doc_topics'`` (a
            comma-separated string of integer topic ids).
        summary: Initial summary text (``''`` to start from scratch).
        topics: Iterable of topic records with ``'topic_id'``, ``'top_words'``
            and ``'word_prob'`` (the latter two index-aligned).
        L: Target number of sentences, measured by re-tokenizing the summary.

    Returns:
        The summary, with selected sentences joined by ``' \\n '`` and outer
        whitespace stripped. If the document runs out of sentences before
        ``L`` is reached, the shorter summary is returned instead of raising.
    """
    doc_topics = [int(t.strip()) for t in data['doc_topics'].split(',')]

    # Tokenize once: scores do not depend on the summary built so far, so
    # they are computed a single time and consumed in descending order.
    remaining = sent_tokenize(data['doc_text'])
    tokenized = [word_tokenize(sent) for sent in remaining]

    score = [0] * len(remaining)
    for topic in topics:
        if topic['topic_id'] not in doc_topics:
            continue
        top_words, word_prob = topic['top_words'], topic['word_prob']
        for sidx, words in enumerate(tokenized):
            for word in words:
                if word in top_words:
                    score[sidx] += word_prob[top_words.index(word)]

    while remaining and len(sent_tokenize(summary)) < L:
        # argmax returns the first maximum, so ties go to the earliest sentence.
        best = int(np.argmax(score))
        summary += ' \n ' + remaining.pop(best)
        score.pop(best)

    return summary.strip()

# One-sentence topic-word summary for every loaded document, keyed by doc_id.
lda_summaries = {doc['doc_id']: ldasum(doc, '', topics, 1) for doc in tqdm(data)}

100%|██████████| 301/301 [00:09<00:00, 32.24it/s]


## Upload to ES

In [57]:
# Attach both generated summaries to each document and store it in 'ducsummary'.
for d in tqdm(data):
    _id = d['doc_id']
    d['kl_summary'] = kl_summaries[_id]
    d['lda_summary'] = lda_summaries[_id]

    # Use doc_id as the Elasticsearch document id so that re-running this cell
    # overwrites the existing entry instead of indexing a duplicate (duplicates
    # would make the later fixed-size read-back return repeated documents).
    es.index(index='ducsummary', doc_type='doc', id=_id, body=d)

100%|██████████| 301/301 [00:02<00:00, 116.74it/s]


In [20]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# Bag-of-words features over all loaded document texts.
duc_data = [doc['doc_text'] for doc in data]
vectorizer = CountVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=2,
    max_features=1000,
)
features = vectorizer.fit_transform(duc_data)

# 10-component LDA with a fixed seed; `model` and `vectorizer` are read as
# globals by topicsum().
model = LatentDirichletAllocation(
    n_components=10,
    random_state=666,
    learning_method='online',
    n_jobs=-1,
)
model.fit(features)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=666, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## Summarization

In [21]:
def topicsum(document, summary, L):
    """Greedily grow ``summary`` so its topic mixture tracks the document's.

    Uses the notebook-level ``vectorizer`` and ``model``: the document and
    each candidate summary (current summary plus one remaining sentence) are
    mapped to topic distributions via ``model.transform``, and the sentence
    giving the lowest KL divergence from the document's distribution is kept.

    Args:
        document: Source text to summarize.
        summary: Initial summary text (``''`` to start from scratch).
        L: Target number of sentences, measured by re-tokenizing the summary.

    Returns:
        The summary, with selected sentences joined by newlines and outer
        whitespace stripped. If the document runs out of sentences before
        ``L`` is reached, the shorter summary is returned instead of raising.
    """
    px = model.transform(vectorizer.transform([document]))[0]

    # Split once and remove picked sentences from this list; re-joining and
    # re-tokenizing the leftover text could re-segment it differently.
    remaining = sent_tokenize(document)

    while remaining and len(sent_tokenize(summary)) < L:
        # The space stops the summary's last word fusing with the candidate's
        # first one. All candidates of a round go through a single transform
        # call instead of one call per sentence.
        candidates = [summary + " " + sent for sent in remaining]
        topic_mixes = model.transform(vectorizer.transform(candidates))

        # -1 keeps the historical fallback (last sentence) if no divergence
        # compares below the sentinel, e.g. because it is NaN.
        _min, _min_id = float('inf'), -1
        for idx, qx in enumerate(topic_mixes):
            kl = kl_div(px, qx)
            if kl < _min:
                _min, _min_id = kl, idx

        summary += "\n" + remaining.pop(_min_id)

    return summary.strip()

In [22]:
# Raw text of the first loaded document.
data[0]['doc_text']

"A U.S. Air Force F-111\nfighter-bomber crashed today in Saudi Arabia, killing both crew\nmembers, U.S. military officials reported.\n   It was the fourth American aircraft to crash in three days among\nthose deployed to this kingdom for Operation Desert Shield.\n   Eight Marines are missing in the crash of two helicopters in the\nnorthern Arabian Sea on Monday. An Air Force F-4 reconnaissance jet\nalso went down that day, killing both crew members.\n   Lt. Cmdr. J.D. van Sickle, a military spokesman, said the F-111\ncrashed in the ``southern Arabian peninsula'' while on a training\nmission and that the incident was under investigation.\n   The names of the flyers were withheld pending notification of\nnext of kin.\n   The aircraft was attached the 48th Tactical Fighter Wing based\nat RAF Lakenheath Air Base in Britain. The wing was sent to Turkey\nas U.S. forces massed in the region in response to Iraq's Aug. 2\ntakeover of Kuwait.\n   Today's crash brought to at least nine the number

In [90]:
# Two-sentence topic-distribution summary of the first loaded document.
# The disabled line is an earlier variant that read the text from a `docs`
# mapping, which is not defined anywhere in the visible notebook.
#print(topicsum(docs['AP880217-0175'], '', 2))
print(topicsum(data[0]['doc_text'], '', 2))

In addition to those killed in Saudi Arabia, 13 other Air Force
personnel were killed in a crash of a C-5 jet cargo plane in
Germany.
A U.S. Air Force F-111
fighter-bomber crashed today in Saudi Arabia, killing both crew
members, U.S. military officials reported.


In [93]:
# Fields available on the first loaded document.
data[0].keys()

dict_keys(['doc_id', 'doc_text', 'doc_topics', 'doc_topics_pd', 'gold_summary'])

In [94]:
# Stored as a comma-separated string of floats (see the output below).
data[0]['doc_topics_pd']

'0.7423522725775452, 0.24875628037352615, 0.0011118752154525915, 0.0011115325210887136, 0.0011114431054552103'

In [95]:
# Stored as a comma-separated string of integer topic ids (see the output below).
data[0]['doc_topics']

'3, 1, 7, 2, 8'

### Three different summaries

In [96]:
# Reference summary stored with the first loaded document.
data[0]['gold_summary']

"A United States Air Force F-111 fighter-bomber crashed at dawn today in Saudi Arabia. Both crew members were killed. The aircraft was assigned to the 48th Tactical Fighter Wing at Lakenheath in Britain, and was deployed to Saudi Arabia via Turkey as Operation Desert Shield progressed. The F-111 crash followed closely upon Monday's disappearance of two Marine Corps UH-1 Huey helicopters over the Arabian Sea, and the crash of an Air Force F-4 reconnaissance jet on the same day. Today's crash raises the number of Americans killed in Saudi Arabia to at least nine."

In [97]:
# Match-all query asking for up to 301 hits, this time from 'ducsummary'.
_text = {
    'size': 301,
    'query': {'match_all': {}},
}

# NOTE: rebinds `data` to the documents stored in 'ducsummary'.
summary_hits = es.search(index='ducsummary', doc_type='doc', body=_text)['hits']['hits']
data = [hit['_source'] for hit in summary_hits]

In [101]:
# Stored 'kl_summary' field of the first document read back from 'ducsummary'.
data[0]['kl_summary']

'The aircraft was an F-model, the latest version of the\n23-year-old swing-wing jet that first saw action in Vietnam.'

In [102]:
# Stored 'lda_summary' field of the first document read back from 'ducsummary'.
data[0]['lda_summary']

"van Sickle, a military spokesman, said the F-111\ncrashed in the ``southern Arabian peninsula'' while on a training\nmission and that the incident was under investigation."

In [1]:
from rouge import Rouge

In [27]:
def indexer(index, doc_type, body):
    """Run a search on the global ``es`` client and return each hit's ``_source``.

    Args:
        index: Name of the index to search.
        doc_type: Document type passed through to ``es.search``.
        body: Query body passed through to ``es.search``.

    Returns:
        A list with the ``_source`` entry of every returned hit, in hit order.
    """
    response = es.search(index=index, doc_type=doc_type, body=body)
    return [hit['_source'] for hit in response['hits']['hits']]

In [28]:
# Match-all query asking for up to 301 hits.
_text = {
    'size': 301,
    'query': {'match_all': {}},
}
# Documents (with their stored summaries) from the 'ducsummary' index.
duc_es = indexer('ducsummary', 'doc', _text)

In [30]:
# Pull the text, reference summary and both stored summaries of one document.
for d in duc_es:
    if d['doc_id'] == 'AP880217-0175':
        test = d['doc_text']
        ref = d['gold_summary']
        lda = d['lda_summary']
        kl = d['kl_summary']
        break  # first match is enough; no need to scan the rest
else:
    # Fail loudly instead of leaving test/ref/lda/kl undefined, or stale
    # from an earlier run of this cell.
    raise LookupError("document 'AP880217-0175' not found in duc_es")

In [34]:
# Two-sentence topic-distribution summary of the selected document.
hyp = topicsum(test, '', 2)

In [32]:
# Scorer reused by the three comparisons below.
rouge = Rouge()

In [35]:
# 'rouge-2' 'f' score of the topicsum summary (2 sentences) against the gold summary.
rouge.get_scores(hyp, ref)[0]['rouge-2']['f']


0.14012738372996894

In [36]:
# 'rouge-2' 'f' score of the stored lda_summary against the gold summary.
rouge.get_scores(lda, ref)[0]['rouge-2']['f']


0.3053435073970049

In [37]:
# 'rouge-2' 'f' score of the stored kl_summary against the gold summary.
rouge.get_scores(kl, ref)[0]['rouge-2']['f']


0.0312499960986333