# Extractive Summarization

#### Implement the KL-Sum summarization method for each dataset. Follow the ideas in this paper ; you are allowed to use libraries for text cleaning, segmentation into sentences, etc. Run it twice : 
- A) KL_summary based on words_PD; PD is a distribution proportional to counts of words in document 
- B) LDA_summary based on LDA topics_PD on obtained in PB2. The only difference is that PD, while still a distribution over words, is computed using topic modeling
- ES: Add two new fields to the document type, "KL_summary" and "LDA_summary" to store the obtained summaries. 

For DUC dataset evaluate KL_summaries and LDA_summaries against human gold summaries with ROUGE. ROUGE Perl package 

EXTRA CREDIT. KL Summarization: Can we make both PD and PS distributions over topics, instead of distributions over words? Would that help? 

In [84]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter
from elasticsearch import Elasticsearch
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from subprocess import check_output
from subprocess import check_call
import json

In [2]:
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
stop = stopwords.words('english') + list(string.punctuation)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stop_words = stopwords.words('english')

### KL Summarization over words_PD

In [3]:
def KL(p, q):
    result = 0
    for k, v in q.items():
        result += p[k] * np.log(p[k]/np.float32(q[k]))
    return result

In [4]:
def removeStopWords(input, stop_words):
    output = []
    for w in input:
        if w.lower() not in stop_words:
            output.append(w)
    return output

In [5]:
def relativeFrequency(input):
    input_frequency = Counter(input)
    len_input = len(input)
    for k, v in input_frequency.items():
        input_frequency[k] = v/np.float32(len_input)
    return input_frequency

In [6]:
def jointFrequency(sentence, summ):
    [summ.append(s) for s in sentence]
    return relativeFrequency(summ)

In [7]:
def get_summary_as_words(input, stop_words):
    if len(input) == 0:
        return input
    clean_input_sentences_words = [removeStopWords(word_tokenize(i.translate(string.punctuation)), stop_words) for i in input] 
    input_flat = [ii for i in clean_input_sentences_words for ii in i]
    return input_flat

In [8]:
def klsum(document, L = 5):
    
    document_as_sentences = tokenizer.tokenize(document)
    clean_input_words = removeStopWords(word_tokenize(document.translate(string.punctuation)), stop_words)
    clean_input_sentences_words = [removeStopWords(word_tokenize(i.translate(string.punctuation)), stop_words) for i in document_as_sentences]
    clean_input_word_frequency = relativeFrequency(clean_input_words)
    
    i = 0
    summary = []
    score = {}
    while(i < L and len(document_as_sentences) > 0):
        kl_list = []
        clean_summary_words = get_summary_as_words(summary, stop_words)
        for s in clean_input_sentences_words:
            joint_frequency = jointFrequency(s, clean_summary_words[:])
            kl_list.append(KL(clean_input_word_frequency, joint_frequency))
        best = kl_list.index(min(kl_list))
        summary.append(document_as_sentences[best])
        del document_as_sentences[best]
        del clean_input_sentences_words[best]
        i += 1
    return summary

In [9]:
# Test for one document
klsum(documents[0])

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


['The U.S. media as a whole seem to try to\nignore them.',
 'What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation.',
 'The U.S. media is\nthe most pro-israeli media in the world.',
 'That is rediculous.',
 "Well i'm not sure about the story nad it did seem biased."]

In [10]:
# Connect to Elastic search and add KL Summary
es = Elasticsearch([{'host': '192.168.99.100', 'port': 9200}])

In [16]:
# Try getting data from the ES source and summarizing it
true = es.get(index="ng_data", doc_type='test', id=1)['_source']['doc_text']
summ = klsum(true)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [27]:
true

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----"

In [28]:
summ

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small."

#### Add KL Summaries for every 20NG document

In [31]:
print('Pushing KL Summaries for each 20NG document')

for i in range(len(documents)):
    id_i = i+1
    true = es.get(index="ng_data", doc_type='test', id=id_i)['_source']['doc_text']
    summ = klsum(true)
    summ = ' '.join(summ)
    es.update(index='ng_data', doc_type='test', id=id_i,\
              body={'doc':{'KL_Summary':summ}})
    if i%1000 == 0: print(i,'thousandth summary pushed.')

Pushing KL Summaries for each 20NG document
0 thousandth summary pushed.


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


1000 thousandth summary pushed.
2000 thousandth summary pushed.
3000 thousandth summary pushed.
4000 thousandth summary pushed.
5000 thousandth summary pushed.
6000 thousandth summary pushed.
7000 thousandth summary pushed.
8000 thousandth summary pushed.
9000 thousandth summary pushed.
10000 thousandth summary pushed.
11000 thousandth summary pushed.


#### Add KL Summaries for every DUC document

In [33]:
print('Pushing KL Summaries for each DUC document')

for i in range(308):
    id_i = i+1
    true = es.get(index="duc_data", doc_type='test', id=id_i)['_source']['doc_text']
    summ = klsum(true)
    summ = ' '.join(summ)
    es.update(index='duc_data', doc_type='test', id=id_i,\
              body={'doc':{'KL_Summary':summ}})
    if i%100 == 0: print(i,'hundredth summary pushed.')

Pushing KL Summaries for each DUC document


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


0 hundredth summary pushed.
100 hundredth summary pushed.
200 hundredth summary pushed.
300 hundredth summary pushed.


### LDA Summarization over document-topic distribution

In [42]:
def ldasum(input, input_frequency, L = 5):
    
    def KL(p, q):
        result = 0
        for k, v in q.items():
            if k in p:
                result += p[k] * np.log(p[k]/np.float32(q[k]))
        return result
    
    def removeStopWords(input, stop_words):
        output = []
        for w in input:
            if w.lower() not in stop_words:
                output.append(w)
        return output
    
    def relativeFrequency(input):
        input_frequency = Counter(input)
        len_input = len(input)
        for k, v in input_frequency.items():
            input_frequency[k] = v/np.float32(len_input)
        return input_frequency
    
    def jointFrequency(sentence, summ):
        [summ.append(s) for s in sentence]
        return relativeFrequency(summ)
    
    def get_summary_as_words(input, stop_words):
        if len(input) == 0:
            return input
        clean_input_sentences_words = [removeStopWords(word_tokenize(i.translate(string.punctuation)), stop_words) for i in input] 
        input_flat = [ii for i in clean_input_sentences_words for ii in i]
        return input_flat
    
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    stop_words = stopwords.words('english')
    input_sentences = tokenizer.tokenize(input)
    clean_input_words = removeStopWords(word_tokenize(input.translate(string.punctuation)), stop_words)
    clean_input_sentences_words = [removeStopWords(word_tokenize(i.translate(string.punctuation)), stop_words) for i in input_sentences]
    # -- difference between lda and kl
    # clean_input_word_frequency = relativeFrequency(clean_input_words)
    clean_input_word_frequency = input_frequency
    
    i = 0
    summary = []
    score = {}
    while(i < L and len(input_sentences) > 0):
        kl_list = []
        clean_summary_words = get_summary_as_words(summary, stop_words)
        for s in clean_input_sentences_words:
            joint_frequency = jointFrequency(s, clean_summary_words[:])
            kl_list.append(KL(clean_input_word_frequency, joint_frequency))
        best = kl_list.index(min(kl_list))
        summary.append(input_sentences[best])
        del input_sentences[best]
        del clean_input_sentences_words[best]
        i += 1
    return summary

#### Create LDA dcoument-to-topic PDF matrix for 20NG

In [43]:
tf_vectorizer_20ng = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf_20ng = tf_vectorizer_20ng.fit_transform(documents)
def run_lda(no_topics, tf):
    # Run LDA
    lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
    
    return lda
lda_20 = run_lda(20, tf_20ng)
doc_topic_20ng = lda_20.transform(tf_20ng)

In [44]:
ldasum(documents[0], doc_topic_20ng[0,:])

  


["Well i'm not sure about the story nad it did seem biased.",
 'What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation.',
 'That is rediculous.',
 'The U.S. media is\nthe most pro-israeli media in the world.',
 'Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured.']

#### Add LDA Summaries for every 20NG document

In [46]:
print('Pushing LDA Summaries for each 20NG document')

for i in range(len(documents)):
    id_i = i+1
    true = es.get(index="ng_data", doc_type='test', id=id_i)['_source']['doc_text']
    summ = ldasum(true, doc_topic_20ng[i, :])
    summ = ' '.join(summ)
    es.update(index='ng_data', doc_type='test', id=id_i,\
              body={'doc':{'LDA_Summary':summ}})
    if i%1000 == 0: print(i,'thousandth summary pushed.')

Pushing LDA Summaries for each 20NG document
0 thousandth summary pushed.


  


1000 thousandth summary pushed.
2000 thousandth summary pushed.
3000 thousandth summary pushed.
4000 thousandth summary pushed.
5000 thousandth summary pushed.
6000 thousandth summary pushed.
7000 thousandth summary pushed.
8000 thousandth summary pushed.
9000 thousandth summary pushed.
10000 thousandth summary pushed.
11000 thousandth summary pushed.


#### Create LDA dcoument-to-topic PDF matrix for DUC

In [50]:
duc_doc = []
for i in range(308):
    duc_doc.append(es.get(index="duc_data", doc_type='test', id=i+1)['_source']['doc_text'])

tf_vectorizer_duc = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf_duc = tf_vectorizer_duc.fit_transform(duc_doc)
def run_lda(no_topics, tf):
    # Run LDA
    lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
    
    return lda
lda_duc = run_lda(20, tf_duc)
doc_topic_duc = lda_duc.transform(tf_duc)

#### Add LDA Summaries for every DUC document

In [53]:
print('Pushing LDA Summaries for each DUC document')

for i in range(308):
    id_i = i+1
    true = es.get(index="duc_data", doc_type='test', id=id_i)['_source']['doc_text']
    summ = ldasum(true, doc_topic_duc[i, :])
    summ = ' '.join(summ)
    es.update(index='duc_data', doc_type='test', id=id_i,\
              body={'doc':{'LDA_Summary':summ}})
    if i%100 == 0: print(i,'hundredth summary pushed.')

Pushing LDA Summaries for each DUC document
0 hundredth summary pushed.


  


100 hundredth summary pushed.
200 hundredth summary pushed.
300 hundredth summary pushed.


### Rouge evaluation

In [68]:
hypothesis = "foo"
reference = "bar"

In [88]:
cmnd = ["rouge.exe", hypothesis, reference]

In [89]:
import subprocess
try:
    output = subprocess.check_output(
        cmnd, stderr=subprocess.STDOUT, shell=True, timeout=3,
        universal_newlines=True)
except subprocess.CalledProcessError as exc:
    print("Status : FAIL", exc.returncode, exc.output)
else:
    print("Output: \n{}\n".format(output))

Status : FAIL 1 


In [93]:
es.get(index="duc_data", doc_type='test', id=id_i)['_source']

{'KL_Summary': 'One\nletter writer who also knows something about the\nConstitution, though disagreeing with me on another point,\nagreed on the central issue, that regulation of gun ownership\nand use does not violate the Constitution. Another letter writer thinks I have confused the issue by\ncalling the duty to serve in the militia a right to serve,\nbut if he will look at the debates in the First Congress he\nwill see that there is no confusion at all, that the authors\nof the Second Amendment were talking about the right to\nserve. The real issue is that the Second Amendment addressed a\nserious public concern, to protect the right of citizens to\nserve as defenders of the community in times of peril, not\nthe personal uses of guns. But my main point was to show how far Congress can go in\ngun-control legislation without exceeding its constitutional\npowers, using as a model the legislation of the Second\nCongress, which included most of the authors of the Second\nAmendment. In th

In [113]:
# Rouge scores for KL Summary
for i in range(308):
    id_i = i+1
    kl_summ = es.get(index="duc_data", doc_type='test', id=id_i)['_source']['KL_Summary']
    gold_summ = es.get(index="duc_data", doc_type='test', id=id_i)['_source']['gold_summary']
    ops = check_output(["rouge.exe", kl_summ, gold_summ])
    print(json.loads(ops.decode('utf-8'))[0])

{'rouge-1': {'f': 0.1907356930974319, 'p': 1.0, 'r': 0.10542168674698796}, 'rouge-2': {'f': 0.12406947768671826, 'p': 0.9493670886075949, 'r': 0.06637168141592921}, 'rouge-l': {'f': 0.044467756263833494, 'p': 0.7160493827160493, 'r': 0.04430863254392666}}
{'rouge-1': {'f': 0.7535211219837333, 'p': 0.9553571428571429, 'r': 0.622093023255814}, 'rouge-2': {'f': 0.6635513973163596, 'p': 0.9466666666666667, 'r': 0.5107913669064749}, 'rouge-l': {'f': 0.48189499239126865, 'p': 0.9683544303797469, 'r': 0.43714285714285717}}
{'rouge-1': {'f': 0.36862744787574014, 'p': 0.9591836734693877, 'r': 0.22815533980582525}, 'rouge-2': {'f': 0.3055555528285699, 'p': 0.937984496124031, 'r': 0.18250377073906485}, 'rouge-l': {'f': 0.14393606017745403, 'p': 0.8161764705882353, 'r': 0.14050632911392405}}
{'rouge-1': {'f': 0.5058823490209343, 'p': 0.945054945054945, 'r': 0.3453815261044177}, 'rouge-2': {'f': 0.3892100159495993, 'p': 0.926605504587156, 'r': 0.24634146341463414}, 'rouge-l': {'f': 0.23185854665629

{'rouge-1': {'f': 0.33027522650923746, 'p': 0.96, 'r': 0.1994459833795014}, 'rouge-2': {'f': 0.2384500722587237, 'p': 0.9195402298850575, 'r': 0.136986301369863}, 'rouge-l': {'f': 0.09039587254895673, 'p': 0.6404494382022472, 'r': 0.08892355694227769}}
{'rouge-1': {'f': 0.02173913021975426, 'p': 0.01098901098901099, 'r': 1.0}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.00862132478951945, 'p': 0.008620689655172414, 'r': 1.0}}
{'rouge-1': {'f': 0.3090909064152893, 'p': 0.9714285714285714, 'r': 0.1837837837837838}, 'rouge-2': {'f': 0.20783132327406015, 'p': 0.9078947368421053, 'r': 0.11734693877551021}, 'rouge-l': {'f': 0.07410094775475212, 'p': 0.6363636363636364, 'r': 0.07324364723467862}}
{'rouge-1': {'f': 0.3462686538143907, 'p': 0.9830508474576272, 'r': 0.21014492753623187}, 'rouge-2': {'f': 0.23941067915373634, 'p': 0.9285714285714286, 'r': 0.13742071881606766}, 'rouge-l': {'f': 0.09589786221985412, 'p': 0.7162162162162162, 'r': 0.0944741532976827}}
{'rouge-1': {'

{'rouge-1': {'f': 0.21192052785469936, 'p': 0.9696969696969697, 'r': 0.11895910780669144}, 'rouge-2': {'f': 0.1371308003020572, 'p': 0.9154929577464789, 'r': 0.07411630558722919}, 'rouge-l': {'f': 0.05361720917880675, 'p': 0.7397260273972602, 'r': 0.0533596837944664}}
{'rouge-1': {'f': 0.5449438161362202, 'p': 0.9509803921568627, 'r': 0.38188976377952755}, 'rouge-2': {'f': 0.4682395606919609, 'p': 0.9416058394160584, 'r': 0.3115942028985507}, 'rouge-l': {'f': 0.32831508046695845, 'p': 0.9655172413793104, 'r': 0.3076923076923077}}
{'rouge-1': {'f': 0.36904761598836294, 'p': 0.9789473684210527, 'r': 0.2273838630806846}, 'rouge-2': {'f': 0.280052837693391, 'p': 0.9724770642201835, 'r': 0.16358024691358025}, 'rouge-l': {'f': 0.14876012806813005, 'p': 0.9727272727272728, 'r': 0.14597544338335608}}
{'rouge-1': {'f': 0.4421052596849031, 'p': 0.9882352941176471, 'r': 0.2847457627118644}, 'rouge-2': {'f': 0.36297640347113486, 'p': 0.9615384615384616, 'r': 0.22371364653243847}, 'rouge-l': {'f': 

{'rouge-1': {'f': 0.49411764318462137, 'p': 0.9402985074626866, 'r': 0.3351063829787234}, 'rouge-2': {'f': 0.36974789590628415, 'p': 0.9041095890410958, 'r': 0.2323943661971831}, 'rouge-l': {'f': 0.2054156067430515, 'p': 0.918918918918919, 'r': 0.19825072886297376}}
{'rouge-1': {'f': 0.41916167323084375, 'p': 0.958904109589041, 'r': 0.2681992337164751}, 'rouge-2': {'f': 0.32926828982459194, 'p': 0.9529411764705882, 'r': 0.19901719901719903}, 'rouge-l': {'f': 0.16905421089577574, 'p': 0.9534883720930233, 'r': 0.16498993963782696}}
{'rouge-1': {'f': 0.649999995528125, 'p': 0.9629629629629629, 'r': 0.49056603773584906}, 'rouge-2': {'f': 0.5527426118886752, 'p': 0.9424460431654677, 'r': 0.39104477611940297}, 'rouge-l': {'f': 0.3576365657791168, 'p': 0.9574468085106383, 'r': 0.33251231527093594}}
{'rouge-1': {'f': 0.3582089521886835, 'p': 0.96, 'r': 0.22018348623853212}, 'rouge-2': {'f': 0.2706766892446656, 'p': 0.9310344827586207, 'r': 0.15835777126099707}, 'rouge-l': {'f': 0.1459397731969

{'rouge-1': {'f': 0.2740157456439457, 'p': 0.9886363636363636, 'r': 0.1590493601462523}, 'rouge-2': {'f': 0.1946740110396899, 'p': 0.9636363636363636, 'r': 0.10827374872318693}, 'rouge-l': {'f': 0.04577922907291883, 'p': 0.5132743362831859, 'r': 0.045454545454545456}}
{'rouge-1': {'f': 0.33766233271630974, 'p': 0.3058823529411765, 'r': 0.37681159420289856}, 'rouge-2': {'f': 0.14213197475224837, 'p': 0.12844036697247707, 'r': 0.1590909090909091}, 'rouge-l': {'f': 0.181323692266936, 'p': 0.16666666666666666, 'r': 0.2111111111111111}}
{'rouge-1': {'f': 0.55555555154321, 'p': 1.0, 'r': 0.38461538461538464}, 'rouge-2': {'f': 0.4565587698276979, 'p': 0.9710144927536232, 'r': 0.2984409799554566}, 'rouge-l': {'f': 0.2807776817773625, 'p': 0.9668874172185431, 'r': 0.2664233576642336}}
{'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}
{'rouge-1': {'f': 0.14285714137126182, 'p': 0.8837209302325582, 'r': 0.077709611451

{'rouge-1': {'f': 0.2940226145936565, 'p': 0.9891304347826086, 'r': 0.17267552182163187}, 'rouge-2': {'f': 0.22649572440520221, 'p': 0.954954954954955, 'r': 0.12848484848484848}, 'rouge-l': {'f': 0.07679198068613141, 'p': 0.584070796460177, 'r': 0.07568807339449542}}
{'rouge-1': {'f': 0.37436761913328354, 'p': 0.9652173913043478, 'r': 0.23221757322175732}, 'rouge-2': {'f': 0.26579520449387933, 'p': 0.9172932330827067, 'r': 0.1554140127388535}, 'rouge-l': {'f': 0.10338739888776802, 'p': 0.6592592592592592, 'r': 0.10136674259681093}}
{'rouge-1': {'f': 0.42592592246913585, 'p': 0.9583333333333334, 'r': 0.27380952380952384}, 'rouge-2': {'f': 0.35168195415163334, 'p': 0.9426229508196722, 'r': 0.2161654135338346}, 'rouge-l': {'f': 0.16192873512653083, 'p': 0.7580645161290323, 'r': 0.15666666666666668}}
{'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}
{'rouge-1': {'f': 0.6829268246281975, 'p': 0.9333333333333333,

{'rouge-1': {'f': 0.09642301605025046, 'p': 0.8611111111111112, 'r': 0.051070840197693576}, 'rouge-2': {'f': 0.057197330075308916, 'p': 0.7692307692307693, 'r': 0.0297029702970297}, 'rouge-l': {'f': 0.03123252765059312, 'p': 0.875, 'r': 0.031194295900178252}}
{'rouge-1': {'f': 0.03394255833770767, 'p': 0.8125, 'r': 0.017333333333333333}, 'rouge-2': {'f': 0.012931034269563766, 'p': 0.6, 'r': 0.006535947712418301}, 'rouge-l': {'f': 0.0060501680778789, 'p': 0.625, 'r': 0.006049606775559589}}
{'rouge-1': {'f': 0.40718562536053643, 'p': 0.9444444444444444, 'r': 0.2595419847328244}, 'rouge-2': {'f': 0.31627906691790164, 'p': 0.918918918918919, 'r': 0.19101123595505617}, 'rouge-l': {'f': 0.15315250403638891, 'p': 0.9473684210526315, 'r': 0.15}}
{'rouge-1': {'f': 0.18487394783383945, 'p': 0.9565217391304348, 'r': 0.10232558139534884}, 'rouge-2': {'f': 0.12953367738882254, 'p': 0.8771929824561403, 'r': 0.06993006993006994}, 'rouge-l': {'f': 0.05472037789785209, 'p': 0.7166666666666667, 'r': 0.0

{'rouge-1': {'f': 0.058968058326672665, 'p': 0.8888888888888888, 'r': 0.030495552731893267}, 'rouge-2': {'f': 0.028629856476270272, 'p': 0.75, 'r': 0.014593467685892982}, 'rouge-l': {'f': 0.013694483624471092, 'p': 0.7586206896551724, 'r': 0.013690105787181083}}
{'rouge-1': {'f': 0.22790697458290968, 'p': 0.9245283018867925, 'r': 0.129973474801061}, 'rouge-2': {'f': 0.1587301571009206, 'p': 0.8870967741935484, 'r': 0.08716323296354993}, 'rouge-l': {'f': 0.060180446804944915, 'p': 0.6666666666666666, 'r': 0.059743954480796585}}
{'rouge-1': {'f': 0.5552050433078248, 'p': 1.0, 'r': 0.38427947598253276}, 'rouge-2': {'f': 0.451476789639837, 'p': 0.9553571428571429, 'r': 0.2955801104972376}, 'rouge-l': {'f': 0.1758525206963592, 'p': 0.6347826086956522, 'r': 0.16743119266055045}}
{'rouge-1': {'f': 0.043010752216007635, 'p': 0.8888888888888888, 'r': 0.02203856749311295}, 'rouge-2': {'f': 0.019886363370119207, 'p': 0.7368421052631579, 'r': 0.010079193664506839}, 'rouge-l': {'f': 0.0087404212208

{'rouge-1': {'f': 0.15946843690511145, 'p': 0.8888888888888888, 'r': 0.08759124087591241}, 'rouge-2': {'f': 0.11513859136937912, 'p': 0.7714285714285715, 'r': 0.06221198156682028}, 'rouge-l': {'f': 0.05985360852149261, 'p': 0.6521739130434783, 'r': 0.0594059405940594}}
{'rouge-1': {'f': 0.26161789779743516, 'p': 0.95, 'r': 0.15169660678642716}, 'rouge-2': {'f': 0.19782608496807658, 'p': 0.883495145631068, 'r': 0.11138310893512852}, 'rouge-l': {'f': 0.06906847084289143, 'p': 0.5688073394495413, 'r': 0.06820682068206821}}
{'rouge-1': {'f': 0.45768024715401784, 'p': 0.9605263157894737, 'r': 0.3004115226337449}, 'rouge-2': {'f': 0.3597430374698404, 'p': 0.8936170212765957, 'r': 0.225201072386059}, 'rouge-l': {'f': 0.17876750543804104, 'p': 0.7, 'r': 0.17114914425427874}}
{'rouge-1': {'f': 0.1743772224622282, 'p': 0.9074074074074074, 'r': 0.09645669291338582}, 'rouge-2': {'f': 0.10998877537993729, 'p': 0.8032786885245902, 'r': 0.05903614457831325}, 'rouge-l': {'f': 0.04257582156086896, 'p':

{'rouge-1': {'f': 0.26890756054630677, 'p': 0.927536231884058, 'r': 0.15724815724815724}, 'rouge-2': {'f': 0.20202019997792814, 'p': 0.875, 'r': 0.11419249592169657}, 'rouge-l': {'f': 0.10918522296434745, 'p': 0.9012345679012346, 'r': 0.10782865583456426}}
{'rouge-1': {'f': 0.06633906567477016, 'p': 0.9642857142857143, 'r': 0.03435114503816794}, 'rouge-2': {'f': 0.037119524448765415, 'p': 0.8620689655172413, 'r': 0.01896813353566009}, 'rouge-l': {'f': 0.011843498999042064, 'p': 0.5666666666666667, 'r': 0.011838440111420613}}
{'rouge-1': {'f': 0.23400935823686175, 'p': 0.9615384615384616, 'r': 0.13321492007104796}, 'rouge-2': {'f': 0.15652173755851478, 'p': 0.9101123595505618, 'r': 0.08562367864693446}, 'rouge-l': {'f': 0.064380425067782, 'p': 0.7282608695652174, 'r': 0.06393129770992366}}
{'rouge-1': {'f': 0.6618704990756172, 'p': 0.9583333333333334, 'r': 0.5054945054945055}, 'rouge-2': {'f': 0.6220095649713835, 'p': 0.948905109489051, 'r': 0.4626334519572954}, 'rouge-l': {'f': 0.42325

In [115]:
# Rouge scores for LDA Summary
for i in range(308):
    id_i = i+1
    lda_summ = es.get(index="duc_data", doc_type='test', id=id_i)['_source']['LDA_Summary']
    gold_summ = es.get(index="duc_data", doc_type='test', id=id_i)['_source']['gold_summary']
    ops = check_output(["rouge.exe", lda_summ, gold_summ])
    print(json.loads(ops.decode('utf-8'))[0])

{'rouge-1': {'f': 0.2671854710207406, 'p': 0.9626168224299065, 'r': 0.15512048192771086}, 'rouge-2': {'f': 0.20662460373846894, 'p': 0.9492753623188406, 'r': 0.11592920353982301}, 'rouge-l': {'f': 0.1049561381947785, 'p': 0.9714285714285714, 'r': 0.1038961038961039}}
{'rouge-1': {'f': 0.7535211219837333, 'p': 0.9553571428571429, 'r': 0.622093023255814}, 'rouge-2': {'f': 0.6635513973163596, 'p': 0.9466666666666667, 'r': 0.5107913669064749}, 'rouge-l': {'f': 0.48189499239126865, 'p': 0.9683544303797469, 'r': 0.43714285714285717}}
{'rouge-1': {'f': 0.3478260839313222, 'p': 0.9361702127659575, 'r': 0.21359223300970873}, 'rouge-2': {'f': 0.29620252894725213, 'p': 0.9212598425196851, 'r': 0.17647058823529413}, 'rouge-l': {'f': 0.1672250847945494, 'p': 0.9555555555555556, 'r': 0.16329113924050634}}
{'rouge-1': {'f': 0.5058823490209343, 'p': 0.945054945054945, 'r': 0.3453815261044177}, 'rouge-2': {'f': 0.3892100159495993, 'p': 0.926605504587156, 'r': 0.24634146341463414}, 'rouge-l': {'f': 0.23

{'rouge-1': {'f': 0.28235293861824223, 'p': 0.9375, 'r': 0.16620498614958448}, 'rouge-2': {'f': 0.20668692809489939, 'p': 0.918918918918919, 'r': 0.11643835616438356}, 'rouge-l': {'f': 0.1121012418219602, 'p': 0.9466666666666667, 'r': 0.11076443057722309}}
{'rouge-1': {'f': 0.02173913021975426, 'p': 0.01098901098901099, 'r': 1.0}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.00862132478951945, 'p': 0.008620689655172414, 'r': 1.0}}
{'rouge-1': {'f': 0.3153153125375376, 'p': 0.9459459459459459, 'r': 0.1891891891891892}, 'rouge-2': {'f': 0.25806451375237577, 'p': 0.9361702127659575, 'r': 0.14965986394557823}, 'rouge-l': {'f': 0.13837059813344543, 'p': 0.9578947368421052, 'r': 0.13602391629297458}}
{'rouge-1': {'f': 0.4573002718375339, 'p': 0.9540229885057471, 'r': 0.3007246376811594}, 'rouge-2': {'f': 0.34305317018026477, 'p': 0.9090909090909091, 'r': 0.21141649048625794}, 'rouge-l': {'f': 0.20461097197226666, 'p': 0.9487179487179487, 'r': 0.19786096256684493}}
{'rouge-1'

{'rouge-1': {'f': 0.2287581678087488, 'p': 0.9459459459459459, 'r': 0.13011152416356878}, 'rouge-2': {'f': 0.16580310715077454, 'p': 0.9090909090909091, 'r': 0.09122006841505131}, 'rouge-l': {'f': 0.08360163131509506, 'p': 0.9333333333333333, 'r': 0.08300395256916997}}
{'rouge-1': {'f': 0.5449438161362202, 'p': 0.9509803921568627, 'r': 0.38188976377952755}, 'rouge-2': {'f': 0.4682395606919609, 'p': 0.9416058394160584, 'r': 0.3115942028985507}, 'rouge-l': {'f': 0.32831508046695845, 'p': 0.9655172413793104, 'r': 0.3076923076923077}}
{'rouge-1': {'f': 0.36904761598836294, 'p': 0.9789473684210527, 'r': 0.2273838630806846}, 'rouge-2': {'f': 0.280052837693391, 'p': 0.9724770642201835, 'r': 0.16358024691358025}, 'rouge-l': {'f': 0.14876012806813005, 'p': 0.9727272727272728, 'r': 0.14597544338335608}}
{'rouge-1': {'f': 0.38147138649184426, 'p': 0.9722222222222222, 'r': 0.23728813559322035}, 'rouge-2': {'f': 0.29111530928963236, 'p': 0.9390243902439024, 'r': 0.17225950782997762}, 'rouge-l': {'f

{'rouge-1': {'f': 0.09538461442078107, 'p': 0.9393939393939394, 'r': 0.050243111831442464}, 'rouge-2': {'f': 0.05970149192310371, 'p': 0.9411764705882353, 'r': 0.030828516377649325}, 'rouge-l': {'f': 0.028253802489309792, 'p': 0.9428571428571428, 'r': 0.028229255774165955}}
{'rouge-1': {'f': 0.5250965211168588, 'p': 0.9577464788732394, 'r': 0.3617021276595745}, 'rouge-2': {'f': 0.41208790865837464, 'p': 0.9375, 'r': 0.2640845070422535}, 'rouge-l': {'f': 0.23392830552633426, 'p': 0.9506172839506173, 'r': 0.22448979591836735}}
{'rouge-1': {'f': 0.41916167323084375, 'p': 0.958904109589041, 'r': 0.2681992337164751}, 'rouge-2': {'f': 0.32926828982459194, 'p': 0.9529411764705882, 'r': 0.19901719901719903}, 'rouge-l': {'f': 0.16905421089577574, 'p': 0.9534883720930233, 'r': 0.16498993963782696}}
{'rouge-1': {'f': 0.649999995528125, 'p': 0.9629629629629629, 'r': 0.49056603773584906}, 'rouge-2': {'f': 0.5527426118886752, 'p': 0.9424460431654677, 'r': 0.39104477611940297}, 'rouge-l': {'f': 0.357

{'rouge-1': {'f': 0.33578431072484144, 'p': 0.9133333333333333, 'r': 0.2057057057057057}, 'rouge-2': {'f': 0.23918918688138244, 'p': 0.8984771573604061, 'r': 0.13795791114575215}, 'rouge-l': {'f': 0.12847530517476846, 'p': 0.9142857142857143, 'r': 0.12639894667544438}}
{'rouge-1': {'f': 0.2012578593890669, 'p': 0.7804878048780488, 'r': 0.11552346570397112}, 'rouge-2': {'f': 0.14827586038026164, 'p': 0.7962962962962963, 'r': 0.0817490494296578}, 'rouge-l': {'f': 0.07603598063134985, 'p': 0.8198198198198198, 'r': 0.07545605306799337}}
{'rouge-1': {'f': 0.3847283375144525, 'p': 0.9776119402985075, 'r': 0.23948811700182815}, 'rouge-2': {'f': 0.3202725696318445, 'p': 0.9641025641025641, 'r': 0.1920326864147089}, 'rouge-l': {'f': 0.1918812112488956, 'p': 0.9875518672199171, 'r': 0.1865203761755486}}
{'rouge-1': {'f': 0.2631578901113574, 'p': 0.2066115702479339, 'r': 0.36231884057971014}, 'rouge-2': {'f': 0.065040645811356, 'p': 0.05063291139240506, 'r': 0.09090909090909091}, 'rouge-l': {'f':

{'rouge-1': {'f': 0.37453183200150103, 'p': 0.9345794392523364, 'r': 0.234192037470726}, 'rouge-2': {'f': 0.2987341744997276, 'p': 0.921875, 'r': 0.1782477341389728}, 'rouge-l': {'f': 0.1809789719911671, 'p': 0.9457364341085271, 'r': 0.17604617604617603}}
{'rouge-1': {'f': 0.3206106842536129, 'p': 0.9655172413793104, 'r': 0.19221967963386727}, 'rouge-2': {'f': 0.25313282977644613, 'p': 0.9528301886792453, 'r': 0.14595375722543352}, 'rouge-l': {'f': 0.14170548673049677, 'p': 0.9629629629629629, 'r': 0.1392235609103079}}
{'rouge-1': {'f': 0.32653060938760725, 'p': 0.9454545454545454, 'r': 0.19734345351043645}, 'rouge-2': {'f': 0.2554973798470437, 'p': 0.9384615384615385, 'r': 0.1478787878787879}, 'rouge-l': {'f': 0.14850339562985776, 'p': 0.9548872180451128, 'r': 0.14564220183486237}}
{'rouge-1': {'f': 0.3071553200960013, 'p': 0.9263157894736842, 'r': 0.18410041841004185}, 'rouge-2': {'f': 0.21548821339193158, 'p': 0.9056603773584906, 'r': 0.12229299363057325}, 'rouge-l': {'f': 0.1153779

{'rouge-1': {'f': 0.22784809914124277, 'p': 0.9428571428571428, 'r': 0.12958115183246074}, 'rouge-2': {'f': 0.16905443958338806, 'p': 0.9147286821705426, 'r': 0.0931333859510655}, 'rouge-l': {'f': 0.069015473128603, 'p': 0.9548872180451128, 'r': 0.06868577609518658}}
{'rouge-1': {'f': 0.2727272702808706, 'p': 0.9557522123893806, 'r': 0.15905743740795286}, 'rouge-2': {'f': 0.21536144375598784, 'p': 0.9407894736842105, 'r': 0.12159863945578231}, 'rouge-l': {'f': 0.11667476310859837, 'p': 0.9681528662420382, 'r': 0.1152388172858226}}
{'rouge-1': {'f': 0.2736248212319941, 'p': 0.9509803921568627, 'r': 0.15980230642504117}, 'rouge-2': {'f': 0.22164048656439128, 'p': 0.9338235294117647, 'r': 0.12574257425742574}, 'rouge-l': {'f': 0.11918411580460116, 'p': 0.9635036496350365, 'r': 0.11764705882352941}}
{'rouge-1': {'f': 0.24242424022364586, 'p': 0.9629629629629629, 'r': 0.13866666666666666}, 'rouge-2': {'f': 0.1729372920735353, 'p': 0.9492753623188406, 'r': 0.09513435003631082}, 'rouge-l': {'

{'rouge-1': {'f': 0.21914008120182904, 'p': 0.9634146341463414, 'r': 0.12363067292644757}, 'rouge-2': {'f': 0.15607401297457882, 'p': 0.9509803921568627, 'r': 0.08501314636283962}, 'rouge-l': {'f': 0.08024289256792118, 'p': 0.9714285714285714, 'r': 0.0797498045347928}}
{'rouge-1': {'f': 0.5874125831793732, 'p': 0.9655172413793104, 'r': 0.4221105527638191}, 'rouge-2': {'f': 0.48815165495923274, 'p': 0.9537037037037037, 'r': 0.32802547770700635}, 'rouge-l': {'f': 0.3030914528763508, 'p': 0.9736842105263158, 'r': 0.2860824742268041}}
{'rouge-1': {'f': 0.33160621478160496, 'p': 0.9696969696969697, 'r': 0.2}, 'rouge-2': {'f': 0.26795283790168517, 'p': 0.9615384615384616, 'r': 0.15566625155666253}, 'rouge-l': {'f': 0.15243018325433555, 'p': 0.9772727272727273, 'r': 0.14947856315179606}}
{'rouge-1': {'f': 0.17129629467273771, 'p': 0.961038961038961, 'r': 0.0940279542566709}, 'rouge-2': {'f': 0.11103853577437509, 'p': 0.9239130434782609, 'r': 0.05906879777623349}, 'rouge-l': {'f': 0.0561853575

{'rouge-1': {'f': 0.2087542067267513, 'p': 0.9117647058823529, 'r': 0.11787072243346007}, 'rouge-2': {'f': 0.1501632193028094, 'p': 0.8625, 'r': 0.08224076281287247}, 'rouge-l': {'f': 0.07918969382828066, 'p': 0.8488372093023255, 'r': 0.07857911733046287}}
{'rouge-1': {'f': 0.30866807335571167, 'p': 0.9358974358974359, 'r': 0.1848101265822785}, 'rouge-2': {'f': 0.23562412109188408, 'p': 0.875, 'r': 0.13614262560777957}, 'rouge-l': {'f': 0.13630369143634555, 'p': 0.8490566037735849, 'r': 0.13353115727002968}}
{'rouge-1': {'f': 0.26614481158543357, 'p': 0.9066666666666666, 'r': 0.1559633027522936}, 'rouge-2': {'f': 0.21304926547314634, 'p': 0.8602150537634409, 'r': 0.12158054711246201}, 'rouge-l': {'f': 0.1211116572175291, 'p': 0.8285714285714286, 'r': 0.11901504787961696}}
{'rouge-1': {'f': 0.3212121183941231, 'p': 0.9464285714285714, 'r': 0.19343065693430658}, 'rouge-2': {'f': 0.22619047379850088, 'p': 0.8142857142857143, 'r': 0.1313364055299539}, 'rouge-l': {'f': 0.12697575773802972, 

{'rouge-1': {'f': 0.49577464402047217, 'p': 0.946236559139785, 'r': 0.33587786259541985}, 'rouge-2': {'f': 0.4166666630385488, 'p': 0.875, 'r': 0.2734375}, 'rouge-l': {'f': 0.2929762737014248, 'p': 0.8951612903225806, 'r': 0.27543424317617865}}
{'rouge-1': {'f': 0.5441696072519322, 'p': 0.9625, 'r': 0.3793103448275862}, 'rouge-2': {'f': 0.4498777469419719, 'p': 0.9292929292929293, 'r': 0.2967741935483871}, 'rouge-l': {'f': 0.30972030216994006, 'p': 0.9509803921568627, 'r': 0.2912912912912913}}
{'rouge-1': {'f': 0.26890756054630677, 'p': 0.927536231884058, 'r': 0.15724815724815724}, 'rouge-2': {'f': 0.20202019997792814, 'p': 0.875, 'r': 0.11419249592169657}, 'rouge-l': {'f': 0.10918522296434745, 'p': 0.9012345679012346, 'r': 0.10782865583456426}}
{'rouge-1': {'f': 0.12765957315024395, 'p': 0.9, 'r': 0.06870229007633588}, 'rouge-2': {'f': 0.09612625434926073, 'p': 0.881578947368421, 'r': 0.05083459787556904}, 'rouge-l': {'f': 0.05168671741308869, 'p': 0.9135802469135802, 'r': 0.051532033