In [1]:
import pandas as pd
import numpy as np
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import re, math, collections
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.reduction import ReductionSummarizer

In [2]:
# Helpers for the KL-divergence-style distance between word-frequency distributions
def tokenize(_str):
    """Count the content words of ``_str``.

    Words are maximal runs of ``\\w`` characters, lower-cased. Words shorter
    than two characters and a small fixed set of English stop words are
    ignored.

    Returns:
        A ``collections.defaultdict`` mapping each kept word to its count as a
        float; looking up an unseen word yields ``0.0``.
    """
    ignored = ('and', 'for', 'if', 'the', 'then', 'be', 'is', 'are', 'will', 'in', 'it', 'to', 'that')
    counts = collections.defaultdict(float)
    words = (hit.group(1).lower() for hit in re.finditer(r"(\w+)", _str, re.UNICODE))
    for word in words:
        # Keep only words of at least two characters that are not stop words.
        if len(word) >= 2 and word not in ignored:
            counts[word] += 1

    return counts
#end of tokenize
 
def kldiv(_s, _t):
    """Return a smoothed, symmetrised KL-style distance between two term-count maps.

    Counts are normalised to probabilities. Terms of ``_s`` that are absent
    from ``_t`` are given a small back-off probability ``epsilon``, and the
    observed probabilities of ``_t`` are scaled by ``gamma``. Every term of
    ``_s`` contributes ``(p - q) * log(p / q)``; note that this is the
    symmetrised form, not the plain ``p * log(p / q)``.

    Args:
        _s: mapping of term -> count for the first distribution.
        _t: mapping of term -> count for the second distribution.

    Returns:
        The distance as a float. ``1e33`` is returned as a sentinel when either
        mapping is empty; identical mappings give ``0.0``.

    Raises:
        ValueError: if the normalised probabilities of either mapping do not
            sum to 1 (for example when the count total overflows).
    """
    # An empty distribution cannot be compared: return a huge sentinel instead.
    if len(_s) == 0 or len(_t) == 0:
        return 1e33

    ssum = 0. + sum(_s.values())
    tsum = 0. + sum(_t.values())

    # Number of terms of _s that never occur in _t.
    lenvocabdiff = len(set(_s.keys()).difference(_t.keys()))

    # epsilon: back-off probability, a thousandth of the smallest observed one.
    epsilon = min(min(_s.values()) / ssum, min(_t.values()) / tsum) * 0.001

    # gamma: scaling applied to the observed probabilities of _t.
    gamma = 1 - lenvocabdiff * epsilon

    # Sanity check: both normalised distributions must sum to 1.
    sc = sum(v / ssum for v in _s.values())
    st = sum(v / tsum for v in _t.values())
    if abs(sc - 1.0) > 9e-6:
        raise ValueError(
            "probabilities of _s do not sum to 1 (Sum P: %e, Sum Q: %e)" % (sc, st))
    if abs(st - 1.0) > 9e-6:
        raise ValueError(
            "probabilities of _t do not sum to 1 (Sum P: %e, Sum Q: %e)" % (sc, st))

    div = 0.
    for t, v in _s.items():
        pts = v / ssum

        # Use the back-off probability unless the term was observed in _t.
        ptt = epsilon
        if t in _t:
            ptt = gamma * (_t[t] / tsum)

        div += (pts - ptt) * math.log(pts / ptt)

    return div

In [3]:
# One instance of each sumy summarizer to compare, all built with default
# constructor arguments.
# NOTE(review): Stemmer and get_stop_words are imported but not used here --
# confirm that running without stemming / stop words is intended.
summarizer_kl   = KLSummarizer()
summarizer_lex  = LexRankSummarizer()
summarizer_luhn = LuhnSummarizer()
summarizer_lsa  = LsaSummarizer()
summarizer_tr   = TextRankSummarizer()
summarizer_red  = ReductionSummarizer()

In [4]:
# Load the article table; the relative path is resolved against the current
# working directory. The "Article" column is read further down.
data = pd.read_csv("data_requisite_fp_apm_ss_combined_with_tags.csv")

In [64]:
# Use the "Article" value of the row labelled 12 as the document to summarise,
# and display it.
text = data.loc[12,"Article"]
text

'LONDON, Feb 9 (APM) - The U.S. Food and Drug Administration (FDA) has granted orphan drug designation to Epizyme’s tazemetostat for malignant rhabdoid tumours, a rare form of cancer. <EOP> The decision, which was announced in a statement on Monday, gives Epizyme various development incentives, including tax credits and marketing exclusivity for a period of seven years. For a product to qualify as an orphan drug it must be for a disease that affects 200,000 people in the U.S. <EOP> Malignant rhabdoid tumours are rare and aggressive cancers that are defined by the loss of the INI1 protein, or, in the case of tumours that affect the ovary, the loss of the SMARCA4 protein. The orphan designation applies in both circumstances. <EOP> According to Epizyme, in an ongoing Phase I study, tazemetostat has demonstrated “encouraging clinical activity and an acceptable safety profile” in patients with these types of cancer. <EOP> The drug is also being investigated in other cancers, including synov

In [65]:
# Parse the raw article string into a sumy document with the English tokenizer.
# NOTE(review): the "<EOP>" markers in the text are not removed, so they show up
# inside the extracted sentences -- confirm whether they should be stripped first.
parser = PlaintextParser.from_string(text,Tokenizer("english"))

In [66]:
# KL summarizer: ask for 4 sentences from the parsed document and display them
summary1 = summarizer_kl(parser.document,4)
summary1

(<Sentence: The orphan designation applies in both circumstances.>,
 <Sentence: <EOP> According to Epizyme, in an ongoing Phase I study, tazemetostat has demonstrated “encouraging clinical activity and an acceptable safety profile” in patients with these types of cancer.>,
 <Sentence: <EOP> The drug is also being investigated in other cancers, including synovial sarcoma and non-Hodgkin lymphoma.>,
 <Sentence: <EOP> tm/nh <EOP> thomas.meek@apmnews.com <EOP>>)

In [67]:
# LexRank summarizer: ask for 4 sentences from the parsed document and display them
summary2 = summarizer_lex(parser.document, 4)
summary2

(<Sentence: LONDON, Feb 9 (APM) - The U.S. Food and Drug Administration (FDA) has granted orphan drug designation to Epizyme’s tazemetostat for malignant rhabdoid tumours, a rare form of cancer.>,
 <Sentence: <EOP> The decision, which was announced in a statement on Monday, gives Epizyme various development incentives, including tax credits and marketing exclusivity for a period of seven years.>,
 <Sentence: The orphan designation applies in both circumstances.>,
 <Sentence: <EOP> The drug is also being investigated in other cancers, including synovial sarcoma and non-Hodgkin lymphoma.>)

In [68]:
# Luhn summarizer: ask for 4 sentences from the parsed document and display them
summary3 = summarizer_luhn(parser.document,4)
summary3

(<Sentence: LONDON, Feb 9 (APM) - The U.S. Food and Drug Administration (FDA) has granted orphan drug designation to Epizyme’s tazemetostat for malignant rhabdoid tumours, a rare form of cancer.>,
 <Sentence: For a product to qualify as an orphan drug it must be for a disease that affects 200,000 people in the U.S. <EOP> Malignant rhabdoid tumours are rare and aggressive cancers that are defined by the loss of the INI1 protein, or, in the case of tumours that affect the ovary, the loss of the SMARCA4 protein.>,
 <Sentence: The orphan designation applies in both circumstances.>,
 <Sentence: <EOP> According to Epizyme, in an ongoing Phase I study, tazemetostat has demonstrated “encouraging clinical activity and an acceptable safety profile” in patients with these types of cancer.>)

In [69]:
# LSA summarizer: ask for 4 sentences from the parsed document and display them
summary4 = summarizer_lsa(parser.document,4)
summary4

(<Sentence: LONDON, Feb 9 (APM) - The U.S. Food and Drug Administration (FDA) has granted orphan drug designation to Epizyme’s tazemetostat for malignant rhabdoid tumours, a rare form of cancer.>,
 <Sentence: <EOP> The decision, which was announced in a statement on Monday, gives Epizyme various development incentives, including tax credits and marketing exclusivity for a period of seven years.>,
 <Sentence: <EOP> According to Epizyme, in an ongoing Phase I study, tazemetostat has demonstrated “encouraging clinical activity and an acceptable safety profile” in patients with these types of cancer.>,
 <Sentence: <EOP> The drug is also being investigated in other cancers, including synovial sarcoma and non-Hodgkin lymphoma.>)

In [70]:
# TextRank summarizer: ask for 4 sentences from the parsed document and display them
summary5 =summarizer_tr(parser.document,4)
summary5

(<Sentence: LONDON, Feb 9 (APM) - The U.S. Food and Drug Administration (FDA) has granted orphan drug designation to Epizyme’s tazemetostat for malignant rhabdoid tumours, a rare form of cancer.>,
 <Sentence: <EOP> The decision, which was announced in a statement on Monday, gives Epizyme various development incentives, including tax credits and marketing exclusivity for a period of seven years.>,
 <Sentence: For a product to qualify as an orphan drug it must be for a disease that affects 200,000 people in the U.S. <EOP> Malignant rhabdoid tumours are rare and aggressive cancers that are defined by the loss of the INI1 protein, or, in the case of tumours that affect the ovary, the loss of the SMARCA4 protein.>,
 <Sentence: <EOP> According to Epizyme, in an ongoing Phase I study, tazemetostat has demonstrated “encouraging clinical activity and an acceptable safety profile” in patients with these types of cancer.>)

In [71]:
# Reduction summarizer: ask for 4 sentences from the parsed document and display them
summary6 = summarizer_red(parser.document,4)
summary6

(<Sentence: LONDON, Feb 9 (APM) - The U.S. Food and Drug Administration (FDA) has granted orphan drug designation to Epizyme’s tazemetostat for malignant rhabdoid tumours, a rare form of cancer.>,
 <Sentence: <EOP> The decision, which was announced in a statement on Monday, gives Epizyme various development incentives, including tax credits and marketing exclusivity for a period of seven years.>,
 <Sentence: For a product to qualify as an orphan drug it must be for a disease that affects 200,000 people in the U.S. <EOP> Malignant rhabdoid tumours are rare and aggressive cancers that are defined by the loss of the INI1 protein, or, in the case of tumours that affect the ovary, the loss of the SMARCA4 protein.>,
 <Sentence: <EOP> The drug is also being investigated in other cancers, including synovial sarcoma and non-Hodgkin lymphoma.>)

In [72]:
def append_text_summary(summa, sep=" "):
    """Join the sentences of a summary into one string.

    Args:
        summa: iterable of sentence objects; each one is converted with ``str``.
        sep: text placed between consecutive sentences. It defaults to a single
            space so that the last word of one sentence is never glued to the
            first word of the next.

    Returns:
        The joined text, or an empty string when ``summa`` is empty.
    """
    return sep.join(str(su) for su in summa)

In [73]:
# Flatten each summarizer's sentences into one string, in summarizer order.
sum1, sum2, sum3, sum4, sum5, sum6 = (
    append_text_summary(sentences)
    for sentences in (summary1, summary2, summary3, summary4, summary5, summary6)
)

In [74]:
# Accumulators for the two divergence directions:
#   text_summa_li -> kldiv(article, summary)
#   summa_text_li -> kldiv(summary, article)
text_summa_li, summa_text_li = [], []

In [75]:
# Summary strings in summarizer order: KL, LexRank, Luhn, LSA, TextRank, Reduction.
list_sum = [sum1,sum2,sum3,sum4,sum5,sum6]

In [76]:
# Tokenize the article once (it does not change between summaries) and each
# summary once, then rebuild both result lists from scratch so that re-running
# this cell cannot append duplicate entries.
text_tokens = tokenize(text)
summary_tokens = [tokenize(summary) for summary in list_sum]
text_summa_li = [kldiv(text_tokens, tokens) for tokens in summary_tokens]  # article vs. summary
summa_text_li = [kldiv(tokens, text_tokens) for tokens in summary_tokens]  # summary vs. article

In [77]:
# Display names for the table; the order must match list_sum because the two
# lists are placed side by side as DataFrame columns.
method_list = ['KL_Divergence Summarizer','Lex Summarizer','Luhn Summarizer','LSA_Summarizer','TextRank Summarizer','Reduction Summarizer']

In [80]:
# One row per summarizer: its name, both divergence directions and the summary text.
metric_df = pd.DataFrame({
    'Method': method_list,
    'Text_Sum_Div': text_summa_li,
    'Sum_Text_Div': summa_text_li,
    'Summary': list_sum,
})

In [85]:
# Show the comparison table (kldiv is 0.0 for identical word distributions).
metric_df

Unnamed: 0,Method,Text_Sum_Div,Sum_Text_Div,Summary
0,KL_Divergence Summarizer,3.602572,0.449742,The orphan designation applies in both circums...
1,Lex Summarizer,2.827845,0.300537,"LONDON, Feb 9 (APM) - The U.S. Food and Drug A..."
2,Luhn Summarizer,2.050359,0.152696,"LONDON, Feb 9 (APM) - The U.S. Food and Drug A..."
3,LSA_Summarizer,1.861403,0.168475,"LONDON, Feb 9 (APM) - The U.S. Food and Drug A..."
4,TextRank Summarizer,1.038694,0.068307,"LONDON, Feb 9 (APM) - The U.S. Food and Drug A..."
5,Reduction Summarizer,1.402443,0.108615,"LONDON, Feb 9 (APM) - The U.S. Food and Drug A..."


In [87]:
# Full summary text of row 4
metric_df.loc[4, 'Summary']

'LONDON, Feb 9 (APM) - The U.S. Food and Drug Administration (FDA) has granted orphan drug designation to Epizyme’s tazemetostat for malignant rhabdoid tumours, a rare form of cancer.<EOP> The decision, which was announced in a statement on Monday, gives Epizyme various development incentives, including tax credits and marketing exclusivity for a period of seven years.For a product to qualify as an orphan drug it must be for a disease that affects 200,000 people in the U.S. <EOP> Malignant rhabdoid tumours are rare and aggressive cancers that are defined by the loss of the INI1 protein, or, in the case of tumours that affect the ovary, the loss of the SMARCA4 protein.<EOP> According to Epizyme, in an ongoing Phase I study, tazemetostat has demonstrated “encouraging clinical activity and an acceptable safety profile” in patients with these types of cancer.'

In [88]:
# Full summary text of row 5
metric_df.loc[5, 'Summary']

'LONDON, Feb 9 (APM) - The U.S. Food and Drug Administration (FDA) has granted orphan drug designation to Epizyme’s tazemetostat for malignant rhabdoid tumours, a rare form of cancer.<EOP> The decision, which was announced in a statement on Monday, gives Epizyme various development incentives, including tax credits and marketing exclusivity for a period of seven years.For a product to qualify as an orphan drug it must be for a disease that affects 200,000 people in the U.S. <EOP> Malignant rhabdoid tumours are rare and aggressive cancers that are defined by the loss of the INI1 protein, or, in the case of tumours that affect the ovary, the loss of the SMARCA4 protein.<EOP> The drug is also being investigated in other cancers, including synovial sarcoma and non-Hodgkin lymphoma.'

In [86]:
# Full summary text of row 0
metric_df.loc[0, 'Summary']

'The orphan designation applies in both circumstances.<EOP> According to Epizyme, in an ongoing Phase I study, tazemetostat has demonstrated “encouraging clinical activity and an acceptable safety profile” in patients with these types of cancer.<EOP> The drug is also being investigated in other cancers, including synovial sarcoma and non-Hodgkin lymphoma.<EOP> tm/nh <EOP> thomas.meek@apmnews.com <EOP>'