In [1]:
%%capture
!pip install sumy
!pip install rouge-score

In [7]:
import sumy, os, nltk
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from rouge_score import rouge_scorer
import pandas as pd
from sumy.summarizers.text_rank import TextRankSummarizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
!gdown https://drive.google.com/uc?id=1epJmR0cAV65GIgflnmJpXHXwkC5BYuU6

Downloading...
From: https://drive.google.com/uc?id=1epJmR0cAV65GIgflnmJpXHXwkC5BYuU6
To: /content/BBC Business News.zip
  0% 0.00/928k [00:00<?, ?B/s]100% 928k/928k [00:00<00:00, 14.7MB/s]


In [9]:
%%capture
!unzip /content/BBC\ Business\ News.zip

In [10]:
article_path = "/content/BBC Business News/News Articles/business"
summary_path = "/content/BBC Business News/Summaries/business"
articles_names = os.listdir(article_path)


class Summarizer:
  def __init__(self):
    self.summarylist = []
    self.r1scoreslist = []
    self.r2scoreslist = []
    self.sentence_counts = []

  def summarizer(self, summarizerInstance, articles_names, sentence_counts):
    self.sentence_counts = sentence_counts
    summarizer = summarizerInstance
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'], use_stemmer=False)

    for sentence_count in sentence_counts:
      r1scores = dict()
      r2scores = dict()
      summaries = dict()
      for article_name in articles_names:
        article = open(article_path+"/"+article_name, "r").read()
        orig_summary = open(summary_path+"/"+article_name, "r").read()
        parser = PlaintextParser.from_string(article, Tokenizer("english"))
        summary = summarizer(parser.document, sentence_count)
        summary = " ".join([" ".join(i.words) for i in summary ])
        summaries[article_name] = summary
        scores = scorer.score(summary,orig_summary)
        r1scores[article_name] = scores["rouge1"]
        r2scores[article_name] = scores["rouge2"]
      self.r1scoreslist.append(r1scores)
      self.r2scoreslist.append(r2scores)
      self.summarylist.append(summaries)
      
  def rouge1_score(self):
    df_list = dict()
    for index in range(len(self.sentence_counts)):
      df_list[self.sentence_counts[index]] = pd.DataFrame(self.r1scoreslist[index], index=["Precision", "Recall", "Fmeasure"]).transpose()
    return df_list

  def rouge2_score(self):
    df_list = dict()
    for index in range(len(self.sentence_counts)):
      df_list[self.sentence_counts[index]] = pd.DataFrame(self.r2scoreslist[index], index=["Precision", "Recall", "Fmeasure"]).transpose()
    return df_list


In [11]:
lexsum = Summarizer()
lexsum.summarizer(LexRankSummarizer(), articles_names, [10,15,20,25])

In [12]:
print("LexRank Summarizer\n", "-"*200)
scores1 = lexsum.rouge1_score()
scores2 = lexsum.rouge2_score()

df = pd.DataFrame([scores1.keys(), [score.mean(axis =0).Precision for score in scores1.values()]
                  , [score.mean(axis =0).Precision for score in scores2.values()]],
                  index = ["Sentence count","Rogue 1 ", "Rogue 2"]).transpose()
df

LexRank Summarizer
 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


Unnamed: 0,Sentence count,Rogue 1,Rogue 2
0,10.0,0.771008,0.642559
1,15.0,0.877579,0.77557
2,20.0,0.908889,0.819366
3,25.0,0.918418,0.833272


In [13]:
textsum = Summarizer()
textsum.summarizer(TextRankSummarizer(), articles_names, [10,15,20,25])

In [14]:
print("TextRank Summarizer\n", "-"*200)

scores1 = textsum.rouge1_score()
scores2 = textsum.rouge2_score()

df = pd.DataFrame([scores1.keys(), [score.mean(axis =0).Precision for score in scores1.values()]
                  , [score.mean(axis =0).Precision for score in scores2.values()]],
                  index = ["Sentence count","Rogue 1 ", "Rogue 2"]).transpose()
df

TextRank Summarizer
 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


Unnamed: 0,Sentence count,Rogue 1,Rogue 2
0,10.0,0.829362,0.717919
1,15.0,0.897595,0.804859
2,20.0,0.916411,0.830498
3,25.0,0.921196,0.837647
