In [1]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import os
from tqdm import tqdm
import pandas as pd

In [2]:
# Run configuration shared by the cells below.
LANGUAGE: str = "english"      # language handed to the tokenizer and the stop-word list
SENTENCES_COUNT: int = 10      # number of sentences requested from the summarizer per text
TEXTS_COUNT: int = 100         # upper bound on how many directory entries are processed

In [3]:
# Input locations, relative to the notebook's working directory.
test_path_txt: str = '../SCOTUS_data/text'        # directory of source texts to summarize
test_path_sum: str = '../SCOTUS_data/summary'     # directory of reference summaries
target_path_csv: str = '../SCOTUS_data/paragraph_target_df_dev.csv'  # CSV of per-section targets

In [4]:
# Generate one Luhn summary per source text and keep the raw text alongside it.
summary_gen = []
texts = []

# The tokenizer and the summarizer are configured from LANGUAGE only, not from
# the file being processed, so they are built once instead of once per file.
tokenizer = Tokenizer(LANGUAGE)
summarizer = LuhnSummarizer()
summarizer.stop_words = get_stop_words(LANGUAGE)

# NOTE(review): os.listdir() returns entries in arbitrary order, and the
# generated summaries are later paired by position with the reference summaries
# and with the rows of the target CSV. Confirm that all three share the same
# ordering before trusting the scores.
for file_name in tqdm(os.listdir(test_path_txt)[:TEXTS_COUNT]):
    file_path = os.path.join(test_path_txt, file_name)

    # Read the raw text inside a context manager so the handle is closed
    # immediately rather than leaked once per document.
    with open(file_path, 'r') as source_file:
        text = source_file.read()
    texts.append(text)

    parser = PlaintextParser.from_file(file_path, tokenizer)

    # One selected sentence per line, each terminated by a newline.
    summary_gen.append(
        "".join(
            str(sentence) + "\n"
            for sentence in summarizer(parser.document, SENTENCES_COUNT)
        )
    )

# Per-section reference targets used by the per-section evaluation further down.
df_target = pd.read_csv(target_path_csv)

100%|██████████| 100/100 [00:28<00:00,  3.50it/s]


In [5]:
# Load the reference summaries: one string per file, limited to the first
# TEXTS_COUNT entries returned for the summary directory.
summary_ref = []

reference_files = os.listdir(test_path_sum)[:TEXTS_COUNT]
for file_name in tqdm(reference_files):
    reference_path = os.path.join(test_path_sum, file_name)
    with open(reference_path, 'r') as f:
        summary_ref.append(f.read())

100%|██████████| 100/100 [00:00<00:00, 1169.35it/s]


In [6]:
# Sanity check: the evaluation pairs summaries by position, so both lists must
# contain the same number of entries.
n_generated = len(summary_gen)
n_reference = len(summary_ref)
print(n_generated, n_reference)
if n_generated != n_reference:
    raise ValueError("Les listes summary_ref et summary_gen doivent avoir la même longueur.")

100 100


In [7]:
from rouge_score import rouge_scorer
from bert_score import BERTScorer

# Score every generated summary against its full reference summary.
ROUGE_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
BERT_scorer = BERTScorer(lang="en")
ROUGE_scores = []
BERT_scores = []
for reference, generated in zip(summary_ref, summary_gen):
    # RougeScorer.score expects (target, prediction).
    ROUGE_scores.append(ROUGE_scorer.score(reference, generated))
    # BERTScorer.score expects (candidates, references) and returns a
    # (precision, recall, F1) tuple. Passing them the other way round swaps the
    # stored precision and recall; F1 is symmetric in its two inputs, so the
    # averaged value below is the same up to floating-point noise.
    BERT_scores.append(BERT_scorer.score([generated], [reference]))

# Average F-measures over all documents; key order matches the result table.
avg_scores = {
    rouge_name: sum(s[rouge_name].fmeasure for s in ROUGE_scores) / len(ROUGE_scores)
    for rouge_name in ('rouge1', 'rouge2', 'rougeL')
}
avg_scores['bert_score'] = sum(f1.mean().item() for _, _, f1 in BERT_scores) / len(BERT_scores)

print("Scores ROUGE :", avg_scores)

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Scores ROUGE : {'rouge1': 0.39702714015091645, 'rouge2': 0.14459409160699357, 'rougeL': 0.19431518930399994, 'bert_score': 0.8288299936056137}


In [8]:
import pandas as pd

# Persist source text, reference summary and generated summary side by side.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./output", exist_ok=True)

df_results = pd.DataFrame({"Text": texts, "Reference": summary_ref, "Generated": summary_gen})
df_results.to_csv("./output/results_Luhn_dev.csv", index=False)

In [9]:
# Score the generated summaries against each section of the target file.
# scores[section][metric] collects one value per document.
metric_names = ('rouge1', 'rouge2', 'rougeL', 'bert_score')
scores = {
    section: {metric: [] for metric in metric_names}
    for section in ('facts_of_the_case', 'question', 'conclusion')
}

# Only evaluate columns that are known sections, so an extra column in the CSV
# (for instance a saved index) does not raise KeyError on scores[column_name].
target_columns = [col for col in df_target.columns if col in scores]

# Never index past the shortest input: fewer than TEXTS_COUNT summaries or
# target rows would otherwise raise IndexError.
# NOTE(review): assumes row i of df_target describes the same case as
# summary_gen[i] — confirm against how the CSV was built.
n_pairs = min(TEXTS_COUNT, len(summary_gen), len(df_target))

for column_name in target_columns:
    for ref, gen in zip(df_target[column_name].iloc[:n_pairs], summary_gen):
        # ROUGE: RougeScorer.score expects (target, prediction).
        rouge_score = ROUGE_scorer.score(ref, gen)
        for rouge_name in ('rouge1', 'rouge2', 'rougeL'):
            scores[column_name][rouge_name].append(rouge_score[rouge_name].fmeasure)

        # BERTScore: BERTScorer.score expects (candidates, references); only
        # the F1 component of the returned (P, R, F1) tuple is kept.
        _, _, bert_f1 = BERT_scorer.score([gen], [ref])
        scores[column_name]['bert_score'].append(bert_f1.mean().item())

# Mean of every metric for every evaluated section.
avg_scores_target = {
    col: {metric: sum(values) / len(values) for metric, values in scores[col].items()}
    for col in target_columns
}

for col, metrics in avg_scores_target.items():
    print(f"\nScores moyens pour {col} :")
    print(metrics)



Scores moyens pour facts_of_the_case :
{'rouge1': 0.17956847099890613, 'rouge2': 0.02665423027698347, 'rougeL': 0.10232946347352075, 'bert_score': 0.7919618076086045}

Scores moyens pour question :
{'rouge1': 0.04277783476418723, 'rouge2': 0.006286843429999269, 'rougeL': 0.03246765873340019, 'bert_score': 0.7889240062236786}

Scores moyens pour conclusion :
{'rouge1': 0.18839921114369687, 'rouge2': 0.029302424133872587, 'rougeL': 0.10442914409908592, 'bert_score': 0.7945915901660919}


In [10]:
# Assemble one table: the global averages as a single 'global' row, followed by
# one row per target section.
df_avg_scores = pd.DataFrame([avg_scores], index=['global'])

# The nested dict is keyed by section first, so transpose to get sections as rows.
df_avg_scores_target = pd.DataFrame(avg_scores_target).T

df_score = pd.concat([df_avg_scores, df_avg_scores_target])
print(df_score.head())

                     rouge1    rouge2    rougeL  bert_score
global             0.397027  0.144594  0.194315    0.828830
facts_of_the_case  0.179568  0.026654  0.102329    0.791962
question           0.042778  0.006287  0.032468    0.788924
conclusion         0.188399  0.029302  0.104429    0.794592


In [11]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing the score table (row labels included).
os.makedirs("./output", exist_ok=True)
df_score.to_csv("./output/scores_Luhn_dev.csv")