In [47]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from tqdm import tqdm
from rouge_score import rouge_scorer
import os
from datetime import datetime
def now():
    """Return the current local time formatted as ``YYYY/MM/DD-HH:MM:SS``."""
    return datetime.now().strftime("%Y/%m/%d-%H:%M:%S")

In [48]:
_ROUGE_TYPES = ('rouge1', 'rouge2', 'rougeL')
_ROUGE_SCORER = None  # created on first use, then shared by every call


def _get_rouge_scorer():
    """Return the shared ``RougeScorer``, building it the first time it is needed."""
    global _ROUGE_SCORER
    if _ROUGE_SCORER is None:
        _ROUGE_SCORER = rouge_scorer.RougeScorer(list(_ROUGE_TYPES), use_stemmer=True)
    return _ROUGE_SCORER


def calculate_rouge_scores(reference_text, generated_text):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures for a pair of texts.

    Args:
        reference_text: Text passed to the scorer as the target.
        generated_text: Text passed to the scorer as the prediction.

    Returns:
        A 3-tuple ``(rouge_1, rouge_2, rouge_L)`` holding the ``fmeasure``
        of each score, in that order.
    """
    # Reuse one scorer instead of constructing a new one per call; this
    # function is called once or twice per article in the scoring loop.
    scores = _get_rouge_scorer().score(reference_text, generated_text)
    return tuple(scores[rouge_type].fmeasure for rouge_type in _ROUGE_TYPES)

In [49]:
# Quick sanity check: an identical pair, then a pair differing in the last word.
sample_pairs = [
    ("Hello, I'm good!", "Hello, I'm good!"),
    ("Wow, there is a cat.", "Wow, there is a dig."),
]
for sample_reference, sample_generated in sample_pairs:
    print(calculate_rouge_scores(sample_reference, sample_generated))

(1.0, 1.0, 1.0)
(0.8000000000000002, 0.75, 0.8000000000000002)


Here we read two files into `pandas` DataFrames:
1. `df_CNN` is the original CNN/DailyMail dataset.
2. `df_HLT` is the highlighted version of the dataset; the name abbreviates **H**igh**L**igh**T**.

In [50]:
# Load the original CNN/DailyMail test CSV. The scoring cell further down
# reads its 'highlights' and 'article' columns as the reference texts.
df_CNN = pd.read_csv("./dat/cnn_dailymail_test.csv")
# Bare last expression so the notebook renders the frame.
df_CNN

Unnamed: 0,article,highlights,id
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,f001ec5c4704938247d27a44948eebb37ae98d01
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...",230c522854991d053fe98a718b1defa077a8efef
2,"(CNN)If you've been following the news lately,...",Mohammad Javad Zarif has spent more time with ...,4495ba8f3a340d97a9df1476f8a35502bcce1f69
3,(CNN)Five Americans who were monitored for thr...,17 Americans were exposed to the Ebola virus w...,a38e72fed88684ec8d60dd5856282e999dc8c0ca
4,(CNN)A Duke student has admitted to hanging a ...,Student is no longer on Duke University campus...,c27cf1b136cc270023de959e7ab24638021bc43f
...,...,...,...
11485,Telecom watchdogs are to stop a rip-off that a...,Operators are charging up to 20p a minute - ev...,0ac776a4dc09ca97c136f4314fed4defb48a361a
11486,The chilling reenactment of how executions are...,Bali Nine ringleaders will face the firing squ...,fe89a6a2e28d173e5ad4c6d814c15b95aa969e3f
11487,It is a week which has seen him in deep water ...,Hardy was convicted of domestic abuse against ...,ded2f535cd6ab95d11b5f4ea29bbf2b2d3c55c50
11488,"Despite the hype surrounding its first watch, ...",Apple sold more than 61 million iPhones in the...,30ec5f280eee772a73d181bfc8514defd8026434


In [52]:
# Print every CSV in ./dat whose name starts with the dataset prefix.
for file_path in os.listdir('dat'):
    is_dataset_csv = file_path.startswith('cnn_dailymail_test') and file_path.endswith('.csv')
    if is_dataset_csv:
        print(file_path)

cnn_dailymail_test-(原始)compress-bart.csv
cnn_dailymail_test-abstracted-bart.csv
cnn_dailymail_test-abstracted.csv
cnn_dailymail_test-compressed-bart.csv
cnn_dailymail_test-compressed.csv
cnn_dailymail_test-highlighted-bart.csv
cnn_dailymail_test-highlighted.csv
cnn_dailymail_test-ollama_cnn_highlight_token_ratio.csv
cnn_dailymail_test-ollama_cnn_highlight_words.csv
cnn_dailymail_test-ollama_cnn_highlight_words_ratio.csv
cnn_dailymail_test-ollama_highlight_token_ratio-bart.csv
cnn_dailymail_test-ollama_highlight_token_ratio.csv
cnn_dailymail_test-ollama_highlight_words-bart.csv
cnn_dailymail_test-ollama_highlight_words.csv
cnn_dailymail_test-ollama_highlight_words_ratio-bart.csv
cnn_dailymail_test-ollama_highlight_words_ratio.csv
cnn_dailymail_test-original-bart.csv
cnn_dailymail_test-original.csv
cnn_dailymail_test-summarized-bart.csv
cnn_dailymail_test-summarized.csv
cnn_dailymail_test-token_count.csv
cnn_dailymail_test.csv


In [56]:
# Find the processed variants of the dataset that do not have a ROUGE score
# file yet, then load each of them into `dfs` keyed by process name.
dataset = "cnn_dailymail_test"
# Processed files are named "<dataset>-<process_name>.csv". Matching the full
# prefix (including the dash) skips the bare "<dataset>.csv" and any file that
# merely starts with the dataset name, which previously raised IndexError
# when the name contained no "-".
dataset_prefix = f"{dataset}-"
csv_suffix = ".csv"
process_names = []
# os.listdir() returns entries in arbitrary order; sort so the log and the
# processing order are the same on every run.
for file_path in sorted(os.listdir('dat')):
    if not (file_path.startswith(dataset_prefix) and file_path.endswith(csv_suffix)):
        continue
    task_name = file_path[len(dataset_prefix):-len(csv_suffix)]
    if not task_name:
        continue
    # Files whose name contains "token_count" or "原始" ("original") are
    # excluded from scoring.
    if 'token_count' in task_name or '原始' in task_name:
        continue
    rouge_scores_path = f'./dat/rouge_scores-{task_name}.csv'
    isProcess = os.path.isfile(rouge_scores_path)
    # Log "<already scored?>\t<score file path>" for every candidate.
    print(isProcess, end='\t')
    print(rouge_scores_path)
    if isProcess:
        continue
    process_names.append(task_name)

print('-'*100)
dfs = {}
for process_name in process_names:
    df = pd.read_csv(f"./dat/{dataset}-{process_name}.csv")
    dfs[process_name] = df
    #---
    print(f"{process_name:>35}:\t{', '.join(df.columns.tolist())} - {df.shape}")

True	./dat/rouge_scores-abstracted-bart.csv
True	./dat/rouge_scores-abstracted.csv
True	./dat/rouge_scores-compressed-bart.csv
True	./dat/rouge_scores-compressed.csv
True	./dat/rouge_scores-highlighted-bart.csv
True	./dat/rouge_scores-highlighted.csv
True	./dat/rouge_scores-ollama_cnn_highlight_token_ratio.csv
True	./dat/rouge_scores-ollama_cnn_highlight_words.csv
True	./dat/rouge_scores-ollama_cnn_highlight_words_ratio.csv
True	./dat/rouge_scores-ollama_highlight_token_ratio-bart.csv
True	./dat/rouge_scores-ollama_highlight_token_ratio.csv
True	./dat/rouge_scores-ollama_highlight_words-bart.csv
True	./dat/rouge_scores-ollama_highlight_words.csv
False	./dat/rouge_scores-ollama_highlight_words_ratio-bart.csv
True	./dat/rouge_scores-ollama_highlight_words_ratio.csv
True	./dat/rouge_scores-original-bart.csv
True	./dat/rouge_scores-original.csv
True	./dat/rouge_scores-summarized-bart.csv
True	./dat/rouge_scores-summarized.csv
----------------------------------------------------------------

In [57]:
# dataset = "cnn_dailymail_test" # 這是檔案名稱前面的 dataset
# process_names = [
#     # "original-bart",
#     # "original",
    
#     # "summarized",        # 完成
#     # "summarized-bart",   # 完成
    
#     # "compressed",        # 完成
#     # "compressed-bart",     # 完成
    
#     # "abstracted",        # 完成
#     # "abstracted-bart",   # 完成
    
#     # "highlighted",       # 完成 
#     # "highlighted-bart",  # 完成

#     # "ollama_words",
    
#     # "ollama_highlight_token_ratio",
#     # "ollama_highlight_token_ratio-bart",
    
#     # "ollama_highlight_words",
#     # "ollama_highlight_words-bart",

#     "ollama_cnn_highlight_token_ratio",
    
#     # "(原始)compress-bart" # 完成
# ]

# dfs = {}
# for process_name in process_names:
#     df = pd.read_csv(f"./dat/{dataset}-{process_name}.csv")
#     dfs[process_name] = df
#     #---
#     print(f"{process_name:>18}:\t{', '.join(df.columns.tolist())} - {df.shape}")

<b style='font-size:20pt;color:red'>底下這段是為了去除重複，有需要再跑</b>

In [58]:
# df0 = pd.read_csv(f'./dat/{dataset}.csv')
# ids0 = df0['id']
# for process_name in dfs:
#     df = dfs[process_name]
#     df_id = df['id']
#     _ids = []
#     _art = []
#     for id0 in tqdm(ids0):
#         unique_df = df[df_id==id0]
#         if not len(unique_df.index): continue
#         _article = unique_df['article'][unique_df.index[0]]
        
#         if _article.startswith('"') and _article.endswith('"'):
#             _article = _article[1:-1]
#         _ids.append(id0)
#         _art.append(_article)
            
#     df = pd.DataFrame({
#         "id": _ids,
#         "article":_art
#     })
#     display(df)
#     df.to_csv(f"./dat/{dataset}-{process_name}.csv", index=None)

In [59]:
# cnn_ids = df_CNN['id'].tolist()
# for process_name in dfs:
#     df = dfs[process_name]
#     for i in range(df.shape[0]):
#         if not df['id'][i] in cnn_ids:
#             df = df.iloc[i+1:]
#     dfs[process_name] = df
#     print(f"{process_name:>18}:\t{', '.join(df.columns.tolist())} - {df.shape}")

In [60]:
# Score every pending processed variant against the original dataset and
# write one CSV of ROUGE F-measures per variant.
print(now())
# Positional lists are still defined because they are notebook-level names.
cnn_highlights = df_CNN['highlights'].tolist()
cnn_articles = df_CNN['article'].tolist()
# Look references up by article id rather than by row position, so a processed
# file that is reordered or missing rows is still compared with its own source.
cnn_highlights_by_id = dict(zip(df_CNN['id'], df_CNN['highlights']))
cnn_articles_by_id = dict(zip(df_CNN['id'], df_CNN['article']))

for process_name, df in dfs.items():
    total_len   = df.shape[0]
    articles    = df['article'].tolist()
    article_ids = df['id'].tolist()

    #isBartProcess = 'bart' in process_name
    isBartProcess = False

    rouge_scores = []
    missing_ids = []
    for index in tqdm(range(total_len), desc=f"ROUGE-{process_name:<18}\t", total=total_len, bar_format='{l_bar}{bar:50}{r_bar}{bar:-10b}'):
        article_id = article_ids[index]
        if article_id not in cnn_highlights_by_id:
            # No reference exists for this id; record it instead of scoring
            # the row against an unrelated article.
            missing_ids.append(article_id)
            continue

        # processed article v.s. cnn highlight
        # NOTE(review): the processed article is passed as `reference_text`;
        # only F-measures are returned, so the order presumably does not
        # affect the result - confirm.
        R1_hl, R2_hl, RL_hl = calculate_rouge_scores(articles[index], cnn_highlights_by_id[article_id])
        rouge_score = {
            'id': article_id,  # was hard-coded to 1, making rows untraceable
            'rouge_1_hl': R1_hl,
            'rouge_2_hl': R2_hl,
            'rouge_L_hl': RL_hl,
        }

        if not isBartProcess:
            # processed article v.s. original cnn article
            R1_at, R2_at, RL_at = calculate_rouge_scores(articles[index], cnn_articles_by_id[article_id])
            rouge_score['rouge_1_article'] = R1_at
            rouge_score['rouge_2_article'] = R2_at
            rouge_score['rouge_L_article'] = RL_at
        rouge_scores.append(rouge_score)

    if missing_ids:
        print(f"{process_name}: skipped {len(missing_ids)} row(s) whose id is not in df_CNN")

    rouge_scores_df = pd.DataFrame(rouge_scores)
    #---
    # Never overwrite an existing score file: append -copy1, -copy2, ... until
    # an unused name is found.
    output_filepath = f'./dat/rouge_scores-{process_name}.csv'
    copy_number = 0
    while os.path.isfile(output_filepath):
        copy_number += 1
        output_filepath = f'./dat/rouge_scores-{process_name}-copy{copy_number}.csv'
    #---
    rouge_scores_df.to_csv(output_filepath, index=False)
    print(f"{now()} complete, save to : {output_filepath}")

2024/11/25-18:46:55


ROUGE-ollama_highlight_words_ratio-bart	: 100%|██████████████████████████████████████████████████| 11490/11490 [03:38<0


2024/11/25-18:50:34 complete, save to : ./dat/rouge_scores-ollama_highlight_words_ratio-bart.csv
