In [1]:
import re
import json
import pandas as pd
import spacy
import ollama
import os

from tqdm import tqdm
from datetime import datetime
def now():
    """Return the current local time formatted as ``YYYY/MM/DD-HH:MM:SS``."""
    # A named function (rather than a lambda bound to a name) carries a real
    # __name__ in tracebacks and can hold this docstring.
    return datetime.now().strftime("%Y/%m/%d-%H:%M:%S")

In [2]:
# Reference dataset; the cells below read its 'article' and 'highlights' columns.
cnn_df = pd.read_csv(filepath_or_buffer="dat/cnn_dailymail_test.csv")
# "highlighted" variant of the same dataset; its text is read from the 'article' column.
llama_highlighted_df = pd.read_csv(filepath_or_buffer='./dat/cnn_dailymail_test-highlighted.csv')

In [3]:
from transformers import BartTokenizer
# Pretrained tokenizer for 'facebook/bart-large-cnn'; count_token() uses it to measure text length in tokens.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

def count_token(text):
    """Return the number of tokens in ``text``, special tokens included.

    Args:
        text: The string to encode with the module-level ``tokenizer``.

    Returns:
        int: Length of the encoded token-id sequence.
    """
    # encode() already returns a plain list of ids, so its length is the token
    # count; building a PyTorch tensor (return_tensors="pt") just to measure
    # it was unnecessary work.
    return len(tokenizer.encode(text, add_special_tokens=True))

def count_words(text):
    """Return how many whitespace-separated words ``text`` contains."""
    words = text.split()
    return len(words)

In [4]:
# Worked example on the first row: the source article, its reference
# highlight, and the text stored in the "highlighted" dataset.
_cnn_article, _cnn_highlight = cnn_df['article'][0], cnn_df['highlights'][0]
_ollama_highlight = llama_highlighted_df['article'][0]
_sample_texts = (_cnn_article, _cnn_highlight, _ollama_highlight)

# Length of each of the three texts, first in tokens and then in words.
_cnn_article_token, _cnn_highlight_token, _ollama_highlight_token = map(count_token, _sample_texts)
_cnn_article_words, _cnn_highlight_words, _ollama_highlight_words = map(count_words, _sample_texts)

# ==================================================
# Ratio = highlight length / source-article length, per unit.
_cnn_ratio_token, _ollama_ratio_token = (
    _cnn_highlight_token / _cnn_article_token,
    _ollama_highlight_token / _cnn_article_token,
)
_cnn_ratio_words, _ollama_ratio_words = (
    _cnn_highlight_words / _cnn_article_words,
    _ollama_highlight_words / _cnn_article_words,
)

print(f"token ratio =  cnn   highlight({_cnn_highlight_token} token) / article({_cnn_article_token} token) = {_cnn_ratio_token}")
print(f"token ratio = ollama highlight({_ollama_highlight_token} token) / article({_cnn_article_token} token) = {_ollama_ratio_token}\n")

print(f"words ratio =  cnn   highlight({_cnn_highlight_words} words) / article({_cnn_article_words} words) = {_cnn_ratio_words}")
print(f"words ratio = ollama highlight({_ollama_highlight_words} words) / article({_cnn_article_words} words) = {_ollama_ratio_words}\n")

token ratio =  cnn   highlight(41 token) / article(694 token) = 0.059077809798270896
token ratio = ollama highlight(475 token) / article(694 token) = 0.6844380403458213

words ratio =  cnn   highlight(36 words) / article(567 words) = 0.06349206349206349
words ratio = ollama highlight(408 words) / article(567 words) = 0.7195767195767195



---

<center>
    <b style='font-size:100pt'>
        計算過程
    </b>
</center>

---

In [9]:
# Collect the variant names encoded in the dataset file names, i.e. the
# "<name>" part of "cnn_dailymail_test-<name>.csv" found in the 'dat' folder.
process_names = []
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# process_names (and of everything iterating over it) reproducible.
for file_path in sorted(os.listdir('dat')):
    if not file_path.endswith('.csv') or not file_path.startswith('cnn_dailymail_test'): continue
    if('token_count' in file_path): continue
    print(file_path)
    process_name = file_path.split('cnn_dailymail_test')[1].split('.csv')[0]
    # Only a "-<name>" suffix denotes a variant. The bare dataset file has an
    # empty suffix, and any other separator would yield a name whose
    # "cnn_dailymail_test-<name>.csv" file does not exist.
    if process_name.startswith('-'):
        process_names.append(process_name[1:])

cnn_dailymail_test-(原始)compress-bart.csv
cnn_dailymail_test-abstracted-bart.csv
cnn_dailymail_test-abstracted.csv
cnn_dailymail_test-compressed-bart.csv
cnn_dailymail_test-compressed.csv
cnn_dailymail_test-highlighted-bart.csv
cnn_dailymail_test-highlighted.csv
cnn_dailymail_test-ollama_cnn_highlight_token_ratio.csv
cnn_dailymail_test-ollama_cnn_highlight_words.csv
cnn_dailymail_test-ollama_cnn_highlight_words_ratio.csv
cnn_dailymail_test-ollama_highlight_token_ratio-bart.csv
cnn_dailymail_test-ollama_highlight_token_ratio.csv
cnn_dailymail_test-ollama_highlight_words-bart.csv
cnn_dailymail_test-ollama_highlight_words.csv
cnn_dailymail_test-ollama_highlight_words_ratio-bart.csv
cnn_dailymail_test-ollama_highlight_words_ratio.csv
cnn_dailymail_test-original-bart.csv
cnn_dailymail_test-original.csv
cnn_dailymail_test-summarized-bart.csv
cnn_dailymail_test-summarized.csv
cnn_dailymail_test.csv


In [10]:
dataset = "cnn_dailymail_test"  # dataset prefix at the start of every file name
dfs = {}
for process_name in process_names:
    # 'original' is loaded from the un-suffixed dataset file; every other
    # variant is loaded from its own "<dataset>-<process_name>.csv".
    csv_path = (
        f"./dat/{dataset}.csv"
        if process_name == 'original'
        else f"./dat/{dataset}-{process_name}.csv"
    )
    df = dfs[process_name] = pd.read_csv(csv_path)
    #---
    # One summary line per variant: name, column names, shape.
    print(f"{process_name:>35}:\t{', '.join(df.columns.tolist())} - {df.shape}")

                  (原始)compress-bart:	id, article - (11490, 2)
                    abstracted-bart:	id, article - (11490, 2)
                         abstracted:	id, article - (11490, 2)
                    compressed-bart:	id, article - (11490, 2)
                         compressed:	id, article - (11490, 2)
                   highlighted-bart:	id, article - (11490, 2)
                        highlighted:	id, article - (11490, 2)
   ollama_cnn_highlight_token_ratio:	id, article - (11490, 2)
         ollama_cnn_highlight_words:	id, article - (11490, 2)
   ollama_cnn_highlight_words_ratio:	id, article - (11490, 2)
  ollama_highlight_token_ratio-bart:	id, article - (11490, 2)
       ollama_highlight_token_ratio:	id, article - (11490, 2)
        ollama_highlight_words-bart:	id, article - (11490, 2)
             ollama_highlight_words:	id, article - (11490, 2)
  ollama_highlight_words_ratio-bart:	id, article - (11490, 2)
       ollama_highlight_words_ratio:	id, article - (11490, 2)
        

In [11]:
# For every loaded variant, compute the per-row ratio
# (variant text length / source-article length) in tokens and in words, and
# save it to ./dat/ratio-<process_name>.csv.
all_dat_file = os.listdir('dat')
# Source-article lengths do not depend on the variant, so each row is measured
# once and reused across variants: row index -> (token count, word count).
article_counts = {}
for process_name in dfs:
    output_filepath = f'./dat/ratio-{process_name}.csv'
    # Skip variants whose ratio file already exists. The file name is compared
    # exactly: a substring test against the output path would also match
    # unrelated entries of 'dat' (e.g. one simply named "ratio").
    if f'ratio-{process_name}.csv' in all_dat_file:
        continue
    df = dfs[process_name]
    total_len = df['article'].shape[0]
    token_ratios = []
    words_ratios = []
    # For 'original' the text being measured is the reference highlight.
    article_key = 'article'
    if process_name=='original':
        article_key = 'highlights'
    print(now())
    for index in tqdm(range(total_len), desc=f"compress ratio-{process_name:<35}\t", total=total_len):
        if index not in article_counts:
            source_article = cnn_df['article'][index]
            article_counts[index] = (count_token(source_article), count_words(source_article))
        article_tokens, article_words = article_counts[index]
        variant_text = df[article_key][index]
        token_ratios.append(count_token(variant_text) / article_tokens)
        words_ratios.append(count_words(variant_text) / article_words)
    ratio_df = pd.DataFrame({
        'token_ratio': token_ratios,
        'words_ratio': words_ratios
    })
    #---
    # Never overwrite an existing file: fall back to a "-copyN" name if the
    # target appeared since the directory was listed.
    for retry in range(100):
        if not os.path.isfile(output_filepath): break
        output_filepath = f'./dat/ratio-{process_name}-copy{retry+1}.csv'
    ratio_df.to_csv(output_filepath, index=False)
    #---
    # Double-quoted so the single-quoted subscripts inside the replacement
    # fields also parse on Python versions before 3.12.
    print(f"<token_ratio> = {ratio_df['token_ratio'].mean()}\n<words_ratio> = {ratio_df['words_ratio'].mean()}\n")
print(now())

2024/11/25-18:44:19


compress ratio-ollama_highlight_words_ratio-bart  	: 100%|██████████████████████| 11490/11490 [00:42<00:00, 267.38it/s]


<token_ratio> = 0.11746585988740932
<words_ratio> = 0.11178144646179852

2024/11/25-18:45:02
