In [2]:
import re
import json
import pandas as pd
import spacy
import ollama

from tqdm import tqdm

In [3]:
# Load dat/cnn_dailymail_test.csv; later cells read its `id`, `article` and `highlights` columns.
df = pd.read_csv("dat/cnn_dailymail_test.csv")

In [4]:
def ask_ollama(content, system, model='llama3.1'):
    """Send one system + user message pair to an Ollama chat model.

    Args:
        content: Text sent as the user message.
        system: Text sent as the system message.
        model: Name of the Ollama model to query.

    Returns:
        The `content` field of the reply's `message` entry.
    """
    conversation = [
        dict(role='system', content=system),
        dict(role='user', content=content),
    ]
    reply = ollama.chat(model=model, messages=conversation)
    return reply['message']['content']

In [5]:
# Shared ending of every system prompt: what to keep and how the answer must be phrased.
_PROMPT_TAIL = (
    "capturing only the main events and essential details. "
    "The output should be a direct, concise summary without any extra commentary or introductory text."
)


def system_prompt_ratio(ratio, unit="token"):
    """Build a system prompt asking for compression to a fraction of the original length.

    Args:
        ratio: Target fraction; interpolated into the prompt as-is (number or pre-formatted string).
        unit: Unit named in the prompt ("token" by default). Pass "word" when `ratio`
            was computed from word counts so the wording matches the measurement.

    Returns:
        The prompt as a single line with no trailing newline.
    """
    return (
        f"Please compress the following article to approximately {ratio} "
        f"of its original {unit} count, {_PROMPT_TAIL}"
    )


def system_prompt_token(token):
    """Build a system prompt asking for a summary of approximately `token` tokens."""
    return f"Please summarize the following article to approximately {token} tokens, {_PROMPT_TAIL}"


def system_prompt_words(word):
    """Build a system prompt asking for a summary of approximately `word` words."""
    return f"Please summarize the following article to approximately {word} words, {_PROMPT_TAIL}"

# 用 token 的「比例」和「數量」為目標，讓 llama3.1 改寫

In [6]:
from transformers import BartTokenizer

# Tokenizer for facebook/bart-large-cnn; count_token uses it to measure text length in tokens.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
def count_token(text):
    """Return how many token ids the module-level `tokenizer` produces for `text`.

    Special tokens added by the tokenizer are included in the count
    (`add_special_tokens=True`).
    """
    # encode() returns a plain list of ids by default; asking for a "pt" tensor
    # only to take its length was unnecessary work.
    return len(tokenizer.encode(text, add_special_tokens=True))

# Single-article check: compare the reference highlight's token count with what
# llama3.1 returns when given a target ratio versus a target token count.
_article = df['article'][0]
_highlight = df['highlights'][0]

_article_token = count_token(_article)
_highlight_token = count_token(_highlight)
_ratio_token = _highlight_token / _article_token
print(f"token ratio = highlight({_highlight_token} token) / article({_article_token} token) = {_ratio_token}\n\n")

# Round the ratio to two decimals, exactly as the batch runs further down do, so this
# check sends the same kind of prompt instead of a 17-digit float.
token_ratio_result = ask_ollama(_article, system_prompt_ratio(f'{_ratio_token:.2f}'))
print(f"ratio result ({count_token(token_ratio_result)} token):\n{token_ratio_result}\n\n")

token_result = ask_ollama(_article, system_prompt_token(_highlight_token))
print(f"token result ({count_token(token_result)} token):\n{token_result}\n\n")

token ratio = highlight(41 token) / article(694 token) = 0.059077809798270896


ratio result (169 token):
The Palestinian Authority joined the International Criminal Court (ICC) as its 123rd member, giving the court jurisdiction over alleged crimes in Palestinian territories. The accession was marked with a ceremony at The Hague, where Palestinians signed the ICC's founding Rome Statute in January and accepted its jurisdiction since June 13, 2014. This move allows for possible war crimes investigations against Israelis, as well as subjecting Palestinians to counter-charges. Israel and the US opposed the Palestinian efforts, but Palestinian Foreign Minister Riad al-Malki hailed it as a step towards greater justice and peace. Rights group Human Rights Watch welcomed the development, urging governments to end pressure on Palestine for joining the ICC. The ICC will conduct a preliminary examination into alleged war crimes committed in Palestinian territories since June 2014, which could le

# 用 word 的「比例」和「數量」為目標，讓 llama3.1 改寫

In [7]:
def count_words(text):
    """Count the whitespace-separated words in `text`."""
    words = text.split()
    return len(words)

# Single-article check: compare the reference highlight's word count with what
# llama3.1 returns when given a target ratio versus a target word count.
_article = df['article'][0]
_highlight = df['highlights'][0]

_article_words = count_words(_article)
_highlight_words = count_words(_highlight)
_ratio_words = _highlight_words / _article_words
print(f"words ratio = highlight({_highlight_words} words) / article({_article_words} words) = {_ratio_words}\n\n")

# Round the ratio to two decimals, exactly as the batch runs further down do, so this
# check sends the same kind of prompt instead of a 17-digit float.
# NOTE(review): system_prompt_ratio words the target as a "token count" although this ratio is word-based.
words_ratio_result = ask_ollama(_article, system_prompt_ratio(f'{_ratio_words:.2f}'))
print(f"ratio result ({count_words(words_ratio_result)} words):\n{words_ratio_result}\n\n")

words_result = ask_ollama(_article, system_prompt_words(_highlight_words))
print(f"words result ({count_words(words_result)} words):\n{words_result}\n\n")

words ratio = highlight(36 words) / article(567 words) = 0.06349206349206349


ratio result (78 words):
Palestinians became the 123rd member of the International Criminal Court (ICC), giving it jurisdiction over alleged crimes in Palestinian territories. The ICC will now review evidence for possible investigations into war crimes committed by Israelis, with Palestinians potentially facing counter-charges. Israel and the US opposed Palestine's membership bid. A preliminary examination was opened last January, following Palestine's signing of the Rome Statute. The ICC sees "Palestine" as a state for its purposes, despite international disagreement on this definition.


words result (29 words):
The Palestinian Authority joined the International Criminal Court as its 123rd member, giving the court jurisdiction over alleged crimes in Palestinian territories, despite opposition from Israel and the US.




In [8]:
from rouge_score import rouge_scorer
def calculate_rouge_scores(reference_text, generated_text):
    """Score `generated_text` against `reference_text` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Stemming is enabled on the scorer.

    Returns:
        A 3-tuple of F-measures in the order (rouge1, rouge2, rougeL).
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    results = scorer.score(reference_text, generated_text)
    return tuple(results[name].fmeasure for name in metric_names)

# calculate_rouge_scores takes (reference_text, generated_text): the CNN highlight is the
# reference and each llama3.1 output is the generated text. The calls previously passed
# them the other way round; the F-measures are symmetric, so the printed values are
# unchanged, but the arguments now match the parameter names.
token_ratio_rouge = calculate_rouge_scores(_highlight, token_ratio_result)
words_ratio_rouge = calculate_rouge_scores(_highlight, words_ratio_result)
token_rouge = calculate_rouge_scores(_highlight, token_result)
words_rouge = calculate_rouge_scores(_highlight, words_result)

print(f"""\
token_ratio_rouge : {token_ratio_rouge}
words_ratio_rouge : {words_ratio_rouge}
token_rouge       : {token_rouge}
words_rouge       : {words_rouge}
""")

token_ratio_rouge : (0.3181818181818182, 0.2068965517241379, 0.2613636363636364)
words_ratio_rouge : (0.4347826086956521, 0.17699115044247787, 0.2608695652173913)
token_rouge       : (0.4201680672268907, 0.22222222222222224, 0.319327731092437)
words_rouge       : (0.4444444444444445, 0.26229508196721313, 0.38095238095238093)



---

<center>
    <b style='font-size:100pt'>
        實驗過程
    </b>
</center>

---

# (完成) ollama 用 「cnn highlight 的 words 數量」為目標改寫
[2:18:57<00:00,  1.38it/s]

In [8]:
# Disabled (commented-out) batch run, kept for reference: for each row of df it asks
# ask_ollama for a summary of about as many words as the row's CNN highlight and
# appends "<id>, <JSON-encoded summary>" lines to SAVE_RESULT_PATH.
# SAVE_RESULT_PATH = "./dat/cnn_dailymail_test-ollama_words.dat"  # output file name
# SAVE_EVERY_NUM = 5  # flush interval, in rows
# TEMPORARY_SAVE_STRING = ""  # buffer of lines not yet written
# START_INDEX = 0  # index of the first row to process
# for index, row in tqdm(df.iloc[START_INDEX:].iterrows(), total=len(df) - START_INDEX, desc="highlight articles"):
#     try:
#         article_id = row['id']
#         article = row['article']
        
#         highlight_words = count_words(row['highlights'])
#         words_result = ask_ollama(article, system_prompt_words(highlight_words))
        
#         TEMPORARY_SAVE_STRING += f"{article_id}, {json.dumps(words_result)}\n"

#         if index % SAVE_EVERY_NUM == 0:
#             with open(SAVE_RESULT_PATH, "a") as file:
#                 file.write(TEMPORARY_SAVE_STRING)
#             TEMPORARY_SAVE_STRING = ""  # clear the buffer
#     except Exception as e:
#         print(f"Error processing row {index}: {e}")
# if TEMPORARY_SAVE_STRING:
#     with open(SAVE_RESULT_PATH, "a") as file:
#         file.write(TEMPORARY_SAVE_STRING)

# print("Processing complete and all summaries have been saved step-by-step in JSON.")

# (完成) ollama 用 「 llama highlight / cnn article 的 token 比例」為目標改寫
[5:09:48<00:00,  1.62s/it]

In [9]:
# Load ./dat/cnn_dailymail_test-highlighted.csv; the batch loops in this notebook look up
# its `article` column using the row index of df.
llama_highlighted = pd.read_csv('./dat/cnn_dailymail_test-highlighted.csv')

In [11]:
# Disabled (commented-out) batch run, kept for reference: for each row of df it computes
# the token ratio of the matching llama_highlighted article to the original article,
# asks ask_ollama to compress the article to that ratio (two decimals), and appends
# "<id>, <JSON-encoded summary>" lines to SAVE_RESULT_PATH.
# SAVE_RESULT_PATH = "./dat/cnn_dailymail_test-ollama_highlight_token_ratio.dat"  # output file name
# SAVE_EVERY_NUM = 5  # flush interval, in rows
# TEMPORARY_SAVE_STRING = ""  # buffer of lines not yet written
# START_INDEX = 0  # index of the first row to process
# for index, row in tqdm(df.iloc[START_INDEX:].iterrows(), total=len(df) - START_INDEX, desc="highlight articles"):
#     try:
#         article_id = row['id']
#         article = row['article']
#         llama_highlighted_article = llama_highlighted['article'][index]

#         ratio = count_token(llama_highlighted_article) / count_token(article)
#         words_result = ask_ollama(article, system_prompt_ratio(f'{ratio:.2f}'))
        
#         TEMPORARY_SAVE_STRING += f"{article_id}, {json.dumps(words_result)}\n"

#         if index % SAVE_EVERY_NUM == 0:
#             with open(SAVE_RESULT_PATH, "a") as file:
#                 file.write(TEMPORARY_SAVE_STRING)
#             TEMPORARY_SAVE_STRING = ""  # clear the buffer
#     except Exception as e:
#         print(f"Error processing row {index}: {e}")
# if TEMPORARY_SAVE_STRING:
#     with open(SAVE_RESULT_PATH, "a") as file:
#         file.write(TEMPORARY_SAVE_STRING)

# print("Processing complete and all summaries have been saved step-by-step in JSON.")

# (完成) ollama 用 「 llama highlight 的 words 數量」為目標改寫
[9:53:52<00:00,  3.10s/it]

In [30]:
# Disabled (commented-out) batch run, kept for reference: for each row of df it asks
# ask_ollama for a summary of about as many words as the matching llama_highlighted
# article and appends "<id>, <JSON-encoded summary>" lines to SAVE_RESULT_PATH.
# SAVE_RESULT_PATH = "./dat/cnn_dailymail_test-ollama_highlight_words.dat"  # output file name
# SAVE_EVERY_NUM = 5  # flush interval, in rows
# TEMPORARY_SAVE_STRING = ""  # buffer of lines not yet written
# START_INDEX = 0  # index of the first row to process
# for index, row in tqdm(df.iloc[START_INDEX:].iterrows(), total=len(df) - START_INDEX, desc="rewrite articles"):
#     try:
#         article_id = row['id']
#         article = row['article']
#         llama_highlighted_article = llama_highlighted['article'][index]

#         llama_highlighted_words = count_words(llama_highlighted_article)
#         words_result = ask_ollama(article, system_prompt_words(llama_highlighted_words))
        
#         TEMPORARY_SAVE_STRING += f"{article_id}, {json.dumps(words_result)}\n"

#         if index % SAVE_EVERY_NUM == 0:
#             with open(SAVE_RESULT_PATH, "a") as file:
#                 file.write(TEMPORARY_SAVE_STRING)
#             TEMPORARY_SAVE_STRING = ""  # clear the buffer
#     except Exception as e:
#         print(f"Error processing row {index}: {e}")
# if TEMPORARY_SAVE_STRING:
#     with open(SAVE_RESULT_PATH, "a") as file:
#         file.write(TEMPORARY_SAVE_STRING)

# print("Processing complete and all summaries have been saved step-by-step in JSON.")

# (完成) ollama 用 「cnn highlight 的 token 比例」為目標改寫

[3:27:53]

In [13]:
# Disabled (commented-out) batch run, kept for reference: for each row of df it computes
# the token ratio of the CNN highlight to the article, asks ask_ollama to compress the
# article to that ratio (two decimals), and appends "<id>, <JSON-encoded summary>"
# lines to SAVE_RESULT_PATH.
# SAVE_RESULT_PATH = "./dat/cnn_dailymail_test-ollama_cnn_highlight_token_ratio.dat"  # output file name
# SAVE_EVERY_NUM = 5  # flush interval, in rows
# TEMPORARY_SAVE_STRING = ""  # buffer of lines not yet written
# START_INDEX = 0  # index of the first row to process
# for index, row in tqdm(df.iloc[START_INDEX:].iterrows(), total=len(df) - START_INDEX, desc="highlight articles"):
#     try:
#         article_id = row['id']
#         article = row['article']
#         cnn_highlights_article = row['highlights']
        
#         ratio = count_token(cnn_highlights_article) / count_token(article)
#         words_result = ask_ollama(article, system_prompt_ratio(f'{ratio:.2f}'))
        
#         TEMPORARY_SAVE_STRING += f"{article_id}, {json.dumps(words_result)}\n"

#         if index % SAVE_EVERY_NUM == 0:
#             with open(SAVE_RESULT_PATH, "a") as file:
#                 file.write(TEMPORARY_SAVE_STRING)
#             TEMPORARY_SAVE_STRING = ""  # clear the buffer
#     except Exception as e:
#         print(f"Error processing row {index}: {e}")
# if TEMPORARY_SAVE_STRING:
#     with open(SAVE_RESULT_PATH, "a") as file:
#         file.write(TEMPORARY_SAVE_STRING)

# print("Processing complete and all summaries have been saved step-by-step in JSON.")

# (完成) ollama 用 「cnn highlight 的 words 比例」為目標改寫

[3:25:07]

In [10]:
# Batch run: for every row of df, ask llama3.1 to compress the article to the word ratio
# of its CNN highlight, and append "<id>, <JSON-encoded summary>" lines to SAVE_RESULT_PATH.
SAVE_RESULT_PATH = "./dat/cnn_dailymail_test-ollama_cnn_highlight_words_ratio.dat"  # output file (opened in append mode)
SAVE_EVERY_NUM = 5  # flush the buffer to disk once it holds this many summaries
TEMPORARY_SAVE_STRING = ""  # summaries generated but not yet written
START_INDEX = 0  # position of the first row to process; raise it to resume a run
pending_rows = 0  # number of summaries currently held in TEMPORARY_SAVE_STRING
failed_rows = []  # index labels of rows that raised, so they can be retried afterwards

try:
    for index, row in tqdm(df.iloc[START_INDEX:].iterrows(), total=len(df) - START_INDEX, desc="highlight articles"):
        try:
            article_id = row['id']
            article = row['article']
            cnn_highlights_article = row['highlights']

            # Target compression = highlight word count / article word count.
            # NOTE(review): system_prompt_ratio words the target as a "token count"
            # although this ratio is word-based.
            ratio = count_words(cnn_highlights_article) / count_words(article)
            words_result = ask_ollama(article, system_prompt_ratio(f'{ratio:.2f}'))
            line = f"{article_id}, {json.dumps(words_result)}\n"
        except Exception as e:
            # Best effort: report the row, remember it, and keep going.
            failed_rows.append(index)
            print(f"Error processing row {index}: {e}")
            continue

        TEMPORARY_SAVE_STRING += line
        pending_rows += 1

        # Flush on the number of buffered rows rather than on the index label, so the
        # cadence holds regardless of START_INDEX or of rows that failed.
        if pending_rows >= SAVE_EVERY_NUM:
            try:
                with open(SAVE_RESULT_PATH, "a") as file:
                    file.write(TEMPORARY_SAVE_STRING)
            except Exception as e:
                # Keep the buffer; the write is retried after the next row.
                print(f"Error processing row {index}: {e}")
            else:
                TEMPORARY_SAVE_STRING = ""
                pending_rows = 0
finally:
    # Runs on normal completion and on interruption alike, so summaries that were
    # already generated are not lost when a multi-hour run is stopped.
    if TEMPORARY_SAVE_STRING:
        with open(SAVE_RESULT_PATH, "a") as file:
            file.write(TEMPORARY_SAVE_STRING)
        TEMPORARY_SAVE_STRING = ""
        pending_rows = 0

print("Processing complete and all summaries have been saved step-by-step in JSON.")

highlight articles: 100%|██████████████████████████████████████████████████████| 11490/11490 [3:25:07<00:00,  1.07s/it]

Processing complete and all summaries have been saved step-by-step in JSON.





# (完成) ollama 用 「llama highlight 的 words 比例」為目標改寫

[5:04:32]

In [11]:
# Batch run: for every row of df, ask llama3.1 to compress the article to the word ratio
# of the matching llama_highlighted article, and append "<id>, <JSON-encoded summary>"
# lines to SAVE_RESULT_PATH.
SAVE_RESULT_PATH = "./dat/cnn_dailymail_test-ollama_highlight_words_ratio.dat"  # output file (opened in append mode)
SAVE_EVERY_NUM = 5  # flush the buffer to disk once it holds this many summaries
TEMPORARY_SAVE_STRING = ""  # summaries generated but not yet written
START_INDEX = 0  # position of the first row to process; raise it to resume a run
pending_rows = 0  # number of summaries currently held in TEMPORARY_SAVE_STRING
failed_rows = []  # index labels of rows that raised, so they can be retried afterwards

try:
    for index, row in tqdm(df.iloc[START_INDEX:].iterrows(), total=len(df) - START_INDEX, desc="highlight articles"):
        try:
            article_id = row['id']
            article = row['article']
            llama_highlighted_article = llama_highlighted['article'][index]

            # Target compression = llama-highlighted word count / article word count.
            # NOTE(review): system_prompt_ratio words the target as a "token count"
            # although this ratio is word-based.
            ratio = count_words(llama_highlighted_article) / count_words(article)
            words_result = ask_ollama(article, system_prompt_ratio(f'{ratio:.2f}'))
            line = f"{article_id}, {json.dumps(words_result)}\n"
        except Exception as e:
            # Best effort: report the row, remember it, and keep going.
            failed_rows.append(index)
            print(f"Error processing row {index}: {e}")
            continue

        TEMPORARY_SAVE_STRING += line
        pending_rows += 1

        # Flush on the number of buffered rows rather than on the index label, so the
        # cadence holds regardless of START_INDEX or of rows that failed.
        if pending_rows >= SAVE_EVERY_NUM:
            try:
                with open(SAVE_RESULT_PATH, "a") as file:
                    file.write(TEMPORARY_SAVE_STRING)
            except Exception as e:
                # Keep the buffer; the write is retried after the next row.
                print(f"Error processing row {index}: {e}")
            else:
                TEMPORARY_SAVE_STRING = ""
                pending_rows = 0
finally:
    # Runs on normal completion and on interruption alike, so summaries that were
    # already generated are not lost when a multi-hour run is stopped.
    if TEMPORARY_SAVE_STRING:
        with open(SAVE_RESULT_PATH, "a") as file:
            file.write(TEMPORARY_SAVE_STRING)
        TEMPORARY_SAVE_STRING = ""
        pending_rows = 0

print("Processing complete and all summaries have been saved step-by-step in JSON.")

highlight articles: 100%|██████████████████████████████████████████████████████| 11490/11490 [5:04:32<00:00,  1.59s/it]

Processing complete and all summaries have been saved step-by-step in JSON.





# 檢查 llama3.1 生成出來的壓縮比

In [31]:
def count_words(text):
    """Count the whitespace-separated words in `text`.

    Same definition as the `count_words` in the earlier word-count experiment cell.
    """
    pieces = text.split()
    return len(pieces)
    
# Re-run the word-based check on the first `article` entry of a saved result file.
# NOTE(review): presumably this CSV holds the llama3.1 rewrites from the word-count
# batch run — confirm how it was produced from the .dat output.
text = pd.read_csv("dat/cnn_dailymail_test-ollama_highlight_words.csv")

_article = text['article'][0]
_highlight = df['highlights'][0]

_article_words = count_words(_article)
_highlight_words = count_words(_highlight)
_ratio_words = _highlight_words / _article_words
print(f"words ratio = highlight({_highlight_words} words) / article({_article_words} words) = {_ratio_words}\n\n")

# Round the ratio to two decimals, exactly as the batch runs do, so this check sends
# the same kind of prompt instead of a 17-digit float.
words_ratio_result = ask_ollama(_article, system_prompt_ratio(f'{_ratio_words:.2f}'))
print(f"ratio result ({count_words(words_ratio_result)} words):\n{words_ratio_result}\n\n")

words_result = ask_ollama(_article, system_prompt_words(_highlight_words))
print(f"words result ({count_words(words_result)} words):\n{words_result}\n\n")

words ratio = highlight(36 words) / article(270 words) = 0.13333333333333333


ratio result (101 words):
Palestinian Authority became ICC's 123rd member, gaining jurisdiction over alleged crimes in Palestinian territories. They signed Rome Statute in January and accepted ICC jurisdiction over crimes since June 13, 2014. This allows for possible war crimes investigations against Israelis. Israel and US opposed the move, but Palestinian FM Riad al-Malki said it brings an end to impunity and injustice. Palestinians may be subject to counter-charges as well. Human Rights Watch welcomed the development, calling on governments to stop pressure against Palestine's ICC membership. ICC opened preliminary examination into situation in Palestinian territories in January, including alleged war crimes since June 2014.


words result (24 words):
Palestinians join ICC as its 123rd member, granting court jurisdiction over alleged crimes in occupied territories, allowing for possible war crimes investi

In [33]:
from transformers import BartTokenizer

# Re-creates the facebook/bart-large-cnn tokenizer (same call as in the earlier
# token-count experiment cell); count_token uses it.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
def count_token(text):
    """Return how many token ids the module-level `tokenizer` produces for `text`.

    Special tokens added by the tokenizer are included in the count
    (`add_special_tokens=True`).
    """
    # encode() returns a plain list of ids by default; asking for a "pt" tensor
    # only to take its length was unnecessary work.
    return len(tokenizer.encode(text, add_special_tokens=True))

# Re-run the token-based check on the first `article` entry of a saved result file.
# NOTE(review): presumably this CSV holds the llama3.1 rewrites from the token-ratio
# batch run — confirm how it was produced from the .dat output.
text = pd.read_csv("dat/cnn_dailymail_test-ollama_highlight_token_ratio.csv")

_article = text['article'][0]
_highlight = df['highlights'][0]

_article_token = count_token(_article)
_highlight_token = count_token(_highlight)
_ratio_token = _highlight_token / _article_token
print(f"token ratio = highlight({_highlight_token} token) / article({_article_token} token) = {_ratio_token}\n\n")

# Round the ratio to two decimals, exactly as the batch runs do, so this check sends
# the same kind of prompt instead of a 17-digit float.
token_ratio_result = ask_ollama(_article, system_prompt_ratio(f'{_ratio_token:.2f}'))
print(f"ratio result ({count_token(token_ratio_result)} token):\n{token_ratio_result}\n\n")

token_result = ask_ollama(_article, system_prompt_token(_highlight_token))
print(f"token result ({count_token(token_result)} token):\n{token_result}\n\n")

token ratio = highlight(41 token) / article(247 token) = 0.1659919028340081


ratio result (105 token):
Palestine formally joined the International Criminal Court (ICC) on Wednesday, becoming its 123rd member. This gives the ICC jurisdiction over alleged crimes in Palestinian territories since June 2014, including East Jerusalem. The move allows Palestinians to be subject to counter-charges as well. Israel and the US opposed Palestine's membership bid but Palestinian Foreign Minister Riad al-Malki hailed it as a step towards justice. The ICC will now consider war crime allegations against Israelis, with rights group Human Rights Watch welcoming the development.


token result (77 token):
Palestinian Authority joins International Criminal Court, becoming 123rd member. This gives the court jurisdiction over alleged crimes in Palestinian territories. Ceremony held at The Hague. Palestinians may now face counter-charges. Israel and US opposed membership. Palestinian FM Riad al-Malki sees t