## Experiment 1

In [None]:
# CUDA check: print whether PyTorch reports an available CUDA device
import torch
print(torch.cuda.is_available())

### Custom Tokenizer creation

In [None]:
from transformers import AutoTokenizer
import pandas as pd
import os

# Load the base tokenizer; custom vocabulary is added to this object in a later cell
base_model_path = "pythainlp/wangchanglm-7.5B-sft-enth" 
print("Loading base tokenizer...")
# AutoTokenizer resolves the tokenizer from the model identifier above
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

#### Custom words

In [None]:
# Read the raw vocabulary file (one candidate word per line)
with open("custom_vocab_5w1h.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Strip whitespace, drop blank entries, remove duplicates, and sort
unique_lines = sorted({entry for entry in map(str.strip, lines) if entry})

print("\n----- words after preprocessing -----")
for line in unique_lines:
    print(line)

In [None]:
# Overwrite the vocabulary file with the cleaned entries, one per line
with open("custom_vocab_5w1h.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{line}\n" for line in unique_lines)

print("\nNew words saved")

In [None]:
# Read the cleaned vocabulary back, ignoring blank lines
with open("custom_vocab_5w1h.txt", "r", encoding="utf-8") as f:
    custom_tokens = [word for word in map(str.strip, f) if word]

# Register the words with the tokenizer; add_tokens returns how many were new
num_added = tokenizer.add_tokens(custom_tokens)
print(f"‚úÖ Added {num_added} new tokens.")

# Persist the extended tokenizer so later cells can reload it from disk
tokenizer.save_pretrained("custom_tokenizer_5w1h")

### Step0: Load dataset 

In [None]:
import pandas as pd

# Load the dataset; later cells read the columns Input_Sec1, Input_Sec2, Title and Output_Sec1
df = pd.read_csv("Updated_Datasets.csv")
# Last expression of the cell: shows the first five rows in the notebook
df.head(5)

### Step1: Load Custom Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the custom tokenizer from the directory written by save_pretrained above.
# This rebinds the global `tokenizer` name.
tokenizer_path = "./custom_tokenizer_5w1h" 
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

print("Custom Tokenizer loaded successfully.")
# len(tokenizer) is the vocabulary size reported by the tokenizer object
print("Vocab size:", len(tokenizer))


#### Tokenization Test

In [None]:
def tokenize_and_check(text):
    """Tokenize *text* with the global ``tokenizer`` and summarize the result.

    Returns a dict with keys ``original_text``, ``tokens`` and ``num_tokens``.
    Anything that is not a ``str`` (e.g. a missing cell) yields ``None``;
    callers must be prepared for that.
    """
    if isinstance(text, str):
        # No special tokens are added, so only the text's own pieces are counted
        ids = tokenizer(text, add_special_tokens=False, return_tensors=None)['input_ids']
        pieces = tokenizer.convert_ids_to_tokens(ids)
        return dict(original_text=text, tokens=pieces, num_tokens=len(pieces))
    # Alternative considered by the author: return an empty summary dict instead
    return None


In [None]:
# Tokenize using Custom Tokenizer
def tokenize_and_check(text):
    """Return the token breakdown of *text* using the global ``tokenizer``.

    This repeats the definition from the previous cell so the cell can be run
    on its own. Non-string input returns ``None``; string input returns a dict
    with ``original_text``, ``tokens`` and ``num_tokens``.
    """
    if not isinstance(text, str):
        # Alternative considered by the author: return an empty summary dict
        return None
    token_ids = tokenizer(text, add_special_tokens=False, return_tensors=None)['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    summary = {"original_text": text, "tokens": tokens}
    summary["num_tokens"] = len(tokens)
    return summary

# Tokenize every Input_Sec1 cell; non-string cells produce None (see tokenize_and_check)
tokenized_results_input1 = df['Input_Sec1'].apply(tokenize_and_check)

# Preview the first three records. A None result (non-text cell) is reported
# instead of being subscripted, which previously raised TypeError.
for idx, result in enumerate(tokenized_results_input1.head(3)):
    print(f"\n==== Record {idx+1} ====")
    if result is None:
        print("Original Text: <non-text value, not tokenized>")
        continue
    print(f"Original Text: {result['original_text']}")
    print(f"Number of Tokens: {result['num_tokens']}")
    print(f"Tokens: {result['tokens']}")

In [None]:
# Tokenize every Input_Sec2 cell; non-string cells produce None (see tokenize_and_check)
tokenized_results_input2 = df['Input_Sec2'].apply(tokenize_and_check)

# Preview the first three records. A None result (non-text cell) is reported
# instead of being subscripted, which previously raised TypeError.
for idx, result in enumerate(tokenized_results_input2.head(3)):
    print(f"\n==== Record {idx+1} ====")
    if result is None:
        print("Original Text: <non-text value, not tokenized>")
        continue
    print(f"Original Text: {result['original_text']}")
    print(f"Number of Tokens: {result['num_tokens']}")
    print(f"Tokens: {result['tokens']}")

### Step 2: Preprocessing

In [None]:
import re

def preprocess_text(text):
    """Normalize sentence spacing and swap a few informal words for formal ones.

    The text is split after each period followed by whitespace, every piece is
    stripped, a small informal -> formal replacement table is applied, and the
    pieces are re-joined with single spaces.

    Non-string input (for example NaN from an empty CSV cell) is returned
    unchanged instead of raising ``TypeError`` inside ``re.split``.
    """
    if not isinstance(text, str):
        return text

    # Rough sentence split (period followed by whitespace)
    sentences = [piece.strip() for piece in re.split(r'(?<=[.])\s+', text)]
    sentences = [piece for piece in sentences if piece]

    # Slight formality boost: replace informal words (examples).
    # Insertion order matters: the longer entry sharing a prefix comes first.
    replacements = {
        "‡∏ä‡πà‡∏ß‡∏¢": "‡∏Å‡∏£‡∏∏‡∏ì‡∏≤",
        "‡∏î‡∏π‡πÅ‡∏•": "‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö",
        "‡∏î‡∏π": "‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö",
        "‡∏ö‡∏≠‡∏Å": "‡πÅ‡∏à‡πâ‡∏á",
        "‡πÉ‡∏´‡πâ": "‡∏≠‡∏ô‡∏∏‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå",
    }
    boosted_sentences = []
    for sentence in sentences:
        for informal, formal in replacements.items():
            # re.escape keeps the key literal even if a future entry contains
            # regex metacharacters.
            # NOTE(review): \b relies on word boundaries, which running Thai
            # text (no spaces between words) may not provide — confirm that
            # these replacements actually fire on real data.
            sentence = re.sub(rf'\b{re.escape(informal)}\b', formal, sentence)
        boosted_sentences.append(sentence)

    return " ".join(boosted_sentences)

df['Input_Sec1'] = df['Input_Sec1'].apply(preprocess_text)

# Preview the first three preprocessed records. Positions are counted with
# enumerate rather than taken from the index labels, and str() keeps the
# slice valid for non-text cells.
for position, text in enumerate(df['Input_Sec1'].head(3), start=1):
    print(f"\nüßπ ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á Record {position}")
    print(f"Preprocessed Text:\n{str(text)[:300]}...")

In [None]:
df['Input_Sec2'] = df['Input_Sec2'].apply(preprocess_text)

# Preview the first three preprocessed records. Positions are counted with
# enumerate rather than taken from the index labels, and str() keeps the
# slice valid for non-text cells.
for position, text in enumerate(df['Input_Sec2'].head(3), start=1):
    print(f"\nüßπ ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á Record {position}")
    print(f"Preprocessed Text:\n{str(text)[:300]}...")

#### Filtering

In [None]:
import re

# Expand abbreviation function
def expand_abbreviations(text, abbreviation_dict):
    """Rewrite each known abbreviation in *text* as ``"<full name> (<abbr>)"``.

    Args:
        text: Input string. Non-string values (e.g. NaN) are returned unchanged.
        abbreviation_dict: Mapping of abbreviation -> full name.

    Returns:
        The text with every standalone occurrence of a key expanded.

    All abbreviations are matched in a single pass, longest first. This fixes
    three problems of the former one-substitution-per-key loop:

    * a shorter key that is a suffix of a longer one was expanded a second
      time inside the "(abbr)" text that had just been inserted;
    * a trailing ``\\b`` after an abbreviation ending in "." only matched when
      a word character followed, so "abbr. " before a space was never expanded;
    * the replacement was parsed as a regex template, so a backslash in a full
      name was misinterpreted.
    """
    if not isinstance(text, str) or not abbreviation_dict:
        return text

    def _guarded(abbr):
        # Require a non-word neighbour only on the sides where the
        # abbreviation itself starts/ends with a word character.
        head = r'(?<!\w)' if re.match(r'\w', abbr[0]) else ''
        tail = r'(?!\w)' if re.match(r'\w', abbr[-1]) else ''
        return head + re.escape(abbr) + tail

    # Longest first so that, at the same start position, the most specific
    # abbreviation wins the alternation. Empty keys are ignored.
    ordered = sorted((abbr for abbr in abbreviation_dict if abbr), key=len, reverse=True)
    if not ordered:
        return text
    pattern = re.compile('|'.join(_guarded(abbr) for abbr in ordered))

    def _expand(match):
        abbr = match.group(0)
        return f"{abbreviation_dict[abbr]} ({abbr})"

    return pattern.sub(_expand, text)

# Example abbreviation dictionary
# Abbreviation -> full name table used by expand_abbreviations.
# Fix: the full name for the "...‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£" sub-unit entry was missing one
# character compared with the standalone "‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£" entry; both now agree.
abbreviation_dict = {
    "‡∏Å.‡∏Ñ.": "‡∏Å‡∏£‡∏Å‡∏é‡∏≤‡∏Ñ‡∏°",
    "‡∏Å.‡∏û.": "‡∏Å‡∏∏‡∏°‡∏†‡∏≤‡∏û‡∏±‡∏ô‡∏ò‡πå",
    "‡∏Å.‡∏¢.": "‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô",
    "‡∏Å‡∏Å‡∏ó.‡∏®‡∏ó‡∏ó.‡∏™‡∏™.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡πÇ‡∏ó‡∏£‡∏Ñ‡∏°‡∏ô‡∏≤‡∏Ñ‡∏° ‡∏®‡∏π‡∏ô‡∏¢‡πå‡∏Å‡∏≤‡∏£‡πÇ‡∏ó‡∏£‡∏Ñ‡∏°‡∏ô‡∏≤‡∏Ñ‡∏°‡∏ó‡∏´‡∏≤‡∏£ ‡∏Å‡∏£‡∏°‡∏Å‡∏≤‡∏£‡∏™‡∏∑‡πà‡∏≠‡∏™‡∏≤‡∏£‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏Å‡∏•.‡∏ö‡∏Å.‡∏™‡∏õ‡∏ó.": "‡∏Å‡∏≠‡∏á‡∏Å‡∏•‡∏≤‡∏á ‡∏Å‡∏≠‡∏á‡∏ö‡∏±‡∏ç‡∏ä‡∏≤‡∏Å‡∏≤‡∏£ ‡∏™‡∏ñ‡∏≤‡∏ö‡∏±‡∏ô‡∏ß‡∏¥‡∏ä‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ô‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®",
    "‡∏Å‡∏Ç‡∏™.‡∏¢‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏≠‡∏á‡∏Ç‡∏ô‡∏™‡πà‡∏á ‡∏¢‡∏∏‡∏ó‡∏ò‡∏ö‡∏£‡∏¥‡∏Å‡∏≤‡∏£‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏ï‡∏õ.‡∏™‡∏õ‡∏ä.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏≠‡∏á‡∏ï‡∏¥‡∏î‡∏ï‡∏≤‡∏°‡πÅ‡∏•‡∏∞‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ú‡∏• ‡∏™‡∏≥‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡∏õ‡∏•‡∏±‡∏î‡∏ö‡∏±‡∏ç‡∏ä‡∏µ‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏ï‡∏™.‡∏™‡∏ï‡∏õ.": "‡∏Å‡∏≠‡∏á‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö ‡∏™‡∏≥‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏†‡∏≤‡∏¢‡πÉ‡∏ô",
    "‡∏Å‡∏ó‡∏î.‡∏ö‡∏Å.‡∏™‡∏õ‡∏ó.": "‡∏Å‡∏≠‡∏á‡∏ó‡∏î‡∏™‡∏≠‡∏ö ‡∏Å‡∏≠‡∏á‡∏ö‡∏±‡∏ç‡∏ä‡∏≤‡∏Å‡∏≤‡∏£ ‡∏™‡∏ñ‡∏≤‡∏ö‡∏±‡∏ô‡∏ß‡∏¥‡∏ä‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ô‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®",
    "‡∏Å‡∏ó‡∏û.‡∏Å‡∏û.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏≠‡∏á‡∏ó‡∏±‡∏û‡∏û‡∏¥‡πÄ‡∏®‡∏© ‡∏Å‡∏≠‡∏á‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏û‡∏•‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏ô‡∏ú.‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏≠‡∏á‡∏ô‡πÇ‡∏¢‡∏ö‡∏≤‡∏¢‡πÅ‡∏•‡∏∞‡πÅ‡∏ú‡∏ô ‡∏Å‡∏£‡∏°‡∏Å‡∏¥‡∏à‡∏Å‡∏≤‡∏£‡∏û‡∏•‡πÄ‡∏£‡∏∑‡∏≠‡∏ô‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏ô‡∏ú.‡∏™‡∏ô‡∏û.‡∏Å‡∏û.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏≠‡∏á‡∏ô‡πÇ‡∏¢‡∏ö‡∏≤‡∏¢‡πÅ‡∏•‡∏∞‡πÅ‡∏ú‡∏ô ‡∏™‡∏≥‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡∏ô‡πÇ‡∏¢‡∏ö‡∏≤‡∏¢‡πÅ‡∏•‡∏∞‡πÅ‡∏ú‡∏ô ‡∏Å‡∏≠‡∏á‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏û‡∏•‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏û.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏≠‡∏á‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏û‡∏•‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏Å‡∏°‡∏®.‡∏ö‡∏Å.‡∏™‡∏õ‡∏ó.": "‡∏Å‡∏≠‡∏á‡∏°‡∏≤‡∏ï‡∏£‡∏ê‡∏≤‡∏ô‡∏Å‡∏≤‡∏£‡∏®‡∏∂‡∏Å‡∏©‡∏≤ ‡∏Å‡∏≠‡∏á‡∏ö‡∏±‡∏ç‡∏ä‡∏≤‡∏Å‡∏≤‡∏£ ‡∏™‡∏ñ‡∏≤‡∏ö‡∏±‡∏ô‡∏ß‡∏¥‡∏ä‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ô‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®",
    "‡∏Å‡∏£.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏£‡∏°‡∏Å‡∏¥‡∏à‡∏Å‡∏≤‡∏£‡∏û‡∏•‡πÄ‡∏£‡∏∑‡∏≠‡∏ô‡∏ó‡∏´‡∏≤‡∏£",
    "‡∏™‡∏ö.‡∏ó‡∏´‡∏≤‡∏£": "‡∏Å‡∏£‡∏°‡∏™‡∏≤‡∏£‡∏ö‡∏£‡∏£‡∏ì‡∏ó‡∏´‡∏≤‡∏£",
}


In [None]:
# Expand abbreviations in both input columns (Input_Sec2 first, then Input_Sec1);
# the lookup table is forwarded to expand_abbreviations as an extra positional argument
df['Input_Sec2'] = df['Input_Sec2'].apply(expand_abbreviations, args=(abbreviation_dict,))
df['Input_Sec1'] = df['Input_Sec1'].apply(expand_abbreviations, args=(abbreviation_dict,))

In [None]:
def create_prompt_fewshot(input_sec2, input_sec1):
    """Build the summarization prompt sent to the chat model.

    ``input_sec2`` is embedded first and ``input_sec1`` second, each wrapped
    in triple double quotes. The template's leading newline and four-space
    indentation are part of the returned string.
    """
    return f"""
    ‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏∑‡∏≠‡∏ú‡∏π‡πâ‡∏ä‡πà‡∏ß‡∏¢‡∏™‡∏£‡∏∏‡∏õ‡∏´‡∏ô‡∏±‡∏á‡∏™‡∏∑‡∏≠‡∏£‡∏≤‡∏ä‡∏Å‡∏≤‡∏£
    ‡πÉ‡∏´‡πâ‡∏≠‡πà‡∏≤‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡∏Å‡∏≥‡∏´‡∏ô‡∏î ‡πÅ‡∏•‡∏∞‡∏™‡∏£‡∏∏‡∏õ‡πÄ‡∏ô‡∏∑‡πâ‡∏≠‡∏´‡∏≤‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Å‡∏£‡∏∞‡∏ä‡∏±‡∏ö ‡∏ä‡∏±‡∏î‡πÄ‡∏à‡∏ô ‡πÅ‡∏•‡∏∞‡πÄ‡∏õ‡πá‡∏ô‡∏†‡∏≤‡∏©‡∏≤‡∏ó‡∏≤‡∏á‡∏Å‡∏≤‡∏£
    ‡∏´‡πâ‡∏≤‡∏°‡πÅ‡∏ï‡πà‡∏á‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏à‡∏≤‡∏Å‡∏ó‡∏µ‡πà‡∏Å‡∏≥‡∏´‡∏ô‡∏î
    ‡πÉ‡∏´‡πâ‡∏ô‡∏≥‡πÄ‡∏™‡∏ô‡∏≠‡πÄ‡∏õ‡πá‡∏ô‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡πà‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô ‡πÑ‡∏°‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏à‡∏±‡∏î‡∏´‡∏°‡∏ß‡∏î‡∏´‡∏°‡∏π‡πà ‡πÑ‡∏°‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏ï‡∏≠‡∏ö‡πÄ‡∏õ‡πá‡∏ô JSON

    ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏´‡∏°‡πà‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏™‡∏Å‡∏±‡∏î:
    \"\"\"{input_sec2}\"\"\"

    ‡πÄ‡∏´‡∏ï‡∏∏‡∏ú‡∏•‡πÄ‡∏ö‡∏∑‡πâ‡∏≠‡∏á‡∏´‡∏•‡∏±‡∏á:
    \"\"\"{input_sec1}\"\"\"

    ‡πÇ‡∏õ‡∏£‡∏î‡∏ï‡∏≠‡∏ö‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏™‡∏£‡∏∏‡∏õ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏¢‡πà‡∏≠‡πÄ‡∏õ‡πá‡∏ô‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô
    """


### Step 3: Experiment

#### Abstractive Mode: Ollama

In [None]:
# Configuration for Ollama: chat endpoint URL and model name used by query_ollama_chat
Ollama_API_URL = "http://localhost:11434/api/chat"
Ollama_Model_Name = "wangchanglm"

In [None]:
import requests

def query_ollama_chat(prompt):
    """Send *prompt* as a single user message to the configured Ollama endpoint.

    Returns the ``message.content`` field of the JSON reply. Any failure
    (network error, HTTP error status, unexpected payload) is printed and
    ``None`` is returned, so callers must handle a missing answer.
    """
    body = {
        "model": Ollama_Model_Name,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False
    }
    try:
        # Non-streaming request with a 120-second timeout
        reply = requests.post(Ollama_API_URL, json=body, timeout=120)
        reply.raise_for_status()
        return reply.json()['message']['content']
    except Exception as e:
        # Best effort by design: report the problem and fall through to None
        print(f"‚ö†Ô∏è Error calling Ollama: {e}")
    return None


#### Extractive Mode

In [None]:
from transformers import AutoModel

# Configuration for WangchanBERTa (used by the extractive mode).
# NOTE: this rebinds the global `tokenizer`, which earlier cells bound to the
# custom tokenizer; tokenize_and_check reads that global, so running it after
# this cell uses the WangchanBERTa tokenizer instead.
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Use the GPU when PyTorch reports one, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
def get_sentence_embedding(text):
    """Encode *text* with the global ``tokenizer``/``model`` pair.

    Returns ``last_hidden_state[:, 0, :]`` (the vector at the first token
    position — presumably the sequence-start token; confirm for this model)
    as a NumPy array on the CPU. Input is truncated by the tokenizer.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(device)
    # Inference only: no gradients needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    first_position = hidden_states[:, 0, :]
    return first_position.cpu().numpy()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def extract_sentences(title, input_sec1, content, top_k=3):
    """Pick the ``top_k`` sentences of *content* most similar to the anchor text.

    The anchor is ``title + " " + input_sec1``. *content* is split after
    ".", "!" or "?" followed by whitespace; each sentence is embedded with
    ``get_sentence_embedding`` and ranked by cosine similarity to the anchor.

    Args:
        title: Document title; null returns "".
        input_sec1: Background text; null is treated as an empty string
            (previously this raised ``TypeError`` in the concatenation).
        content: Text to extract from; null or sentence-free text returns "".
        top_k: Number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces, ordered from most to
        least similar (not in document order).
    """
    if pd.isnull(title) or pd.isnull(content):
        return ""

    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', content)]
    sentences = [s for s in sentences if s]
    if not sentences:
        return ""

    # Tolerate a missing background section instead of failing on str + NaN
    background = "" if pd.isnull(input_sec1) else input_sec1
    anchor_text = title + " " + background

    anchor_emb = get_sentence_embedding(anchor_text)
    sims = [
        cosine_similarity(anchor_emb, get_sentence_embedding(sent))[0][0]
        for sent in sentences
    ]
    # Stable sort: ties keep document order
    top_indices = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)[:top_k]
    return ' '.join(sentences[i] for i in top_indices)

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

def rouge_l_score(ref, pred):
    """Return the ROUGE-L F-measure of *pred* against the reference *ref*.

    NOTE(review): the scorer is built with its default tokenizer and
    ``use_stemmer=True``; these look English-oriented, so confirm that Thai
    text is tokenized meaningfully before trusting the scores.
    """
    metric = 'rougeL'
    scores = rouge_scorer.RougeScorer([metric], use_stemmer=True).score(ref, pred)
    return scores[metric].fmeasure

def bleu_score(ref, pred):
    """Return the smoothed (method4) sentence-level BLEU of *pred* against *ref*.

    Both texts are tokenized with ``str.split()``, i.e. on whitespace.
    NOTE(review): Thai is normally written without spaces between words, so
    whitespace tokens may be whole phrases — confirm this is intended.
    """
    smoothing = SmoothingFunction().method4
    reference_tokens = ref.split()
    candidate_tokens = pred.split()
    return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)

#### Process

In [None]:
# EXPERIMENT_MODE = ["extractive", "abstractive"]

In [None]:
from tqdm import tqdm

extractive_outputs = []
abstractive_outputs = []
human_outputs = df['Output_Sec1'].tolist()

# One extractive and one abstractive summary per dataset row, in row order
for idx, row in tqdm(df.iterrows(), total=len(df)):
    title, input_sec1, input_sec2 = row['Title'], row['Input_Sec1'], row['Input_Sec2']

    # Extractive: sentences of Input_Sec2 closest to title + Input_Sec1
    extracted = extract_sentences(title, input_sec1, input_sec2)
    extractive_outputs.append(extracted)

    # Abstractive: ask the chat model; a failed call is recorded as ""
    prompt = create_prompt_fewshot(input_sec2, input_sec1)
    summary = query_ollama_chat(prompt)
    abstractive_outputs.append(summary if summary else "")


    # # Abstractive V2.1
    # summary = generate_summary_openthaigpt(input_sec2)
    # abstractive_outputs.append(summary)


### Step5: Evaluation

In [None]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

similarity_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Per-row metric lists, index-aligned with human_outputs
extractive_rouge, abstractive_rouge = [], []
extractive_bleu, abstractive_bleu = [], []
extractive_cosine, abstractive_cosine = [], []

for human, extr, abstr in tqdm(
    zip(human_outputs, extractive_outputs, abstractive_outputs),
    total=len(human_outputs),
):
    # ROUGE-L against the human reference
    extractive_rouge.append(rouge_l_score(human, extr))
    abstractive_rouge.append(rouge_l_score(human, abstr))

    # BLEU against the human reference
    extractive_bleu.append(bleu_score(human, extr))
    abstractive_bleu.append(bleu_score(human, abstr))

    # Embedding cosine similarity against the human reference
    emb_human = similarity_model.encode(human)
    emb_extr = similarity_model.encode(extr)
    emb_abstr = similarity_model.encode(abstr)

    extractive_cosine.append(cosine_similarity([emb_human], [emb_extr])[0][0])
    abstractive_cosine.append(cosine_similarity([emb_human], [emb_abstr])[0][0])



In [None]:
# Collect references, generated outputs and all metrics into one table
# (column order follows the keyword order below)
df_result = pd.DataFrame(dict(
    Human_Reference=human_outputs,
    Extractive_Output=extractive_outputs,
    Abstractive_Output=abstractive_outputs,
    Extractive_ROUGE_L=extractive_rouge,
    Abstractive_ROUGE_L=abstractive_rouge,
    Extractive_BLEU=extractive_bleu,
    Abstractive_BLEU=abstractive_bleu,
    Extractive_Cosine=extractive_cosine,
    Abstractive_Cosine=abstractive_cosine,
))

# Last expression of the cell: shows the first rows in the notebook
df_result.head()

In [None]:
# Write the result table to an Excel workbook, without the index column
output_save_path = "Experiment1_Result.xlsx"
df_result.to_excel(output_save_path, index=False)

print(f"\n‚úÖ All processing done. Result saved to {output_save_path}")

In [None]:
# df.to_excel('output.xlsx', index=False, engine='openpyxl')

## Experiment 2

In [None]:
# Build the item-2 text (Fact) from item 1 using a fixed template

def create_fixed_fact(input_sec1):
    """Return the fixed "fact" text for *input_sec1*.

    Currently a placeholder: the argument is ignored and an empty string is
    always returned.
    """
    return ""


In [None]:
def create_prompt_for_section3(input_sec1, fixed_fact, with_feedback=False):
    """Build the prompt asking the model to write the numbered proposal items.

    The prompt holds a ``<fact>`` block (item 1 and item 2) followed by an
    ``<instruction>`` block. With ``with_feedback=True`` a worked
    ``<example>`` block is placed in front, separated by one newline.
    Surrounding whitespace is stripped from the result.
    """
    core_prompt = f"""
<fact>
‡∏Ç‡πâ‡∏≠ ‡πë: {input_sec1}

‡∏Ç‡πâ‡∏≠ ‡πí: {fixed_fact}
</fact>

<instruction>
‡πÇ‡∏õ‡∏£‡∏î‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô‡∏Ç‡πâ‡∏≠‡πÄ‡∏™‡∏ô‡∏≠‡∏´‡∏£‡∏∑‡∏≠‡∏™‡∏±‡πà‡∏á‡∏Å‡∏≤‡∏£ (‡∏Ç‡πâ‡∏≠ ‡πì) ‡πÇ‡∏î‡∏¢‡∏õ‡∏è‡∏¥‡∏ö‡∏±‡∏ï‡∏¥‡∏ï‡∏≤‡∏°‡πÅ‡∏ô‡∏ß‡∏ó‡∏≤‡∏á‡∏ï‡πà‡∏≠‡πÑ‡∏õ‡∏ô‡∏µ‡πâ:
- ‡∏•‡∏≥‡∏î‡∏±‡∏ö‡πÄ‡∏õ‡πá‡∏ô‡∏Ç‡πâ‡∏≠ ‡πÜ: 2.1, 2.2, 2.3
- ‡∏´‡πâ‡∏≤‡∏°‡πÄ‡∏Å‡∏¥‡∏ô 3 ‡∏Ç‡πâ‡∏≠
- ‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Ç‡πâ‡∏≠‡πÄ‡∏õ‡πá‡∏ô‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏Ñ‡∏Ñ‡∏≥‡∏™‡∏±‡πà‡∏á‡∏™‡∏±‡πâ‡∏ô ‡πÜ (1‚Äì2 ‡∏ö‡∏£‡∏£‡∏ó‡∏±‡∏î)
- ‡πÉ‡∏ä‡πâ‡∏†‡∏≤‡∏©‡∏≤‡∏£‡∏≤‡∏ä‡∏Å‡∏≤‡∏£‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏†‡∏≤‡∏û ‡πÅ‡∏•‡∏∞‡∏™‡∏≠‡∏î‡∏Ñ‡∏•‡πâ‡∏≠‡∏á‡∏Å‡∏±‡∏ö‡∏Ç‡πâ‡∏≠ ‡πë ‡πÅ‡∏•‡∏∞‡∏Ç‡πâ‡∏≠ ‡πí
- ‡∏´‡πâ‡∏≤‡∏°‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏£‡∏∏‡∏õ‡∏ó‡πâ‡∏≤‡∏¢
</instruction>
""".strip()

    if not with_feedback:
        return core_prompt

    # Worked example shown to the model ahead of the actual task
    worked_example = """
<example>
2.1 ‡πÄ‡∏£‡∏µ‡∏¢‡∏ô‡πÄ‡∏ä‡∏¥‡∏ç ‡∏ú‡∏ö.‡∏£‡∏£.‡∏ä‡∏ó. ‡∏´‡∏£‡∏∑‡∏≠‡∏ú‡∏π‡πâ‡πÅ‡∏ó‡∏ô‡πÄ‡∏Ç‡πâ‡∏≤‡∏£‡πà‡∏ß‡∏°‡∏û‡∏¥‡∏ò‡∏µ‡πÄ‡∏õ‡∏¥‡∏î‡∏Ø ‡∏ï‡∏≤‡∏°‡∏Ç‡πâ‡∏≠ 1
2.2 ‡∏Å‡∏™‡∏ô.‡∏Ø ‡∏à‡∏±‡∏î‡∏£‡∏ñ‡∏£‡∏±‡∏ö-‡∏™‡πà‡∏á ‡πÄ‡∏Ç‡πâ‡∏≤‡∏£‡πà‡∏ß‡∏°‡∏û‡∏¥‡∏ò‡∏µ‡πÄ‡∏õ‡∏¥‡∏î‡∏Ø
2.3 ‡∏ú‡∏ò‡∏Å.‡∏Ø ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏•‡∏á‡∏£‡∏∞‡∏ö‡∏ö‡∏™‡∏≤‡∏£‡∏ö‡∏£‡∏£‡∏ì‡∏≠‡∏¥‡πÄ‡∏•‡πá‡∏Å‡∏ó‡∏£‡∏≠‡∏ô‡∏¥‡∏Å‡∏™‡πå (ECM) ‡πÉ‡∏´‡πâ‡∏´‡∏ô‡πà‡∏ß‡∏¢‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á‡∏î‡∏≥‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡∏ï‡πà‡∏≠‡πÑ‡∏õ
</example>
""".strip()

    return (worked_example + "\n" + core_prompt).strip()


In [None]:
import numpy as np

def generate_best_section3(input_sec1, human_reference_section3, with_feedback=False, num_candidates=3):
    """Generate item 3 several times and keep the candidate closest to the reference.

    Args:
        input_sec1: Text of item 1, used to build the prompt.
        human_reference_section3: Human-written item 3 used for scoring.
        with_feedback: Whether to prepend the worked example to the prompt.
        num_candidates: Number of generation attempts (default 3, the value
            that used to be hard-coded).

    Returns:
        ``(best_text, best_score)`` where the score is the cosine similarity
        between the embeddings of the candidate and the reference. Returns
        ``("", 0.0)`` when every generation attempt failed.
    """
    # Build item 2 from item 1, then the prompt
    fixed_fact = create_fixed_fact(input_sec1)
    prompt = create_prompt_for_section3(input_sec1, fixed_fact, with_feedback=with_feedback)

    generated_texts = []
    scores = []
    # The reference does not change between attempts, so it is embedded once,
    # and only after the first successful generation.
    emb_human = None

    for _ in range(num_candidates):
        gen_text = query_ollama_chat(prompt)
        if not gen_text:
            continue  # failed or empty reply: skip this attempt
        if emb_human is None:
            emb_human = similarity_model.encode(human_reference_section3)
        emb_generated = similarity_model.encode(gen_text)
        scores.append(cosine_similarity([emb_human], [emb_generated])[0][0])
        generated_texts.append(gen_text)

    if not generated_texts:
        return "", 0.0

    best_idx = np.argmax(scores)
    return generated_texts[best_idx], scores[best_idx]


In [None]:
# ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏™‡∏°‡∏°‡∏ï‡∏¥
input_sec1 = "‡∏¢‡∏ö.‡∏ó‡∏´‡∏≤‡∏£ (‡∏™‡∏ô‡∏û.‡∏¢‡∏ö.‡∏ó‡∏´‡∏≤‡∏£) ‡∏Å‡πç‡∏≤‡∏´‡∏ô‡∏î‡∏û‡∏¥‡∏ò‡∏µ‡πÄ‡∏õ‡∏¥‡∏î‡∏Å‡∏≤‡∏£‡∏ù‡∏∂‡∏Å‡∏≠‡∏ö‡∏£‡∏°‡∏´‡∏•‡∏±‡∏Å‡∏™‡∏π‡∏ï‡∏£‡∏ô‡∏≤‡∏¢‡∏ó‡∏´‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡∏ó‡∏ß‡∏ô ‡∏™‡∏≤‡∏¢‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏Å‡∏≤‡∏£‡πÅ‡∏û‡∏ó‡∏¢‡πå ‡∏£‡∏∏‡πà‡∏ô‡∏ó‡∏µ‡πà 3 ‡∏õ‡∏£‡∏∞‡∏à‡πç‡∏≤‡∏õ‡∏µ‡∏á‡∏ö‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì ‡∏û.‡∏®. 2568 ‡πÉ‡∏ô‡∏ß‡∏±‡∏ô‡∏û‡∏∏‡∏ò‡∏ó‡∏µ‡πà 5 ‡∏û.‡∏¢. 67 ‡πÄ‡∏ß‡∏•‡∏≤ 1300 ‡∏ì ‡∏´‡πâ‡∏≠‡∏á‡∏õ‡∏£‡∏∞‡∏ä‡∏∏‡∏° ‡∏´‡∏ó‡∏±‡∏¢‡∏ô‡πÄ‡∏£‡∏® ‡∏ä‡∏±‡πâ‡∏ô 2 ‡∏™‡∏ô‡∏û.‡∏¢‡∏ö.‡∏ó‡∏´‡∏≤‡∏£ (‡∏ö‡∏≤‡∏á‡∏ã‡πà‡∏≠‡∏ô) ‡πÇ‡∏î‡∏¢‡∏°‡∏µ ‡∏à‡∏Å.‡∏¢‡∏ö.‡∏ó‡∏´‡∏≤‡∏£ ‡πÄ‡∏õ‡πá‡∏ô‡∏õ‡∏£‡∏∞‡∏ò‡∏≤‡∏ô‡∏Ø ‡∏Å‡∏≤‡∏£‡πÅ‡∏ï‡πà‡∏á‡∏Å‡∏≤‡∏¢ ‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á‡πÅ‡∏ö‡∏ö‡∏õ‡∏Å‡∏ï‡∏¥ ‡∏Ñ‡∏≠‡∏û‡∏±‡∏ö‡πÅ‡∏Ç‡∏ô‡∏¢‡∏≤‡∏ß (‡∏ó‡∏≠. ‡∏≠‡∏¥‡∏ô‡∏ó‡∏£‡∏ò‡∏ô‡∏π‡πÅ‡∏Ç‡πá‡∏á) ‡∏á‡∏î‡∏´‡∏°‡∏ß‡∏Å"

# Human-written reference for item 3, one numbered point per line.
# Fix: the former backslash-continued literal ended after the first line, so
# only item 2.1 was assigned and the 2.2 / 2.3 strings were discarded as bare
# expression statements. All three items are now joined with newlines.
human_reference_section3 = "\n".join([
    "2.1 ‡πÄ‡∏£‡∏µ‡∏¢‡∏ô‡πÄ‡∏ä‡∏¥‡∏ç ‡∏ú‡∏ö.‡∏£‡∏£.‡∏ä‡∏ó. ‡∏´‡∏£‡∏∑‡∏≠‡∏ú‡∏π‡πâ‡πÅ‡∏ó‡∏ô‡πÄ‡∏Ç‡πâ‡∏≤‡∏£‡πà‡∏ß‡∏°‡∏û‡∏¥‡∏ò‡∏µ‡πÄ‡∏õ‡∏¥‡∏î‡∏Ø ‡∏ï‡∏≤‡∏°‡∏Ç‡πâ‡∏≠ 1",
    "2.2 ‡∏Å‡∏™‡∏ô.‡∏Ø ‡∏à‡∏±‡∏î‡∏£‡∏ñ‡∏£‡∏±‡∏ö-‡∏™‡πà‡∏á ‡πÄ‡∏Ç‡πâ‡∏≤‡∏£‡πà‡∏ß‡∏°‡∏û‡∏¥‡∏ò‡∏µ‡πÄ‡∏õ‡∏¥‡∏î‡∏Ø",
    "2.3 ‡∏ú‡∏ò‡∏Å.‡∏Ø ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡∏•‡∏á‡∏£‡∏∞‡∏ö‡∏ö‡∏™‡∏≤‡∏£‡∏ö‡∏£‡∏£‡∏ì‡∏≠‡∏¥‡πÄ‡∏•‡πá‡∏Å‡∏ó‡∏£‡∏≠‡∏ô‡∏¥‡∏Å‡∏™‡πå (ECM) ‡πÉ‡∏´‡πâ‡∏´‡∏ô‡πà‡∏ß‡∏¢‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á‡∏î‡πç‡∏≤‡πÄ‡∏ô‡∏¥‡∏ô‡∏Å‡∏≤‡∏£‡∏ï‡πà‡∏≠‡πÑ‡∏õ",
])

# Generate with the worked example ("feedback correction") in the prompt
section3_with_feedback, score_with_feedback = generate_best_section3(
    input_sec1,
    human_reference_section3,
    with_feedback=True,
)

# Generate without the worked example
section3_without_feedback, score_without_feedback = generate_best_section3(
    input_sec1,
    human_reference_section3,
    with_feedback=False,
)


# Show both results with their similarity to the human reference
print("\n‚úÖ Section 3 (With Feedback Correction):", section3_with_feedback, sep="\n")
print("Cosine Similarity:", score_with_feedback)

print("\n‚úÖ Section 3 (Without Feedback Correction):", section3_without_feedback, sep="\n")
print("Cosine Similarity:", score_without_feedback)
