In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from rouge import Rouge
import os
import json
import csv
import time

### Config

In [None]:
# --- Pipeline switches: each flag enables one or more cells further down ---
fine_tune = False            # LoRA fine-tuning of the base model
saveModel = False            # log in to the Hugging Face Hub and save / push the fine-tuned model
zero_shot_inference = False  # base-model inference + ROUGE scoring on the test set
ft_inference = True          # fine-tuned-model inference + ROUGE scoring on the test set
print_evaluation = False     # print averaged ROUGE scores and side-by-side examples
generate_4_way = False       # sample 4 responses per validation prompt
rl_data_inference = False    # sample responses for every prompt of the RL dataset

# --- Tunables ---
load_in_4bit = True          # forwarded as `load_in_4bit` when model weights are loaded
examples_to_compare = 5      # number of examples compared side by side (clamped to the test-set size later)

## Initialization

In [None]:
def _load_json_split(path):
    """Load one JSON file with `load_dataset` and return what it stores under the "train" key."""
    return load_dataset("json", data_files=path)["train"]


# One JSON file per split.
train_dataset = _load_json_split("./dataset/ft_train.json")
val_dataset = _load_json_split("./dataset/ft_test.json")
rl_dataset = _load_json_split("./dataset/rl_data.json")
test_dataset = _load_json_split("./dataset/rl_eval.json")

# Report the size of every split.
for split_label, split in (
    ("Train", train_dataset),
    ("Validation", val_dataset),
    ("RL", rl_dataset),
    ("Test", test_dataset),
):
    print(f"{split_label} dataset size:", len(split))

# Never ask for more side-by-side examples than the test set holds.
examples_to_compare = min(len(test_dataset), examples_to_compare)

In [None]:
# Hub id of the base checkpoint; reused below when the model weights are loaded.
model_id = "aisingapore/sea-lion-7b-instruct"

# trust_remote_code=True allows the tokenizer code shipped in the model repository to run.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

# Padding needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
if saveModel:
  # Imported here because huggingface_hub is only needed when the model is pushed.
  from huggingface_hub import login

  # SECURITY: an access token must never be committed with the notebook.
  # The token that used to be hardcoded on this line has to be treated as
  # leaked and should be revoked in the Hugging Face account settings.
  # The token is now taken from the HF_TOKEN environment variable; when it is
  # unset, `login` receives None and asks for the token interactively.
  access_token = os.environ.get("HF_TOKEN")
  login(token=access_token)

In [None]:
# Load the base model (needed for fine-tuning and for zero-shot inference) and
# run one hand-written prompt through it as a smoke test. `response_1` is kept
# for the side-by-side comparison with the fine-tuned model further down.
if fine_tune or zero_shot_inference:
  model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True, load_in_4bit=load_in_4bit)
  prompt_template = "### USER:\n{human_prompt}\n\n### RESPONSE:\n"

  # NOTE(review): the continuation lines of this literal start with two spaces
  # that end up inside the prompt — confirm this matches the training format.
  prompt = """Ubah sentimen dari kalimat awal berikut menjadi sentimen sebaliknya
  Kalimat Awal: Hari ini langitnya sangat cerah dan ramah untuk jalan-jalan
  Kalimat Baru:"""

  full_prompt = prompt_template.format(human_prompt=prompt)

  tokens = tokenizer(full_prompt, return_tensors="pt")
  # The model is placed by device_map="auto"; the inputs must live on the same
  # device as the model (the 4-way / RL cells already do this for ft_model).
  input_ids = tokens["input_ids"].to(model.device)
  output = model.generate(input_ids=input_ids, max_new_tokens=30, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, num_return_sequences=1)
  response_1 = tokenizer.decode(output[0], skip_special_tokens=True)
  print(response_1)

## Zero-Shot Inference on Test Dataset

In [None]:
# Zero-shot evaluation of the base model:
#   1. generate a response for every test example, score it with ROUGE and
#      log prompt / reference / response to ./inferences/base_model_responses.csv
#   2. regenerate the first `examples_to_compare` examples and keep the full
#      decoded text (prompt included) for the evaluation cell
#   3. dump responses and per-example scores to ./eval_score/base_rouge.json
if zero_shot_inference:
    rouge = Rouge()

    rouge1_scores = []
    rouge2_scores = []
    rougel_scores = []

    model_responses = []

    # Ensure the output directory exists
    output_dir = "./inferences"
    os.makedirs(output_dir, exist_ok=True)

    # Output file path
    csv_output_file = os.path.join(output_dir, "base_model_responses.csv")

    # Create and write to the CSV file. The encoding is pinned to UTF-8 (as in
    # the fine-tuned inference cell) so that writing non-ASCII model output
    # does not depend on the platform's default encoding.
    with open(csv_output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write the header
        writer.writerow(["prompt", "expected_response", "model", "response"])

        for example in test_dataset:
            # Extract prompt and expected response
            prompt = example["text"].split("### USER:\n")[1].split("\n\n### RESPONSE:\n")[0]
            expected_response = example["text"].split("### RESPONSE:\n")[1]

            # Generate model's response
            full_prompt = "### USER:\n" + prompt + "\n\n### RESPONSE:\n"
            tokens = tokenizer(full_prompt, return_tensors="pt")
            # Inputs have to be on the same device as the model
            input_ids = tokens["input_ids"].to(model.device)

            output = model.generate(input_ids=input_ids, max_new_tokens=30, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, num_return_sequences=1)
            model_response = tokenizer.decode(output[0], skip_special_tokens=True)

            # Remove the end-of-text marker / the echoed prompt from the responses
            expected_response = expected_response.replace("<|endoftext|>", "").strip()
            model_response = model_response.replace(full_prompt, "").strip()

            # A placeholder is scored instead of an empty hypothesis
            if model_response == "":
                model_response = "-"

            # Calculate ROUGE scores for model
            scores = rouge.get_scores(model_response, expected_response)[0]
            rouge1_scores.append(scores["rouge-1"]["f"])
            rouge2_scores.append(scores["rouge-2"]["f"])
            rougel_scores.append(scores["rouge-l"]["f"])

            # Append to CSV file
            writer.writerow([prompt, expected_response, "base", model_response])

    # Iterate through the test dataset again: keep the raw decoded output of the
    # first `examples_to_compare` examples for the evaluation cell
    for idx in range(examples_to_compare):
        example = test_dataset[idx]
        # Extract prompt and expected response
        text = example["text"]
        prompt_start = text.find("### USER:") + len("### USER:")
        prompt_end = text.find("### RESPONSE:")
        prompt = text[prompt_start:prompt_end].strip()
        expected_response_start = text.find("### RESPONSE:") + len("### RESPONSE:")
        expected_response = text[expected_response_start:].strip()
        expected_response = expected_response.replace("<|endoftext|>", "").strip()

        # Generate model's response and save it
        tokens = tokenizer("### USER:\n" + prompt + "\n\n### RESPONSE:\n", return_tensors="pt")
        input_ids = tokens["input_ids"].to(model.device)
        # pad_token_id is passed explicitly, like in the scoring loop above
        model_output = model.generate(input_ids=input_ids, max_new_tokens=30, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
        model_response = tokenizer.decode(model_output[0], skip_special_tokens=True)
        model_responses.append(model_response)

    # Output file path
    output_file = "./eval_score/base_rouge.json"

    # Ensure the output directory exists
    output_file_dir = "./eval_score"
    os.makedirs(output_file_dir, exist_ok=True)

    # Construct data dictionary
    data = {
        "Base Model Responses": model_responses,
        "Base Model ROUGE Scores": {
            "ROUGE-1 Scores": rouge1_scores,
            "ROUGE-2 Scores": rouge2_scores,
            "ROUGE-L Scores": rougel_scores
        }
    }

    # Write data to JSON file
    with open(output_file, "w") as f:
        json.dump(data, f, indent=4)

## Fine-Tuning SEA-LION 7B Instruct for Sentiment Conversion

In [None]:
# LoRA fine-tuning of the base model loaded above. When `saveModel` is set,
# the adapter is saved to ./ft-model and the training output is pushed to the Hub.
if fine_tune:
    peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["down_proj", "out_proj", "up_proj", "Wqkv"],
        bias="none",
        task_type="CAUSAL_LM",
    )

    ft_model = get_peft_model(model, peft_config)

    # Define tokenize function
    def tokenize_function(example):
        """Tokenize the "text" field, padding / truncating every sequence to 81 tokens."""
        return tokenizer(example["text"], padding="max_length", truncation=True, max_length=81) # added max length 81 but hasn't been tested

    # Tokenize the datasets
    tokenized_train_datasets = train_dataset.map(tokenize_function, batched=True)
    tokenized_val_datasets = val_dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir="train-fine-tune",
        num_train_epochs=5,
        learning_rate=5e-5,
        fp16=True,
        # A real boolean; the former string 'True' only worked by being truthy.
        overwrite_output_dir=True,
        # Target repository for `trainer.push_to_hub()` below. Without it the
        # Trainer derives the repository name from `output_dir`.
        hub_model_id="Adzka/fine-tune-sealion",
    )

    # Causal-LM collation (mlm=False); `mlm_probability` only applies to masked
    # language modelling and was therefore dropped.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=ft_model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_train_datasets,
        eval_dataset=tokenized_val_datasets,
        tokenizer=tokenizer
    )

    trainer_stats = trainer.train()

    if saveModel:
        ft_model.save_pretrained("./ft-model")
        # The first positional argument of Trainer.push_to_hub is the commit
        # message, not the repository id, so the repository is configured via
        # `hub_model_id` in the TrainingArguments instead.
        trainer.push_to_hub()

## Fine-Tuned Model Inference on Test Dataset

In [None]:
# When fine-tuning ran in this session `ft_model` already exists; otherwise load
# the saved model from ./ft-model for every stage that generates with it.
# `generate_4_way` is part of the guard because its cell also calls
# ft_model.generate and would otherwise fail with a NameError when it is the
# only enabled stage.
if not fine_tune and (ft_inference or generate_4_way or rl_data_inference):
  ft_model = AutoModelForCausalLM.from_pretrained("./ft-model", device_map="auto", trust_remote_code=True, load_in_4bit=load_in_4bit)

In [None]:
# Run the hand-written smoke-test prompt through the fine-tuned model and print
# it next to the base model's answer (`response_1`, produced when the base model
# was loaded).
if ft_inference and zero_shot_inference:
  prompt_template = "### USER:\n{human_prompt}\n\n### RESPONSE:\n"

  prompt = """Ubah sentimen dari kalimat awal berikut menjadi sentimen sebaliknya
  Kalimat Awal: Hari ini langitnya sangat cerah dan ramah untuk jalan-jalan
  Kalimat Baru:"""

  full_prompt = prompt_template.format(human_prompt=prompt)

  tokens = tokenizer(full_prompt, return_tensors="pt")
  # Inputs have to be on the same device as the model (as in the 4-way / RL cells)
  input_ids = tokens["input_ids"].to(ft_model.device)
  # NOTE(review): the base-model answer was generated with max_new_tokens=30,
  # this one with 20 — confirm whether the budgets are meant to differ.
  output = ft_model.generate(input_ids=input_ids, max_new_tokens=20, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)
  response_2 = tokenizer.decode(output[0], skip_special_tokens=True)

  print("Output:")
  print("--- Zero-Shot ---")
  print(response_1)
  print("\n--- Fine-Tuned ---")
  print(response_2)

### Generating responses from Test Dataset for ROUGE Scoring

In [None]:
# Evaluation of the fine-tuned model on the test set:
#   1. generate a response per example, trim spill-over text, score it with ROUGE
#      and log prompt / reference / response to ./inferences/ft_model_responses.csv
#   2. dump the first responses and all per-example scores to ./eval_score/ft_rouge.json
if ft_inference:
    rouge = Rouge()

    ft_rouge1_scores = []
    ft_rouge2_scores = []
    ft_rougel_scores = []

    ft_model_responses = []

    # The evaluation cell indexes the stored responses up to `examples_to_compare`,
    # so keep at least that many (and never fewer than the previous fixed 10).
    n_responses_to_keep = max(10, examples_to_compare)

    # Everything from the first occurrence of one of these markers onward is
    # treated as spill-over and cut off (same list as in the 4-way / RL cells).
    unwanted_markers = ["###", "\n\nK", "Kalimat Awal:", "Kalimat Baru:", " ."]

    # Ensure the output directories exist. ./eval_score was previously only
    # created by the zero-shot cell, so running this cell on its own failed
    # when the JSON file was written.
    output_dir = "./inferences"
    os.makedirs(output_dir, exist_ok=True)
    output_file_dir = "./eval_score"
    os.makedirs(output_file_dir, exist_ok=True)

    # Output file path
    csv_output_file = os.path.join(output_dir, "ft_model_responses.csv")

    # Create and write to the CSV file
    with open(csv_output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write the header
        writer.writerow(["prompt", "expected_response", "model", "response"])

        # Iterate through the test dataset
        i = 0
        for example in test_dataset:
            i += 1
            # Extract prompt and expected response
            prompt = example["text"].split("### USER:\n")[1].split("\n\n### RESPONSE:\n")[0]
            expected_response = example["text"].split("### RESPONSE:\n")[1]

            # Rebuild the prompt in the training format
            full_prompt = "### USER:\n" + prompt + "\n\n### RESPONSE:\n"
            tokens = tokenizer(full_prompt, return_tensors="pt")
            # Inputs have to be on the same device as the model
            input_ids = tokens["input_ids"].to(ft_model.device)

            # Generate ft_model's response
            ft_output = ft_model.generate(input_ids=input_ids, max_new_tokens=30, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, num_return_sequences=1)
            ft_model_response = tokenizer.decode(ft_output[0], skip_special_tokens=True)

            # Remove the end-of-text marker / the echoed prompt from the responses
            expected_response = expected_response.replace("<|endoftext|>", "").strip()
            ft_model_response = ft_model_response.replace(full_prompt, "").strip()

            # Handle extra and unused text in the model's response. One split per
            # marker is enough: the part before the first occurrence cannot
            # contain the marker any more.
            for marker in unwanted_markers:
                ft_model_response = ft_model_response.split(marker)[0].strip()

            # A placeholder is scored instead of an empty hypothesis
            if ft_model_response == "":
                ft_model_response = "-"

            print("Expec:", expected_response)
            print("Model:", ft_model_response)

            # Calculate ROUGE scores for ft_model
            ft_scores = rouge.get_scores(ft_model_response, expected_response)[0]
            ft_rouge1_scores.append(ft_scores["rouge-1"]["f"])
            ft_rouge2_scores.append(ft_scores["rouge-2"]["f"])
            ft_rougel_scores.append(ft_scores["rouge-l"]["f"])

            writer.writerow([prompt, expected_response, "fine-tuned", ft_model_response])
            if i <= n_responses_to_keep:
                ft_model_responses.append(ft_model_response)

    # Output file path
    output_file = "./eval_score/ft_rouge.json"

    # Construct data dictionary
    data = {
        "Fine-Tuned Model Responses": ft_model_responses,
        "Fine-Tuned Model ROUGE Scores": {
            "ROUGE-1 Scores": ft_rouge1_scores,
            "ROUGE-2 Scores": ft_rouge2_scores,
            "ROUGE-L Scores": ft_rougel_scores
        }
    }

    # Write data to JSON file
    with open(output_file, "w") as f:
        json.dump(data, f, indent=4)

### Print Evaluation

In [None]:
# Report the saved evaluation results: averaged ROUGE F1 scores of both models,
# followed by a side-by-side listing of the first examples. Everything is read
# from the JSON files written by the two inference cells, so this cell can run
# in a session where no model was loaded.
if print_evaluation:
    # Output file paths
    base_output_file = "./eval_score/base_rouge.json"
    ft_output_file = "./eval_score/ft_rouge.json"

    # Read base model ROUGE scores
    with open(base_output_file, "r") as f_base:
        base_data = json.load(f_base)
    rouge1_base_scores = base_data["Base Model ROUGE Scores"]["ROUGE-1 Scores"]
    rouge2_base_scores = base_data["Base Model ROUGE Scores"]["ROUGE-2 Scores"]
    rougel_base_scores = base_data["Base Model ROUGE Scores"]["ROUGE-L Scores"]

    # Read ft_model ROUGE scores
    with open(ft_output_file, "r") as f_ft:
        ft_data = json.load(f_ft)
    rouge1_ft_scores = ft_data["Fine-Tuned Model ROUGE Scores"]["ROUGE-1 Scores"]
    rouge2_ft_scores = ft_data["Fine-Tuned Model ROUGE Scores"]["ROUGE-2 Scores"]
    rougel_ft_scores = ft_data["Fine-Tuned Model ROUGE Scores"]["ROUGE-L Scores"]

    # Calculate average ROUGE scores for model
    avg_rouge1_base = sum(rouge1_base_scores) / len(rouge1_base_scores)
    avg_rouge2_base = sum(rouge2_base_scores) / len(rouge2_base_scores)
    avg_rougel_base = sum(rougel_base_scores) / len(rougel_base_scores)

    # Calculate average ROUGE scores for ft_model
    avg_rouge1_ft = sum(rouge1_ft_scores) / len(rouge1_ft_scores)
    avg_rouge2_ft = sum(rouge2_ft_scores) / len(rouge2_ft_scores)
    avg_rougel_ft = sum(rougel_ft_scores) / len(rougel_ft_scores)

    # Print average ROUGE scores for model
    print("Average ROUGE-1 F1 Score (model):", round(avg_rouge1_base, 4))
    print("Average ROUGE-2 F1 Score (model):", round(avg_rouge2_base, 4))
    print("Average ROUGE-L F1 Score (model):", round(avg_rougel_base, 4))

    # Print average ROUGE scores for ft_model
    print("Average ROUGE-1 F1 Score (ft_model):", round(avg_rouge1_ft, 4))
    print("Average ROUGE-2 F1 Score (ft_model):", round(avg_rouge2_ft, 4))
    print("Average ROUGE-L F1 Score (ft_model):", round(avg_rougel_ft, 4))

    # The JSON files may come from an earlier run that stored fewer responses or
    # scores than `examples_to_compare`; only list what is available in all of
    # them instead of failing with an IndexError halfway through.
    n_examples = min(
        examples_to_compare,
        len(base_data["Base Model Responses"]),
        len(ft_data["Fine-Tuned Model Responses"]),
        len(rouge1_base_scores),
        len(rouge2_base_scores),
        len(rougel_base_scores),
        len(rouge1_ft_scores),
        len(rouge2_ft_scores),
        len(rougel_ft_scores),
    )
    if n_examples < examples_to_compare:
        print(f"\nNote: only {n_examples} of the {examples_to_compare} requested examples are available in the saved results.")

    # Print evaluation results from file
    for idx in range(n_examples):
        example = test_dataset[idx]
        # Extract prompt and expected response
        text = example["text"]
        prompt_start = text.find("### USER:") + len("### USER:")
        prompt_end = text.find("### RESPONSE:")
        prompt = text[prompt_start:prompt_end].strip()
        expected_response_start = text.find("### RESPONSE:") + len("### RESPONSE:")
        expected_response = text[expected_response_start:].strip()
        expected_response = expected_response.replace("<|endoftext|>", "").strip()

        # Stored responses of both models for this example
        base_model_response = base_data["Base Model Responses"][idx]
        ft_model_response = ft_data["Fine-Tuned Model Responses"][idx]

        # Drop the echoed prompt where the stored text still contains it
        echoed_prompt = "### USER:\n"+prompt+"\n\n### RESPONSE:\n"
        base_model_response = base_model_response.replace(echoed_prompt, "").strip()
        ft_model_response = ft_model_response.replace(echoed_prompt, "").strip()

        # Print the prompt and expected response
        print(f"\nExample {idx + 1}:")
        print("Prompt:", prompt)
        print("\nExpected   :", expected_response)

        # Print both models' responses
        print("Base Model :", base_model_response)
        print("Fine-Tuned :", ft_model_response)

        # Print the per-example ROUGE scores of both models
        print("\n(Base Model):")
        print("ROUGE-1:", round(rouge1_base_scores[idx], 3))
        print("ROUGE-2:", round(rouge2_base_scores[idx], 3))
        print("ROUGE-L:", round(rougel_base_scores[idx], 3))

        print("(Fine-Tuned Model):")
        print("ROUGE-1:", round(rouge1_ft_scores[idx], 3))
        print("ROUGE-2:", round(rouge2_ft_scores[idx], 3))
        print("ROUGE-L:", round(rougel_ft_scores[idx], 3))

        print("\n-----------------------------------------")


#### Generating 4 Responses per Prompt for Reward Training from Validation Dataset

In [None]:
# Sample four responses from the fine-tuned model for every validation prompt
# and store them next to the prompt in ./inferences/ft_model_4-way_comparison.csv
if generate_4_way:
    # Make sure the target folder is there before the file is opened
    output_dir = "./inferences"
    os.makedirs(output_dir, exist_ok=True)
    csv_output_file = os.path.join(output_dir, "ft_model_4-way_comparison.csv")

    n_test = len(val_dataset)

    # Everything from the first occurrence of one of these fragments onward is cut off
    stop_markers = ("###", "\n\nK", "Kalimat Awal:", "Kalimat Baru:", " .")

    with open(csv_output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Header: the prompt followed by its four sampled responses
        writer.writerow(["x", "y0", "y1", "y2", "y3"])

        for i, example in enumerate(val_dataset, start=1):
            # Take the prompt out of the training text and rebuild the model input from it
            prompt = example["text"].split("### USER:\n")[1].split("\n\n### RESPONSE:\n")[0]
            full_prompt = "### USER:\n" + prompt + "\n\n### RESPONSE:\n"

            # Tokenize and place the ids on the model's device
            tokens = tokenizer(full_prompt, return_tensors="pt")
            input_ids = tokens["input_ids"].to(ft_model.device)

            # Sampling (do_sample=True) is what makes the four sequences differ
            ft_outputs = ft_model.generate(
                input_ids=input_ids,
                max_new_tokens=50,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                num_return_sequences=4,
                temperature=0.7,
                do_sample=True,
            )

            # Decode, drop the echoed prompt, trim spill-over text; "-" stands in for an empty answer
            clean_responses = []
            for output in ft_outputs:
                response = tokenizer.decode(output, skip_special_tokens=True).replace(full_prompt, "").strip()
                for marker in stop_markers:
                    response = response.split(marker)[0].strip()
                clean_responses.append(response or "-")

            # One row per prompt: prompt + four responses
            writer.writerow([prompt, *clean_responses])

            # Progress report
            print(f"Processed example {i} out of {n_test}")

    print("Responses successfully written to ft_model_4-way_comparison.csv")

#### Generating up to 3 Unique Responses per Prompt for PPO from Reused Validation Dataset -- to Be Scored by the Reward Model

In [None]:
# Sample responses from the fine-tuned model for every prompt of the RL dataset
# and store the unique, non-empty ones as (query, response) rows in
# ./inferences/ft_model_rl_data_for_PPO.csv
if rl_data_inference:
    # Make sure the target folder is there before the file is opened
    output_dir = "./inferences"
    os.makedirs(output_dir, exist_ok=True)
    csv_output_file = os.path.join(output_dir, "ft_model_rl_data_for_PPO.csv")

    n_test = len(rl_dataset)
    n_sample = 3  # Number of responses per prompt

    # Everything from the first occurrence of one of these fragments onward is cut off
    stop_markers = ("###", "\n\nK", "Kalimat Awal:", "Kalimat Baru:", " .")

    with open(csv_output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Header
        writer.writerow(["query", "response"])

        # Walk over the RL dataset, counting examples from 1
        for i, example in enumerate(rl_dataset, start=1):
            # Take the prompt out of the training text and rebuild the model input from it
            prompt = example["text"].split("### USER:\n")[1].split("\n\n### RESPONSE:\n")[0]
            full_prompt = "### USER:\n" + prompt + "\n\n### RESPONSE:\n"

            # Tokenize and place the ids on the model's device
            tokens = tokenizer(full_prompt, return_tensors="pt")
            input_ids = tokens["input_ids"].to(ft_model.device)

            # Sampling (do_sample=True) is what makes the sequences differ
            ft_outputs = ft_model.generate(
                input_ids=input_ids,
                max_new_tokens=50,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                num_return_sequences=n_sample,
                temperature=0.7,
                do_sample=True,
            )

            # Decode, drop the echoed prompt, trim spill-over text;
            # empty answers and duplicates are skipped
            clean_responses = []
            for output in ft_outputs:
                response = tokenizer.decode(output, skip_special_tokens=True).replace(full_prompt, "").strip()
                for marker in stop_markers:
                    response = response.split(marker)[0].strip()
                if not response or response in clean_responses:
                    continue
                clean_responses.append(response)

            # One row per kept response
            writer.writerows([prompt, response] for response in clean_responses)

            # Progress report
            print(f"Processed example {i} out of {n_test}")

            # Pause for 20 seconds after every 30th example
            # NOTE(review): presumably to give the GPU / runtime a break — confirm it is still needed
            if i % 30 == 0:
                time.sleep(20)

    print("Responses successfully written to ft_model_rl_data_for_PPO.csv")