In [None]:
import torch
from torch import nn, Tensor
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoConfig, AutoModelForCausalLM
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
from tqdm import tqdm
import pandas as pd
import os
import csv
import json
from torch.nn.utils.rnn import pad_sequence

### Config

In [None]:
# Stage switches: each flag gates one of the cells further down.
# Gates copying the upstream model configuration into ./ft-model.
get_config_from_source = False
# Not referenced anywhere in the visible part of this notebook.
generate_test_response = False
# Gates the PPO rollout/training loop and the model save that follows it.
ppo_training = True
# The inference cell only runs when this is True AND ppo_training is False.
rlhf_inference = True
# Gates the ROUGE summary and side-by-side comparison printout.
print_evaluation = False

# Number of evaluation examples shown in the side-by-side comparison.
examples_to_compare = 5

# Maximum token length used when tokenizing queries.
max_length = 256

### PPO Training

In [None]:
# Optional bootstrap step: fetch the upstream model's configuration and write
# it into ./ft-model, the directory the following cells load from.
if get_config_from_source:
  # Load the original model configuration
  original_model_name = "aisingapore/sea-lion-7b-instruct"
  original_config = AutoConfig.from_pretrained(original_model_name)

  # Save the configuration to the fine-tuned model directory
  original_config.save_pretrained("./ft-model")

In [None]:
# Load the configuration of the fine-tuned model
# NOTE(review): this module-level `config` is not read by any visible cell
# before it is rebound to a PPOConfig further down; the value head below uses
# `pretrained_model.config` instead.
config = AutoConfig.from_pretrained("./ft-model")

class CustomValueHead(nn.Module):
    """Linear value head mapping each hidden state to one scalar value.

    The input width is read from the model configuration: ``d_model`` is
    preferred and ``hidden_size`` is used as a fallback for configurations
    that expose the width under that name.

    Args:
        config: Model configuration object exposing ``d_model`` or
            ``hidden_size``.

    Raises:
        ValueError: If the configuration exposes neither attribute.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = getattr(config, 'd_model', None)  # Use 'd_model' from the configuration
        if hidden_size is None:
            hidden_size = getattr(config, 'hidden_size', None)
        if hidden_size is None:
            raise ValueError("The configuration does not have 'd_model' or 'hidden_size' attribute.")
        self.summary = nn.Linear(hidden_size, 1)

    def forward(self, hidden_states: Tensor) -> Tensor:
        """Project hidden states onto a scalar value per position.

        Args:
            hidden_states: Tensor whose last dimension equals the configured
                hidden size, e.g. ``(batch, seq_len, hidden_size)``.

        Returns:
            Tensor with the same leading dimensions and a trailing dimension
            of size 1, in the dtype of the linear layer's weights.
        """
        # Ensure input and weights have the same dtype
        hidden_states = hidden_states.to(self.summary.weight.dtype)
        # nn.Linear acts on the last dimension and keeps all leading ones, so
        # no manual flatten/unflatten is needed. Skipping `.view()` also keeps
        # non-contiguous inputs (e.g. transposed tensors) from raising.
        return self.summary(hidden_states)

class CustomAutoModelForCausalLMWithValueHead(AutoModelForCausalLMWithValueHead):
    """Value-head wrapper that installs ``CustomValueHead`` as ``v_head``.

    The value head is built from ``pretrained_model.config``. Everything else
    is inherited from ``AutoModelForCausalLMWithValueHead``.
    """

    def __init__(self, pretrained_model, **kwargs):
        # super(AutoModelForCausalLMWithValueHead, self) starts the method
        # lookup *after* AutoModelForCausalLMWithValueHead in the MRO, so that
        # class's own __init__ is skipped and its parent's __init__ runs.
        # NOTE(review): presumably done so the stock value head is never
        # built; confirm no other setup from the skipped __init__ is needed.
        super(AutoModelForCausalLMWithValueHead, self).__init__(pretrained_model, **kwargs)
        self.v_head = CustomValueHead(pretrained_model.config)
        self.is_peft_model = False  # NOTE(review): forced to False whatever the wrapped model is; confirm

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """Load a causal LM and wrap it in this class.

        The same ``kwargs`` are used twice: once for
        ``AutoModelForCausalLM.from_pretrained`` and once for the wrapper's
        constructor.
        """
        # Load the pretrained model
        pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
        # Return the custom class
        return cls(pretrained_model, **kwargs)

# Policy model: the fine-tuned checkpoint in ./ft-model wrapped with the custom
# value head. All keyword arguments below are forwarded unchanged to
# CustomAutoModelForCausalLMWithValueHead.from_pretrained.
model = CustomAutoModelForCausalLMWithValueHead.from_pretrained(
    "./ft-model",
    device_map="auto",
    trust_remote_code=True,
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

In [None]:
# Reward model: the tokenizer comes from the hub checkpoint, the weights from
# the local ./reward_model directory (3-label sequence classifier).
reward_tokenizer = AutoTokenizer.from_pretrained("w11wo/indonesian-roberta-base-sentiment-classifier")
# NOTE(review): this overwrites any pad token the tokenizer already defines
# with the EOS token; confirm that is intended for this tokenizer.
reward_tokenizer.pad_token = reward_tokenizer.eos_token
reward_model = AutoModelForSequenceClassification.from_pretrained("./reward_model", num_labels=3, device_map="auto", load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

In [None]:
# Tokenizer for the policy model, loaded from the upstream checkpoint id.
model_id = "aisingapore/sea-lion-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse EOS as the padding token and pad on the left, so generated tokens
# directly follow the prompt.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

In [None]:
# Load the dataset
# (`dataset_path` and `df` are rebound to the RL query data in a later cell.)
dataset_path = "./inferences/ft_model_rl_data_for_PPO_scored.csv"
df = pd.read_csv(dataset_path)

# Extract the 'score' column
scores = df['score']

# Calculate min and max scores
# These module-level bounds are read by the get_reward_score defined right
# after this block.
min_score = scores.min()
max_score = scores.max()

print(f"Min Score: {min_score}")
print(f"Max Score: {max_score}")

# Function to get reward score
def get_reward_score(response):
    """Score ``response`` with the reward model and min-max normalise it.

    The score is the sum of the reward model's logits over the label
    dimension for the first input. It is rescaled with the module-level
    ``min_score`` / ``max_score`` and is not clamped, so values outside
    [0, 1] are possible.

    NOTE(review): a later cell defines another ``get_reward_score`` that
    replaces this one once it runs.

    Args:
        response: Text (or list of texts; only the first is scored).

    Returns:
        The normalised score, or ``0.0`` when ``max_score == min_score`` and
        the normalisation is therefore undefined.
    """
    tokens = reward_tokenizer(response, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = tokens["input_ids"].to(reward_model.device)
    attention_mask = tokens["attention_mask"].to(reward_model.device)

    with torch.no_grad():
        outputs = reward_model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    summed_logits = logits.sum(dim=-1).cpu().tolist()

    # Guard the degenerate case where every reference score is identical;
    # dividing by a zero range would give inf/NaN rewards.
    score_range = max_score - min_score
    if score_range == 0:
        return 0.0

    # Normalize the score between 0 and 1
    normalized_score = (summed_logits[0] - min_score) / score_range
    return normalized_score

In [None]:
# Load and process dataset
# Rebinds `dataset_path` and `df` (previously the scored CSV) to the RL data.
dataset_path = "./dataset/rl_data.json"
df = pd.read_json(dataset_path)

def extract_query(text):
    """Return the prompt part of a training example.

    The prompt runs from the ``### USER:`` marker up to and including the
    ``### RESPONSE:`` marker.

    Missing markers are handled explicitly instead of slicing with the ``-1``
    that ``str.find`` returns:

    * no ``### USER:`` marker: the prompt starts at the beginning of ``text``;
    * no ``### RESPONSE:`` marker after the start: the rest of ``text`` is
      returned.

    Args:
        text: Full example text.

    Returns:
        The extracted prompt string.
    """
    user_marker = "### USER:"
    response_marker = "### RESPONSE:"

    start_idx = text.find(user_marker)
    if start_idx == -1:
        start_idx = 0

    # Only look for the response marker after the start of the prompt.
    end_idx = text.find(response_marker, start_idx)
    if end_idx == -1:
        return text[start_idx:]

    return text[start_idx:end_idx + len(response_marker)]

# Derive the prompt for every example and keep only that column in the
# Hugging Face dataset.
df['query'] = df['text'].apply(extract_query)
new_df = df[['query']]
dataset = Dataset.from_pandas(new_df)

# Tokenization with padding and truncation
def tokenize_function(example):
    """Tokenize the ``query`` field of ``example`` with the policy tokenizer.

    Sequences are padded and truncated to the module-level ``max_length``, and
    the attention mask is requested explicitly.

    Args:
        example: Mapping with a ``"query"`` entry (a string or a batch of
            strings).

    Returns:
        Whatever the tokenizer call returns for that input.
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
    )
    return tokenizer(example["query"], **tokenizer_options)

# Tokenize every query in batches; the result keeps the original columns and
# adds the tokenizer's outputs.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Ensure padding token is set correctly
# (Same assignment as in the tokenizer-loading cell; repeated here.)
tokenizer.pad_token = tokenizer.eos_token

# NOTE: rebinds the module-level name `config` (previously an AutoConfig) to
# the PPO hyper-parameters.
config = PPOConfig(
    model_name="./ft-model",
    learning_rate=1.41e-5,
    batch_size=8,
)

# PPO Trainer
ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Sampling settings passed to model.generate for every rollout: at most 30 new
# tokens, sampled (do_sample=True).
# NOTE(review): top_k=0.0 / top_p=1.0 presumably disable top-k/nucleus
# filtering and min_length=-1 presumably removes the length floor; confirm
# against the installed transformers version.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 30,
}

# Initialize lists to store the tensors
# Filled by the rollout loop, one entry per dataset example.
query_tensors = []
response_tensors = []
rewards = []

# Function to get reward score
def get_reward_score(text):
    """Return the reward model's softmax probability of class index 1.

    ``text`` is tokenized with the reward tokenizer (truncated to 512 tokens)
    and run through the reward model without gradient tracking. Only the
    result for the first input is returned.

    NOTE(review): which label index 1 stands for depends on the reward
    model's label mapping; confirm it is the class meant to be rewarded.

    Args:
        text: Text (or list of texts; only the first is scored).

    Returns:
        Probability of class 1 for the first input, as a Python float.
    """
    encoded = reward_tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    target_device = reward_model.device

    with torch.no_grad():
        model_output = reward_model(
            input_ids=encoded["input_ids"].to(target_device),
            attention_mask=encoded["attention_mask"].to(target_device),
        )

    class_probabilities = model_output.logits.softmax(dim=-1)
    first_input_scores = class_probabilities[:, 1].cpu().tolist()
    return first_input_scores[0]

print(len(tokenized_dataset))
if ppo_training:
    # Phase 1 - rollouts: sample one response per query and score it.
    # NOTE(review): every response is sampled from the initial policy before
    # any PPO update is applied; confirm this rollout-then-train split is
    # intended rather than regenerating per batch.

    # The policy's device does not change during the run; resolve it once.
    device = next(model.parameters()).device

    for i in range(len(tokenized_dataset)):
        query = tokenized_dataset[i]["input_ids"]
        attention_mask = tokenized_dataset[i]["attention_mask"]

        input_ids = torch.tensor(query).unsqueeze(0).to(device)
        attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(device)

        # Generate response
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_kwargs
        )

        # For a causal LM the generated sequence starts with the prompt, so
        # the sampled continuation is everything after the prompt length.
        # Slicing keeps the exact sampled token ids; decoding, removing the
        # prompt by string replacement and re-encoding can yield other tokens.
        # The response is also left unpadded: padding it to max_length would
        # make PPO optimise over padding tokens.
        response_tensor = generated_ids[0][input_ids.shape[1]:]

        response = tokenizer.decode(response_tensor, skip_special_tokens=True).strip()
        original_query = tokenizer.decode(query, skip_special_tokens=True)

        # Get reward score
        reward = get_reward_score(original_query + response)

        # Drop the left padding from the query so the trainer only sees real
        # prompt tokens (the attention mask marks them with 1).
        query_tensor = input_ids[0][attention_mask[0].bool()]

        # Store tensors and reward in lists
        query_tensors.append(query_tensor)
        response_tensors.append(response_tensor)
        rewards.append(torch.tensor([reward]).to(device))
        if i % 30 == 0:
            print(i, "out of", len(tokenized_dataset))

    # Phase 2 - optimisation: feed the collected rollouts to PPO in batches.
    # Batch size
    batch_size = config.batch_size

    # Split data into batches and ensure they are lists of tensors
    for i in range(0, len(query_tensors), batch_size):
        query_batch = query_tensors[i:i + batch_size]
        response_batch = response_tensors[i:i + batch_size]
        reward_batch = rewards[i:i + batch_size]

        # Only full batches are passed to the trainer; a short tail is skipped.
        if len(query_batch) < batch_size:
            print("Skipping last batch due to less than batch size.")
            continue

        # Use these tensors with ppo_trainer
        stats = ppo_trainer.step(query_batch, response_batch, reward_batch)

        # Log stats
        ppo_trainer.log_stats(stats, {}, reward_batch)
        print("Batch processed:", i + batch_size, "of", len(query_tensors))


In [None]:
# Save model
# Only when training ran in this session; the inference cell loads from the
# same directory ("./rlhf-model").
if ppo_training:
  ppo_trainer.save_pretrained("rlhf-model")

In [None]:
import os
import json
import csv
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from rouge import Rouge

# Check if rlhf_inference is True and ppo_training is False
# Inference on the saved RLHF model: generate a response for every evaluation
# example, clean it up, score it with ROUGE against the expected response, and
# write the responses to CSV and the scores to JSON.
if rlhf_inference and not ppo_training:
    model_name = "./rlhf-model"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, load_in_4bit=True)
    # NOTE(review): `generator` is not used in this cell; generation below
    # calls model.generate directly.
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    # Load the evaluation dataset
    test_dataset = load_dataset("json", data_files="./dataset/rl_eval.json")["train"]

    # Set up ROUGE scoring
    rouge = Rouge()
    rlhf_rouge1_scores = []
    rlhf_rouge2_scores = []
    rlhf_rougel_scores = []
    rlhf_model_responses = []

    # Ensure the output directory exists
    output_dir = "./inferences"
    os.makedirs(output_dir, exist_ok=True)
    csv_output_file = os.path.join(output_dir, "rlhf_model_responses.csv")

    # The response is cut at the first occurrence of each marker, in this
    # order (text after a marker is treated as an unwanted continuation).
    truncation_markers = ("###", "\n\nK", "Kalimat Awal:", "Kalimat Baru:", " .")

    # Create and write to the CSV file
    with open(csv_output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["prompt", "expected_response", "model", "response"])

        # Iterate through the evaluation dataset
        for example in test_dataset:
            # Extract prompt and expected response
            prompt = example["text"].split("### USER:\n")[1].split("\n\n### RESPONSE:\n")[0]
            expected_response = example["text"].split("### RESPONSE:\n")[1]

            # Generate model's response
            full_prompt = "### USER:\n" + prompt + "\n\n### RESPONSE:\n"
            tokens = tokenizer(full_prompt, return_tensors="pt", return_attention_mask=True)

            # Inputs must be on the model's device, and the attention mask is
            # passed explicitly so generate() does not have to infer it.
            input_ids = tokens["input_ids"].to(model.device)
            attention_mask = tokens["attention_mask"].to(model.device)

            # Generate rlhf_model's response
            rlhf_output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=30, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, num_return_sequences=1)
            rlhf_model_response = tokenizer.decode(rlhf_output[0], skip_special_tokens=True)

            # Remove the prompt from responses
            expected_response = expected_response.replace("<|endoftext|>", "").strip()
            rlhf_model_response = rlhf_model_response.replace(full_prompt, "").strip()

            # Handle extra and unused responses in the model's response
            for marker in truncation_markers:
                if marker in rlhf_model_response:
                    rlhf_model_response = rlhf_model_response.split(marker)[0].strip()

            # The ROUGE scorer needs a non-empty hypothesis.
            if rlhf_model_response == "":
                rlhf_model_response = "-"

            print("Expec:", expected_response)
            print("Model:", rlhf_model_response)

            # Calculate ROUGE scores for rlhf_model
            rlhf_scores = rouge.get_scores(rlhf_model_response, expected_response)[0]
            rlhf_rouge1_scores.append(rlhf_scores["rouge-1"]["f"])
            rlhf_rouge2_scores.append(rlhf_scores["rouge-2"]["f"])
            rlhf_rougel_scores.append(rlhf_scores["rouge-l"]["f"])

            writer.writerow([prompt, expected_response, "rlhf", rlhf_model_response])
            rlhf_model_responses.append(rlhf_model_response)

    # Save ROUGE scores to a JSON file
    output_file = "./eval_score/rlhf_rouge.json"
    # Create the target directory first, as is done for ./inferences above;
    # otherwise open() fails when ./eval_score does not exist yet.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    data = {
        "RLHF Model Responses": rlhf_model_responses,
        "RLHF Model ROUGE Scores": {
            "ROUGE-1 Scores": rlhf_rouge1_scores,
            "ROUGE-2 Scores": rlhf_rouge2_scores,
            "ROUGE-L Scores": rlhf_rougel_scores
        }
    }

    with open(output_file, "w") as f:
        json.dump(data, f, indent=4)


In [None]:
def _average(scores):
    """Return the arithmetic mean of ``scores``, or NaN for an empty list."""
    return sum(scores) / len(scores) if scores else float("nan")


# Evaluation report: print the average ROUGE scores of the base, fine-tuned
# and RLHF models from their saved JSON files, then show the first few
# evaluation examples side by side.
if print_evaluation:
    test_dataset = load_dataset("json", data_files="./dataset/rl_eval.json")["train"]
    # Output file paths
    base_output_file = "./eval_score/base_rouge.json"
    ft_output_file = "./eval_score/ft_rouge.json"
    rlhf_output_file = "./eval_score/rlhf_rouge.json"

    # Read base model ROUGE scores
    with open(base_output_file, "r") as f_base:
        base_data = json.load(f_base)
    rouge1_base_scores = base_data["Base Model ROUGE Scores"]["ROUGE-1 Scores"]
    rouge2_base_scores = base_data["Base Model ROUGE Scores"]["ROUGE-2 Scores"]
    rougel_base_scores = base_data["Base Model ROUGE Scores"]["ROUGE-L Scores"]

    # Read ft_model ROUGE scores
    with open(ft_output_file, "r") as f_ft:
        ft_data = json.load(f_ft)
    rouge1_ft_scores = ft_data["Fine-Tuned Model ROUGE Scores"]["ROUGE-1 Scores"]
    rouge2_ft_scores = ft_data["Fine-Tuned Model ROUGE Scores"]["ROUGE-2 Scores"]
    rougel_ft_scores = ft_data["Fine-Tuned Model ROUGE Scores"]["ROUGE-L Scores"]

    # Read rlhf_model ROUGE scores
    with open(rlhf_output_file, "r") as f_rlhf:
        rlhf_data = json.load(f_rlhf)
    rouge1_rlhf_scores = rlhf_data["RLHF Model ROUGE Scores"]["ROUGE-1 Scores"]
    rouge2_rlhf_scores = rlhf_data["RLHF Model ROUGE Scores"]["ROUGE-2 Scores"]
    rougel_rlhf_scores = rlhf_data["RLHF Model ROUGE Scores"]["ROUGE-L Scores"]

    # Calculate average ROUGE scores for models
    # (_average returns NaN instead of raising when a score list is empty.)
    avg_rouge1_base = _average(rouge1_base_scores)
    avg_rouge2_base = _average(rouge2_base_scores)
    avg_rougel_base = _average(rougel_base_scores)

    avg_rouge1_ft = _average(rouge1_ft_scores)
    avg_rouge2_ft = _average(rouge2_ft_scores)
    avg_rougel_ft = _average(rougel_ft_scores)

    avg_rouge1_rlhf = _average(rouge1_rlhf_scores)
    avg_rouge2_rlhf = _average(rouge2_rlhf_scores)
    avg_rougel_rlhf = _average(rougel_rlhf_scores)

    # Print average ROUGE scores for models
    print("Average ROUGE-1 F1 Score (model):", round(avg_rouge1_base, 4))
    print("Average ROUGE-2 F1 Score (model):", round(avg_rouge2_base, 4))
    print("Average ROUGE-L F1 Score (model):", round(avg_rougel_base, 4))

    print("Average ROUGE-1 F1 Score (ft_model):", round(avg_rouge1_ft, 4))
    print("Average ROUGE-2 F1 Score (ft_model):", round(avg_rouge2_ft, 4))
    print("Average ROUGE-L F1 Score (ft_model):", round(avg_rougel_ft, 4))

    print("Average ROUGE-1 F1 Score (rlhf_model):", round(avg_rouge1_rlhf, 4))
    print("Average ROUGE-2 F1 Score (rlhf_model):", round(avg_rouge2_rlhf, 4))
    print("Average ROUGE-L F1 Score (rlhf_model):", round(avg_rougel_rlhf, 4))

    # Never index past the shortest of the data sources used per example.
    available_examples = min(
        len(test_dataset),
        len(base_data["Base Model Responses"]),
        len(ft_data["Fine-Tuned Model Responses"]),
        len(rlhf_data["RLHF Model Responses"]),
        len(rouge1_base_scores), len(rouge2_base_scores), len(rougel_base_scores),
        len(rouge1_ft_scores), len(rouge2_ft_scores), len(rougel_ft_scores),
        len(rouge1_rlhf_scores), len(rouge2_rlhf_scores), len(rougel_rlhf_scores),
    )
    n_examples = min(examples_to_compare, available_examples)
    if n_examples < examples_to_compare:
        print(f"Only {n_examples} of the requested {examples_to_compare} examples are available.")

    # Print evaluation results from file
    for idx in range(n_examples):
        example = test_dataset[idx]
        # Extract prompt and expected response
        text = example["text"]
        prompt_start = text.find("### USER:") + len("### USER:")
        prompt_end = text.find("### RESPONSE:")
        prompt = text[prompt_start:prompt_end].strip()
        expected_response_start = text.find("### RESPONSE:") + len("### RESPONSE:")
        expected_response = text[expected_response_start:].strip()
        expected_response = expected_response.replace("<|endoftext|>", "").strip()

        # Look up the stored responses of the three models for this example
        base_model_response = base_data["Base Model Responses"][idx]
        ft_model_response = ft_data["Fine-Tuned Model Responses"][idx]
        rlhf_model_response = rlhf_data["RLHF Model Responses"][idx]

        # Keep only what follows the last response marker in the base output
        base_model_response = base_model_response.split("\n### RESPONSE:\n")[-1].strip()

        # Print the prompt and expected response
        print(f"\nExample {idx + 1}:")
        print("Prompt:", prompt)
        print("\nExpected   :", expected_response)

        # Print the three model responses
        print("Base Model :", base_model_response)
        print("Fine-Tuned :", ft_model_response)
        print("RLHF Model :", rlhf_model_response)

        # Print the stored per-example ROUGE scores of each model
        print("\n(Base Model):")
        print("ROUGE-1:", round(rouge1_base_scores[idx], 3))
        print("ROUGE-2:", round(rouge2_base_scores[idx], 3))
        print("ROUGE-L:", round(rougel_base_scores[idx], 3))

        print("(Fine-Tuned Model):")
        print("ROUGE-1:", round(rouge1_ft_scores[idx], 3))
        print("ROUGE-2:", round(rouge2_ft_scores[idx], 3))
        print("ROUGE-L:", round(rougel_ft_scores[idx], 3))

        print("(RLHF Model):")
        print("ROUGE-1:", round(rouge1_rlhf_scores[idx], 3))
        print("ROUGE-2:", round(rouge2_rlhf_scores[idx], 3))
        print("ROUGE-L:", round(rougel_rlhf_scores[idx], 3))

        print("\n-----------------------------------------")