In [None]:
import os
import time

import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification
from trl import RewardTrainer

### Config

In [None]:
# Training-side switches.
# reward_training: True fine-tunes the reward model from the hub checkpoint;
#                  False loads the previously saved model from ./reward_model.
# checkInitialAccuracy: together with reward_training, runs the accuracy check
#                  on the test pairs before training.
# saveModel: logs in to the Hugging Face Hub and saves/pushes the model.
checkInitialAccuracy = False
saveModel = False
reward_training = False

# Inference-side switch: score ./inferences/ft_model_rl_data_for_PPO.csv
# with the reward model and write the scored CSV next to it.
score_ft_model_inferences = True

### Reward Training

In [None]:
# Authenticate against the Hugging Face Hub (only needed when saving/pushing).
if saveModel:
  from huggingface_hub import login
  # The token is read from the environment so that no credential is stored in
  # the notebook. NOTE: a token was previously hard-coded in this cell; treat
  # it as compromised and revoke it in the Hugging Face account settings.
  access_token = os.environ.get("HF_TOKEN")
  if not access_token:
    raise RuntimeError(
      "saveModel is enabled but the HF_TOKEN environment variable is not set; "
      "export a Hugging Face token with write access before running this cell."
    )
  login(token=access_token)

In [None]:
# Base checkpoint. Its tokenizer is used in both modes.
# Other checkpoints previously tried here:
#   "cahya/distilbert-base-indonesian"
#   "distilbert-base-uncased"
model_name = "w11wo/indonesian-roberta-base-sentiment-classifier"

# Training starts from the hub checkpoint; otherwise the locally saved reward
# model is loaded with 8-bit weights. Both paths request a 3-label head.
if reward_training:
    reward_model_source, reward_model_load_kwargs = model_name, {}
else:
    reward_model_source, reward_model_load_kwargs = "./reward_model", {"load_in_8bit": True}

model = AutoModelForSequenceClassification.from_pretrained(
    reward_model_source, num_labels=3, **reward_model_load_kwargs
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# When the tokenizer defines no pad token, reuse EOS for padding and mirror
# that choice in the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Load the RLHF dataset and turn it into tokenized (chosen, rejected) pairs
if reward_training:
    dataset_path = "./dataset/4-way_comparison_labelled.csv"
    df = pd.read_csv(dataset_path)

    # Split the labelled rows BEFORE expanding them into pairs, so pairs that
    # come from the same prompt row never end up on both sides of the split.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=127)

    def prepare_rlhf_data(df):
        """Expand 4-way comparison rows into pairwise preference records.

        Each input row provides a prompt in column 'x', four candidate
        responses in columns 'y0'..'y3', and the index of the preferred
        candidate in column 'b'. For every non-preferred candidate whose text
        differs from the preferred one, a record with the keys "instruction",
        "chosen_response" and "rejected_response" is produced.

        Returns:
            A `datasets.Dataset` built from the list of records.
        """
        rows = []
        for _, row in df.iterrows():
            prompt = row['x']
            # Normalise the label to a plain int: a float label (e.g. 2.0)
            # would otherwise build the column name 'y2.0' and never compare
            # equal to the integer loop index below.
            chosen_idx = int(row['b'])
            chosen_response = row[f'y{chosen_idx}']

            for i in range(4):
                if i == chosen_idx:
                    continue
                rejected_response = row[f'y{i}']
                # Identical texts carry no preference signal; skip them.
                if chosen_response != rejected_response:
                    rows.append({
                        "instruction": prompt,
                        "chosen_response": chosen_response,
                        "rejected_response": rejected_response
                    })

        return Dataset.from_list(rows)

    # Prepare RLHF data for training and test sets
    train_rlhf_dataset = prepare_rlhf_data(train_df)
    test_rlhf_dataset = prepare_rlhf_data(test_df)

    def formatting_func(examples):
        """Tokenize a batch of preference records for the reward trainer.

        Args:
            examples: batched mapping with the list-valued keys "instruction",
                "chosen_response" and "rejected_response".

        Returns:
            A dict with "input_ids_chosen", "attention_mask_chosen",
            "input_ids_rejected" and "attention_mask_rejected"; each value
            holds one sequence per example, padded/truncated to 512 tokens.
        """
        kwargs = {"padding": "max_length", "truncation": True, "max_length": 512}

        instructions = examples["instruction"]
        chosen_texts = [
            instruction + "\n" + chosen
            for instruction, chosen in zip(instructions, examples["chosen_response"])
        ]
        rejected_texts = [
            instruction + "\n" + rejected
            for instruction, rejected in zip(instructions, examples["rejected_response"])
        ]

        # One batched tokenizer call per side instead of one call per example;
        # with fixed-length padding the resulting ids and masks are the same.
        tokens_chosen = tokenizer(chosen_texts, **kwargs)
        tokens_rejected = tokenizer(rejected_texts, **kwargs)

        return {
            "input_ids_chosen": tokens_chosen["input_ids"],
            "attention_mask_chosen": tokens_chosen["attention_mask"],
            "input_ids_rejected": tokens_rejected["input_ids"],
            "attention_mask_rejected": tokens_rejected["attention_mask"]
        }

    # Format the datasets
    formatted_train_dataset = train_rlhf_dataset.map(formatting_func, batched=True)
    formatted_test_dataset = test_rlhf_dataset.map(formatting_func, batched=True)

    # Combine the formatted datasets into a dictionary
    formatted_dataset = {
        "train": formatted_train_dataset,
        "test": formatted_test_dataset
    }

    # Print the first example from the formatted training dataset.
    # Row-first indexing reads a single record instead of whole columns.
    first_example = formatted_dataset["train"][0]
    print("Example 1:")
    print("Input IDs (Chosen):", first_example["input_ids_chosen"])
    print("Attention Mask (Chosen):", first_example["attention_mask_chosen"])
    print("Input IDs (Rejected):", first_example["input_ids_rejected"])
    print("Attention Mask (Rejected):", first_example["attention_mask_rejected"])
    print()

    print("Number of training examples:", len(formatted_dataset["train"]))
    print("Number of testing examples:", len(formatted_dataset["test"]))

In [None]:
# Pairwise accuracy of the (not yet fine-tuned) model on the held-out pairs
if reward_training and checkInitialAccuracy:
    # Define predict_response function
    def predict_response(model, tokenizer, input_ids, attention_mask):
        """Run `model` on one tokenized batch without gradients and return its logits.

        `tokenizer` is accepted for signature compatibility but is not used.
        """
        inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        return logits

    # Assuming formatted_dataset is already defined and split into train and test
    test_dataset = formatted_dataset['test']

    # Initialize variables to store predicted responses and ground truth labels
    predicted_responses = []
    ground_truth_labels = []

    # Disable dropout so the scores are deterministic, and feed the inputs on
    # whatever device the model currently lives on.
    model.eval()
    device = model.device

    # Loop through each example in the test dataset
    for i in range(len(test_dataset)):
        # Fetch the record once; indexing a column first would materialise the
        # whole column on every iteration.
        example = test_dataset[i]
        input_ids_chosen = torch.tensor(example['input_ids_chosen']).unsqueeze(0).to(device)
        attention_mask_chosen = torch.tensor(example['attention_mask_chosen']).unsqueeze(0).to(device)
        input_ids_rejected = torch.tensor(example['input_ids_rejected']).unsqueeze(0).to(device)
        attention_mask_rejected = torch.tensor(example['attention_mask_rejected']).unsqueeze(0).to(device)

        # Predict chosen / rejected response logits
        logits_chosen = predict_response(model, tokenizer, input_ids_chosen, attention_mask_chosen).squeeze()
        logits_rejected = predict_response(model, tokenizer, input_ids_rejected, attention_mask_rejected).squeeze()

        # The score of a response is the sum of its logits
        chosen_sum = logits_chosen.sum().item()
        rejected_sum = logits_rejected.sum().item()

        if chosen_sum > rejected_sum:
            predicted_response = 'chosen_response'
        else:
            predicted_response = 'rejected_response'

        predicted_responses.append(predicted_response)
        # Every pair is stored as (chosen, rejected), so the human-preferred
        # side, i.e. the ground truth, is always the chosen response.
        ground_truth_labels.append('chosen_response')

    # Calculate accuracy (guarding against an empty test set)
    correct_predictions = sum(1 for pred, true in zip(predicted_responses, ground_truth_labels) if pred == true)
    total_examples = len(ground_truth_labels)
    accuracy = correct_predictions / total_examples * 100 if total_examples else 0.0

    # Print the accuracy
    print(f"\nAccuracy: {accuracy:.2f}%")

In [None]:
# Configure and run reward-model training
if reward_training:
    training_args = TrainingArguments(
        output_dir="./train-reward-training",
        evaluation_strategy="steps",
        logging_steps=50,
        per_device_train_batch_size=4,
        num_train_epochs=8,
        learning_rate=1e-5,
        optim='adamw_torch',
        # Boolean, not the string 'True'.
        overwrite_output_dir=True,
        push_to_hub=saveModel,
        fp16=True,
        # "none" disables experiment-tracker integrations; passing None does
        # not disable them in transformers 4.x.
        report_to="none",
    )
    # Loading the RewardTrainer from TRL; it consumes the
    # input_ids_*/attention_mask_* columns of the formatted datasets.
    trainer = RewardTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=formatted_dataset["train"],
        eval_dataset=formatted_dataset["test"],
    )
    trainer.train()

In [None]:
if reward_training:
  # Keep and print the metrics: an expression nested inside an `if` is not
  # echoed as cell output, so the result would otherwise be lost.
  eval_metrics = trainer.evaluate()
  print(eval_metrics)

In [None]:
# Persist the trained reward model locally and publish it to the Hub.
# Only a model trained in this session is saved: with reward_training off,
# `model` is the 8-bit reload of ./reward_model and must not overwrite it.
if saveModel and reward_training:
  hub_repo_id = "Adzka/reward-model-distilbert-indo"
  model.save_pretrained("./reward_model")
  # Store the tokenizer next to the weights so the directory is self-contained.
  tokenizer.save_pretrained("./reward_model")
  # model/tokenizer.push_to_hub take the repo id as their first argument,
  # whereas Trainer.push_to_hub's first positional argument is the commit message.
  model.push_to_hub(hub_repo_id)
  tokenizer.push_to_hub(hub_repo_id)
elif saveModel:
  print("saveModel is set but reward_training is off; nothing was saved or pushed.")

In [None]:
# Inference helper shared by the scoring code below
def predict_response(model, tokenizer, input_ids, attention_mask):
    """Return the logits `model` produces for one tokenized batch.

    Args:
        model: callable accepting `input_ids` and `attention_mask` keyword
            arguments and returning an object with a `.logits` attribute.
        tokenizer: unused; kept so existing call sites stay valid.
        input_ids: token id tensor passed to the model.
        attention_mask: attention mask tensor passed to the model.

    Returns:
        The `.logits` of the model output, computed with gradient tracking
        disabled.
    """
    with torch.no_grad():
        result = model(input_ids=input_ids, attention_mask=attention_mask)
    return result.logits

# -----

# Score the fine-tuned model's generations with the reward model
if score_ft_model_inferences:
    # Load the query/response pairs to be scored
    dataset_path = "./inferences/ft_model_rl_data_for_PPO.csv"
    df = pd.read_csv(dataset_path)

    # Initialize variables to store predicted scores
    scores = []
    n_test = len(df)

    # Loop-invariant tokenizer settings (fixed-length padding to 512 tokens)
    kwargs = {"padding": "max_length", "truncation": True, "max_length": 512, "return_tensors": "pt"}

    # Make sure dropout is off; the model may still be in training mode when
    # it was trained earlier in the same session.
    model.eval()

    # Loop through each example in the dataset
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        # Empty CSV cells are read as NaN (a float); treat them as empty text
        # so the concatenation below cannot fail.
        query = "" if pd.isna(row['query']) else str(row['query'])
        response = "" if pd.isna(row['response']) else str(row['response'])
        prompt_plus_response = query + "\n" + response

        # Put the inputs on the model's device (the 8-bit model is loaded onto
        # the accelerator, while the tokenizer returns CPU tensors).
        tokens = tokenizer(prompt_plus_response, **kwargs).to(model.device)
        input_ids = tokens["input_ids"]
        attention_mask = tokens["attention_mask"]

        # The score of a response is the sum of the model's logits
        logits = predict_response(model, tokenizer, input_ids, attention_mask)
        score = logits.sum().item()

        # Append predicted score
        scores.append(score)

        print(f"Processed example {position} out of {n_test}")

    # Add 'score' column to dataframe
    df['score'] = scores

    # Save the updated dataframe to CSV
    output_csv_path = "./inferences/ft_model_rl_data_for_PPO_scored.csv"
    df.to_csv(output_csv_path, index=False)