In [1]:
# https://github.com/huggingface/trl/blob/main/examples/scripts/reward_modeling.py
# https://huggingface.co/docs/trl/v0.7.10/en/reward_trainer

# https://www.youtube.com/watch?v=_2qiJXUc798
# https://colab.research.google.com/github/githubpradeep/notebooks/blob/main/Reward_Model_for_RLHF_%2B_trl.ipynb

In [1]:
import os
import json
import pandas as pd
from unsloth import FastLanguageModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from datasets import Dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3", 
    local_files_only = True, 
    max_seq_length = 2048, # Choose any! We auto support RoPE Scaling internally!
    dtype = None, # None for auto detection. Float16 "or Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True, # Use 4bit quantization to reduce memory usage. Can be False.
)

peft_model_gpt = PeftModel.from_pretrained(base_model, "../fullRun/lorasConfig1/checkpoint-862/") 

==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.45.1.
   \\   /|    GPU: Quadro RTX 6000. Max memory: 23.462 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [4]:
# Function to load data from a folder
def load_data_from_folder(folder):
    data = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):  # Assuming JSON files
            file_path = os.path.join(folder, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                try:
                    file_data = json.load(f)
                    data.extend(file_data)  # Add all entries to the list
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file {filename}")
    return data

# Create the dataset for the reward trainer
def create_reward_trainer_dataset(good_data_folder, bad_data_folder, output_csv_file):
    # Load good data from folder
    good_data = load_data_from_folder(good_data_folder)
    
    # Load bad data from folder
    bad_data = load_data_from_folder(bad_data_folder)
    
    # Prepare the data for the reward trainer
    chosen_data = [f"Input: {entry['input']} | Output: {entry['output']}" for entry in good_data]
    rejected_data = [f"Input: {entry['input']} | Output: {entry['output']}" for entry in bad_data]
    
    # Ensure both columns have the same length by padding with empty strings if necessary
    max_len = max(len(chosen_data), len(rejected_data))
    chosen_data += [''] * (max_len - len(chosen_data))
    rejected_data += [''] * (max_len - len(rejected_data))
    
    # Create a DataFrame
    df = pd.DataFrame({
        'chosen': chosen_data,
        'rejected': rejected_data
    })
    
    # Save the DataFrame to a CSV file
    df.to_csv(output_csv_file, index=False, encoding='utf-8')
    print(f"Dataset saved to {output_csv_file}")

In [5]:
good_data_folder = "./good_data/"
bad_data_folder = "./bad_data/"
output_csv_file = "reward_trainer_dataset.csv"

create_reward_trainer_dataset(good_data_folder, bad_data_folder, output_csv_file)

In [6]:
# Need a train_dataset with columns "chosen" and "rejected"
train_dataset = pd.read_csv('reward_trainer_dataset.csv', encoding='utf-8')
train_dataset = Dataset.from_pandas(train_dataset)
print(train_dataset)

Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 22040
})


In [7]:
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# changed version from:
# https://colab.research.google.com/github/githubpradeep/notebooks/blob/main/Reward_Model_for_RLHF_%2B_trl.ipynb
# preprocesses data into specific form for reward_trainer
def preprocess_function(examples):
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }
    for chosen, rejected in zip(examples["chosen"], examples["rejected"]):
        if isinstance(chosen, str) and isinstance(rejected, str):
            tokenized_j = tokenizer(chosen, truncation=True, max_length=512, padding="max_length")
            tokenized_k = tokenizer(rejected, truncation=True, max_length=512, padding="max_length")

            # Only add if tokens exist
            if tokenized_j["input_ids"] and tokenized_k["input_ids"]:
                new_examples["input_ids_chosen"].append(tokenized_j["input_ids"])
                new_examples["attention_mask_chosen"].append(tokenized_j["attention_mask"])
                new_examples["input_ids_rejected"].append(tokenized_k["input_ids"])
                new_examples["attention_mask_rejected"].append(tokenized_k["attention_mask"])
        else:
            # Handle missing cases by adding zeros instead of empty lists
            new_examples["input_ids_chosen"].append([0] * 512)
            new_examples["attention_mask_chosen"].append([0] * 512)
            new_examples["input_ids_rejected"].append([0] * 512)
            new_examples["attention_mask_rejected"].append([0] * 512)

    return new_examples

In [9]:
print(train_dataset)
print(train_dataset[501]["rejected"])

Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 22040
})
Input: Was passiert mit dem Rückzahlungsanspruch für einen Pauschvergütungsvorschuss, wenn die Verjährung des Vergütungsanspruchs eingetreten ist? | Output: Was passiert mit dem Rückzahlungsanspruch für einen Pauschvergütungsvorschuss, wenn die Verjährung des Vergütungsanspruchs eingetreten ist?

Welche Ansprüche GmbH xxZ xy GbRH uG GmBv 201995 AnsprucheGmb H xNrG 02 AnstaltreuBundesG mbH  xY G bR H  u G m B v 4 AnstaltreuG m H y G aR U G mb H a4 TreuHandW xC HandW aG Handw xZ Hand W a G  aZ  Treug Hand w aCh Hand D Hand P Hand p Hand q Hand r Hand s Hand u Hand t Hand ü Hand v Hand y Hand z Hand ä Hand Ö Hand Ä Hand ö Hand Ø Hand ó HandÖ Hand × HandÜ Hand× Hand÷ Hand± Hand�� Hand¼ Hand½ Hand¾ Handß Hand¶ Hand· Hand§ Handà Handå Handù Handä Handø Handö Handõ Handō Handü Handüd Handý Handy Handž Handz Hand| Handwe Handwi Handwo Handx Handxy Handxx Handÿ Hand xzHand xə Handź Handż Hand Ž Hand ž Hand Ż Hand ż HandŽ Han

In [10]:
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)
train_dataset = train_dataset.filter(
    lambda x: len(x["input_ids_chosen"]) <= 512
    and len(x["input_ids_rejected"]) <= 512
)

print(train_dataset)
print(type(train_dataset))

Map (num_proc=4):   0%|          | 0/22040 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22040 [00:00<?, ? examples/s]

Dataset({
    features: ['chosen', 'rejected', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 22040
})
<class 'datasets.arrow_dataset.Dataset'>


In [11]:
import torch
torch.cuda.empty_cache()

# RewardTrainer benutzen, um unser Modell zu verbessern

## Model GPT

In [None]:
# have to transform the parameters in our lora adapters, so they can be changed by the reward trainer
# the code enables gradient tracking for all parameters
# but first they have to be floating-point type (like torch.float32), which is required for gradients to be computed

In [12]:
for param in peft_model_gpt.parameters():
    param.data = param.data.float()  # Convert to float if not already

In [13]:
for param in peft_model_gpt.parameters():
    param.requires_grad = True

In [None]:
for param in peft_model_gpt.parameters():
    print(param.requires_grad)

In [18]:
import os
import wandb

# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="sauLLM"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="checkpoint"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

In [19]:
from transformers import Trainer
from peft import LoraConfig
from trl import RewardTrainer, RewardConfig

# Step 1: Set up reward and LoRA configurations
reward_config = RewardConfig(
    output_dir="./model_gpt/train_logs",
    max_steps=100,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1.41e-5,
    optim="adamw_torch",
    save_steps=20,  # Save checkpoints during training to monitor, but we'll keep only the final model
    logging_steps=20,
    report_to="wandb",
    remove_unused_columns=False,
    max_length=512,
    gradient_checkpointing=True,
)

peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False, # has to be false or else the reward trainer can't adjust the parameters
    r=16,
    lora_alpha=16,
    lora_dropout=0
)

# Step 2: Initialize RewardTrainer with your LoRA-enhanced model
reward_trainer = RewardTrainer(
    model=peft_model_gpt,  
    tokenizer=tokenizer,
    args=reward_config,
    train_dataset=train_dataset,
    peft_config=peft_config,
)

# Step 3: Train the model with reward feedback
reward_trainer.train()

# Step 4: Save only the improved model
reward_trainer.model.save_pretrained("./improved_peft_model_gpt")

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 22,040 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 3,800,305,664
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mm-schmerle[0m ([33mm-schmerle-universit-t-[0m). Use [1m`wandb login --relogin`[0m to force relogin


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
20,0.0
40,0.0
60,0.0
80,0.0
100,0.0


[34m[1mwandb[0m: Adding directory to artifact (./model_gpt/train_logs/checkpoint-20)... Done. 4.5s
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
[34m[1mwandb[0m: Adding directory to artifact (./model_gpt/train_logs/checkpoint-40)... Done. 4.6s
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
[34m[1mwandb[0m: Adding directory to artifact (./model_gpt/train_logs/checkpoint-60)... Done. 4.6s
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
[34m[1mwandb[0m: Adding directory to artifact (./model_gpt/train_logs/checkpoint-80)... Done. 4.6s
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
[34m[1mwandb[0m: Adding directory to artifact (./model_gpt/train_logs

In [2]:
import torch

# Model and Tokenizer Loading
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-v0.3", 
    local_files_only=True, 
    max_seq_length=2048, 
    dtype=None, 
    load_in_4bit=True 
)

peft_model = PeftModel.from_pretrained(base_model, "../reinforcement_learning/improved_peft_model_gpt/") 

FastLanguageModel.for_inference(peft_model)
device = torch.device("cuda")

==((====))==  Unsloth 2024.9.post4: Fast Mistral patching. Transformers = 4.45.1.
   \\   /|    GPU: Quadro RTX 6000. Max memory: 23.462 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [3]:
input_text = "Was ist Mord?"

# Tokenize the input
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=tokenizer.model_max_length
).to(device)

# Generate output
with torch.no_grad():
    outputs = peft_model.generate(
        **inputs,
        max_length=2048,
        repetition_penalty=1.2,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

In [10]:
# Decode the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Was ist Mord?
