In [None]:
!pip install -r requirements.txt

In [None]:
!pip install datasets
!pip install transformers
!pip install einops
!pip install trl
!pip install huggingface_hub
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"
!pip -q uninstall transformers -y
!pip -q install transformers==4.47.1

## Loading the model with Unsloth

In [None]:
import torch
from unsloth import FastLanguageModel

# Release cached GPU memory left over from earlier work in this kernel.
torch.cuda.empty_cache()

# Report the device in use; falls back to CPU when CUDA is unavailable.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Gemma-2 9B base model in 4-bit through Unsloth's fast loader.
# dtype=None leaves the compute dtype to Unsloth's auto-detection.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/gemma-2-9b",
    dtype=None,
    load_in_4bit=True,
)

# Keep the tokenizer's own pad token when it has one. Reusing EOS as the pad
# token can cause pad-label masking during training to also mask the EOS that
# is appended to every example, so the model never learns to stop generating.
# EOS is only used as a fallback when no pad token is defined at all.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE: the former `tokenizer.chat_template = "alpaca"` assignment was removed.
# `chat_template` must hold a Jinja template; the literal string "alpaca"
# would make apply_chat_template() render the word "alpaca" for any
# conversation. Prompts in this notebook are built explicitly from
# `alpaca_prompt`, so no chat template is required here.

# Attach LoRA adapters so that only a small set of parameters is trained.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

In [None]:
# Alpaca-style prompt template. The three positional `{}` slots are filled,
# in order, with the instruction, the input context and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # appended to every training example


def formatting_prompts_func(examples):
    """Render a batch of Alpaca records into full training strings.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one formatted prompt per
        record, each terminated with EOS_TOKEN.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must terminate each example, otherwise generation goes on forever.
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in records
        ],
    }

from datasets import load_dataset

# Dataset for training: load the train split of Alpaca-cleaned and add a
# "text" column by running the prompt formatter over it in batches.
train_dataset = load_dataset("yahma/alpaca-cleaned", split="train").map(
    formatting_prompts_func,
    batched=True,
)

## SFT Training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Hyper-parameters for the supervised fine-tuning run.
training_args = TrainingArguments(
    output_dir="Gemma2-SFT",
    seed=3407,
    # Batching: 2 sequences per device, accumulated over 4 steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Optimiser and learning-rate schedule.
    optim="adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_steps=5,
    # num_train_epochs=1,  # Set this for 1 full training run.
    # Mixed precision: bf16 when the hardware supports it, otherwise fp16.
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    logging_steps=1,
)

# Build the supervised fine-tuning trainer over the formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=2048,
    packing=False,  # Can make training 5x faster for short sequences.
)

print("Starting training...")
trainer.train()

# Write the trained model and its tokenizer to the same directory.
# `model` is the LoRA-wrapped model here, so this presumably stores the
# adapter weights rather than a merged checkpoint — TODO confirm.
save_dir = "Gemma2_SFTtrained"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Trained model has been saved to {save_dir}")

## Testing after training

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

# Set model device to 'cuda'
# Make CUDA the default device for tensors created from here on.
torch.set_default_device("cuda")

# Reload the tokenizer and model from the directory written after training.
checkpoint_dir = "./Gemma2_SFTtrained"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)


In [None]:
import numpy as np

# NOTE(review): the model was fine-tuned on prompts rendered with the Alpaca
# template (`alpaca_prompt`), but this raw prompt bypasses that template —
# confirm this is intended.
prompt = "There is most likely going to be fog around\n A. a marsh B. a tundra C. the plains D. a desert\n Respond and give your confidence in percentage terms. In the format, Answer-(a letter option) and Confidence-(a percent figure)"

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Greedy decoding of up to 50 new tokens, keeping the per-step logits.
output = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=False,
    temperature=1.0,
    output_logits=True,
    return_dict_in_generate=True,
)

logits = output.logits

# Decoder-only models echo the prompt in `sequences`; skip it so that only
# the newly generated tokens remain.
input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
generated_tokens = output.sequences[:, input_length:]

# Softmax over the vocabulary at every generation step. These are plain
# probabilities, not log-probabilities.
probs = torch.stack(logits, dim=1).softmax(-1)
log_probs = probs  # legacy name kept for compatibility; holds probabilities

# Probability the model assigned to each token it actually generated.
gen_probs = torch.gather(probs, 2, generated_tokens[:, :, None]).squeeze(-1)

# Ordered (token text, probability in percent) pairs, one per generated token.
# A list is used because the same token text can occur more than once and a
# dict keyed by text would silently drop the earlier occurrences.
token_prob_pairs = []
# Backward-compatible mapping; for repeated tokens only the last probability
# survives, exactly as before.
token_probs = {}

for tok, score in zip(generated_tokens[0], gen_probs[0]):
    token_text = tokenizer.decode(tok)
    prob_percent = score.cpu().numpy() * 100
    token_prob_pairs.append((token_text, prob_percent))
    token_probs[token_text] = prob_percent

# Decode the full sequence (prompt plus generated continuation).
output_answer = tokenizer.batch_decode(output.sequences, skip_special_tokens=True)[0]

print(output_answer)