# Imports and installation


In [None]:
%%capture
!pip install datasets transformers accelerate evaluate bleu bitsandbytes peft sentencepiece trl

In [None]:
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import BartTokenizer, BartForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, Seq2SeqTrainer
from torch.utils.data import DataLoader
from typing import Dict, List, Tuple
from dataclasses import dataclass
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer

# Fixed seed shared by the torch RNG below and by the dataset splits.
SEED = 999
# NOTE(review): BATCH_SIZE is not referenced by any cell visible here (the
# training cell defines its own lowercase `batch_size`) — confirm it is needed.
BATCH_SIZE = 32
# Seed torch's default generator. As the last expression in the cell, the
# returned Generator object is echoed as the cell output.
torch.manual_seed(SEED)



<torch._C.Generator at 0x78088baef4f0>

In [None]:
# Mount Google Drive under /content/drive; the dataset CSV is read from there
# and the fine-tuned model is saved there by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

In [None]:
# Load the dataset from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/kaggle/dataset_small.csv')

In [None]:
# Keep only the first 8000 rows (rebinding `df` to the truncated frame).
df = df[:8000]

In [None]:
# Wrap the DataFrame as a Hugging Face Dataset.
ds = Dataset.from_pandas(df)

# Stage 1: hold out 20% of the rows; stage 2: cut that hold-out in half so the
# result is an 80/10/10 train/valid/test partition. Both stages reuse SEED.
ds_train_test = ds.train_test_split(test_size=0.2, seed=SEED)
held_out = ds_train_test['test']
ds_test_dev = held_out.train_test_split(test_size=0.5, seed=SEED)

# Assemble the three named splits in train / valid / test order.
ds_splits = DatasetDict()
ds_splits['train'] = ds_train_test['train']
ds_splits['valid'] = ds_test_dev['train']
ds_splits['test'] = ds_test_dev['test']

# Display the split summary as the cell output.
ds_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'zip_base64'],
        num_rows: 6400
    })
    valid: Dataset({
        features: ['text', 'zip_base64'],
        num_rows: 800
    })
    test: Dataset({
        features: ['text', 'zip_base64'],
        num_rows: 800
    })
})

In [None]:
# Peek at the first training row to check its `text` / `zip_base64` fields.
ds_splits['train'][0]

{'text': "I just finished watching this movie. It wasn't ridiculously bad, but I'm really disappointed with it. I'm not really sure",
 'zip_base64': 'N3q8ryccAATQSj0e5AAAAAAAAAAUAAAAAAAAAJnCjKTgAHgAal0AJIgJR1PJ8O1H1PJcPUPnWHGVXKtkogJNo3RjgtLw7lwcRJ3+tKO2xjS3ExamC6F58x6owb6WpDXBGlImcP6AuNPgYgh2fzhYELdxOQrXHbrbAL78/7wHsOePCjXSLTMsMGMMqHU7gh9kAADgAHoAal0AAIEzB64P1HgnLUIQ+iJr+bpMmobtlEle4PwZs1yF4clDL2x9EzdYLOD2k9umhWcIQPxiti3EhgrrfKPrnhFhNjAo9SrYVnVm8mzf4MfkKK7X3UrLHkSQOOTCuEqC3h2DeR5XXgiasEJMAAAXBnIBCXIABwsBAAEhIQEYDHsAAA=='}

# Model

In [None]:
# Checkpoint that is loaded and fine-tuned in this notebook.
model_name = "facebook/opt-350m"

# Quantization settings: 4-bit NF4 weights, float16 compute, no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

# Load the causal LM with the quantization config and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# NOTE(review): `pretraining_tp` looks like a setting carried over from another
# model family's recipe — confirm this architecture actually reads it.
model.config.pretraining_tp = 1
# Switch the generation cache off on the model config.
model.config.use_cache = False

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer that matches `model_name`.
# NOTE(review): trust_remote_code=True allows code shipped with a checkpoint to
# run locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): if a collator masks pad positions in the labels, EOS would be
# masked as well — verify this is intended for the training setup used below.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, sums ``numel()`` over every parameter
    and separately over those with ``requires_grad`` set, then prints both
    totals and the trainable percentage on a single line.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs; each ``param`` must provide ``numel()``
            and ``requires_grad``.

    Returns:
        None. The summary is written to stdout.

    NOTE(review): for quantized weights ``numel()`` presumably counts packed
    storage elements, so the "all params" figure can be smaller than the
    model's nominal size — confirm before quoting the number.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# Attach LoRA adapters to the quantized model.
# First run PEFT's k-bit training preparation on the loaded model.
model = prepare_model_for_kbit_training(model)
# LoRA hyperparameters: rank 16, alpha 16, 10% adapter dropout, no bias terms,
# adapters placed on the four attention projection layers listed below.
config = LoraConfig(
    r=16,
    lora_alpha=16,
    # Alternative, wider target list (disabled):
    #target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    target_modules=["q_proj", "k_proj","v_proj","o_proj"],
    lora_dropout=0.1,
    bias="none",
    # Disabled. NOTE(review): the original remark says this is needed when new
    # tokens are added to the tokenizer/model; no cell visible here adds tokens.
    #modules_to_save = ["lm_head", "embed_tokens"],
    task_type="CAUSAL_LM"
)

# Wrap the prepared model with the adapter config and report the trainable share.
lora_model = get_peft_model(model, config)
lora_model.config.use_cache = False
print_trainable_parameters(lora_model)

trainable params: 2359296 || all params: 182036480 || trainable%: 1.2960567024807335


In [None]:
def formatting_func_with_response(example):
  """Render one dataset row as a complete prompt-plus-answer training string.

  Args:
    example: Mapping with a ``text`` entry (the input) and a ``zip_base64``
      entry (the target). Both are substituted into the template with
      ``str.format_map``.

  Returns:
    The instruction preamble, an ``### Input:`` section holding ``text`` and a
    ``### Response:`` section holding ``zip_base64``, terminated by a literal
    ``</s>`` marker.

  Raises:
    KeyError: If ``example`` has no ``text`` or ``zip_base64`` key.
  """
  # Adjacent literals are concatenated verbatim, so the space separating the
  # first two sentences has to be written explicitly at the end of the first.
  template = (
      "Take a breath and think step by step. Below is an input text that must be encoded in a novel way that has never been used before. "
      "Write a response that appropriately completes the request.\n\n"
      "### Input:\n{text}\n\n### Response:{zip_base64}\n</s>"
  )
  return template.format_map(example)

In [None]:

# Training hyperparameters for this run.
batch_size=8  # per-device training batch size (evaluation uses half of it)
gradient_accumulation_steps=1  # optimizer step after every batch
num_train_epochs=3


training_args = TrainingArguments(
    output_dir="./output/",
    #report_to="wandb",  # this tells the Trainer to log the metrics to W&B
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,
    # Mixed precision in float16 (bf16 explicitly disabled).
    fp16=True,
    bf16=False,
    # Cosine schedule with the first 10% of steps used for warm-up.
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio = 0.1,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    evaluation_strategy="epoch",
    num_train_epochs=num_train_epochs,
    # logging strategies
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="epoch", # saving is done at the end of each epoch
)


# Train the adapter-wrapped model that `get_peft_model` already produced.
# The LoRA layers are in place on `lora_model`, so no `peft_config` is passed:
# handing the trainer the bare `model` together with `peft_config` would ask it
# to apply the adapter configuration to an already-adapted model a second time.
trainer = SFTTrainer(
    args=training_args,
    model=lora_model,
    train_dataset=ds_splits['train'],
    eval_dataset=ds_splits['valid'],
    formatting_func=formatting_func_with_response,
    tokenizer=tokenizer,
    max_seq_length=300,
    packing=True
)

In [None]:
# Run fine-tuning with the arguments and datasets configured on `trainer`.
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
# Fold the LoRA adapter weights into the base model and drop the PEFT wrapper.
# The guard keeps the cell re-runnable: once merged, the rebound `lora_model`
# no longer offers `merge_and_unload`, so a second run is a no-op rather than
# an AttributeError.
if hasattr(lora_model, "merge_and_unload"):
    lora_model = lora_model.merge_and_unload()

In [None]:
# Persist the model to the mounted Drive folder.
# NOTE(review): only the model is saved here; the tokenizer is not written
# alongside it — confirm whether that is intended.
lora_model.save_pretrained("/content/drive/MyDrive/DLProject/pretrained1")

# Test SFTTrainer

In [None]:
# Build the prompt from the same template the model was fine-tuned on, cut
# right after the response marker so the model has to produce the encoding.
response_marker = "### Response:"
rendered = formatting_func_with_response({"text": "Hello world", "zip_base64": ""})
prompt = rendered[: rendered.index(response_marker) + len(response_marker)]
batch = tokenizer(prompt, return_tensors='pt').to("cuda")

# Make sure dropout is disabled while sampling.
model.eval()

# `torch.autocast(device_type="cuda")` is the current spelling of the
# deprecated `torch.cuda.amp.autocast()`.
with torch.autocast(device_type="cuda"):
  output = model.generate(
      **batch,
      max_new_tokens=100,
      top_p=1.0,
      top_k=30,
      temperature=1.0,
      do_sample=True,
  )

# Print every generated sequence (prompt included) as plain text.
for seq in output:
  print(tokenizer.decode(seq, skip_special_tokens=True), "\n")

N3q8ryccAAQsJ4nazQAAAAAAAAAUAAAAAAAAAERiyQHgAFgAU10AJ5mABjO28vX8oV2n3Q/j3+6ZzN3nF1h2Xh5bWwLs4x+f/Dt0H5x3k5sY4aZjL1FZ2vY1bxSZ3h6q7hv7cZj7XuY/aUe/qEZ/8uQN8hk/4r0h+Jj+AuXQQvKxwAA4AB6AGpdAACBMweuD9NPX71Au5RkHH7rYmP2uZeSKt2d+G+S4DsKQQEiO0sLnQe9Dc0f1+8hXZWJjXBhEtD8f6X2+gZJz8QX4f0fQQrSgVkK0Q/7bX9yvXnx2FpNj5dH1n4j+W/gFc3nW3/4mw1gPfM+AAAAFwZbAQlyAAcLAQ

N3q8ryccAAQ37taQ6gAAAAAAAAAUAAAAAAAAABYHCbbgAHUAcF0AJIgKRlMq7IiH+reoV8P4Xq3LEPcxvUdTlfDjtQywLESHvwD4mJC43UY+sxdTHXq9UmtgozbdddMLqBNz2FDwSZzZSrnNin1cIlgTnm+67wt1nX7bVWrx1IcYBuK65zwehttG0vR6AW/tBlPogFHbAADgAHoAal0AAIEzB64P1MWUwUIQ+iJr+bpMmobtlEle4I7GBUx8KuDBj+wHyaCP2r74gOhBgxyUH14lAuNpJD1Vjx07/1mudj+hkswkfAKgQRlUsMdZ5sWZnTc8vTgx1+3n8cvlr3u/CMXCOfyLwkQAAAAXBngBCXIABwsBAAEhIQEYDHsAAA==