# Imports and installation


In [1]:
%%capture
# Install the third-party packages this notebook relies on; %%capture hides pip's output.
# NOTE(review): no versions are pinned, so a fresh run installs whatever releases are
# current at that time — consider pinning once a known-good combination is identified.
!pip install datasets transformers accelerate evaluate bleu bitsandbytes peft sentencepiece trl

In [2]:
import torch
from datasets import DatasetDict, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer
import pandas as pd

# Seed used for torch here and for the dataset splits in the data-preparation cell.
SEED = 999
# NOTE(review): BATCH_SIZE is not referenced in the visible cells; the training cell
# defines its own lowercase `batch_size`.
BATCH_SIZE = 32
torch.manual_seed(SEED)

# Dataset selector flags. The data-preparation cell checks them in the order
# SHORT, RANDOMIZED_SHORT, MEDIUM and reads the CSV of the first flag that is True.
SHORT = False
RANDOMIZED_SHORT = True
MEDIUM = False

<torch._C.Generator at 0x7a63702765f0>

# Data Preparation

In [None]:
# Download the datasets
# wget saves the archive as /content/datasets.zip; unzip -o then extracts it into the
# current working directory, overwriting existing files without prompting.
!wget -O /content/datasets.zip https://github.com/Tommaiberone/Zip-generation/raw/main/Datasets/datasets.zip
!unzip -o /content/datasets.zip

In [3]:
# Read the CSV that belongs to the active dataset flag (the first True flag wins).
# Each flag now maps to the file of the same name: previously SHORT read the
# "medium" file and MEDIUM read the "short" file.
if SHORT:
  df = pd.read_csv('/content/shorthex2hex.csv')
elif RANDOMIZED_SHORT:
  df = pd.read_csv('/content/randomized_shorthex2hex.csv')
elif MEDIUM:
  df = pd.read_csv('/content/mediumhex2hex.csv')
else:
  # Fail loudly instead of hitting a NameError below (or silently reusing a stale
  # `df` left in the kernel by an earlier run).
  raise ValueError("No dataset selected: set one of SHORT, RANDOMIZED_SHORT or MEDIUM to True")

# Keep at most the first 40960 rows.
df = df[:40960]

# Convert to a Hugging Face Dataset and drop the "text" column, leaving the
# remaining columns for training.
ds = Dataset.from_pandas(df)
ds = ds.remove_columns("text")

# 80% train; the remaining 20% is halved into validation and test.
# Both splits are seeded so the partition is reproducible.
ds_train_test = ds.train_test_split(test_size=0.2, seed=SEED)
ds_test_dev = ds_train_test['test'].train_test_split(test_size=0.5, seed=SEED)
ds_splits = DatasetDict({
    'train': ds_train_test['train'],
    'valid': ds_test_dev['train'],
    'test': ds_test_dev['test']
})

ds_splits

DatasetDict({
    train: Dataset({
        features: ['text_hex', 'deflate_hex'],
        num_rows: 40000
    })
    valid: Dataset({
        features: ['text_hex', 'deflate_hex'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text_hex', 'deflate_hex'],
        num_rows: 5000
    })
})

In [4]:
ds_splits['train'][0]

{'text_hex': '4669727374206f6620616c6c2c2074686973',
 'deflate_hex': '789c73cb2c2a2e51c84f5348ccc9d15128c9c82c06003c54065b'}

# Model

In [5]:
# Base checkpoint to fine-tune.
model_name = "facebook/opt-350m"

# Load the causal LM with 4-bit NF4 quantization (float16 compute, no double
# quantization) and let `device_map="auto"` decide device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=False,
        bnb_4bit_compute_dtype="float16",
        load_in_4bit=True,
    ),
    device_map="auto",
)

# Config tweaks applied right after loading.
model.config.pretraining_tp = 1
model.config.use_cache = False

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Load the tokenizer that matches `model_name`.
# `trust_remote_code=True` was dropped: the model itself is loaded without it, so the
# checkpoint does not need custom Hub code, and the flag would allow arbitrary
# repository code to execute in this kernel.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad with the end-of-sequence token, on the right-hand side of each sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [7]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Iterates ``model.named_parameters()``, sums the element count of every
    parameter and of those with ``requires_grad`` set, and prints both totals
    together with the trainable percentage. Nothing is returned.
    """
    params = [p for _, p in model.named_parameters()]
    total = sum(p.numel() for p in params)
    trainable = sum(p.numel() for p in params if p.requires_grad)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )

In [8]:
# Add LoRA adapters to model
# Adapter settings: rank 16, alpha 16, 10% dropout, no bias training, applied to the
# attention projection modules listed in `target_modules`.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Prepare the quantized model for k-bit training, then wrap it with the adapters.
model = prepare_model_for_kbit_training(model)
lora_model = get_peft_model(model, config)
lora_model.config.use_cache = False

print_trainable_parameters(lora_model)

trainable params: 2359296 || all params: 182036480 || trainable%: 1.2960567024807335


In [9]:
def formatting_func_with_response(example):
  """
  Render one dataset row as a full training prompt.

  `example` must be a mapping with the keys ``text_hex`` (the input) and
  ``deflate_hex`` (the expected output). The returned string contains the
  instruction, the input under "### Input:", the output directly after
  "### Response:", and a closing "</s>" marker.
  """
  # The first literal now ends with a space: the adjacent literals are concatenated,
  # and without it the prompt read "...used before.Write a response...".
  return ("Take a breath and think step by step. Below is an input text that must be encoded in a novel way that has never been used before. "
            "Write a response that appropriately completes the request.\n\n"
            "### Input:\n{text_hex}\n\n### Response:{deflate_hex}\n</s>").format_map(example)

In [10]:

batch_size=8  # batch size
gradient_accumulation_steps=1  # gradient acc. steps
num_train_epochs=3


# Training hyper-parameters: fp16 mixed precision, cosine schedule with 10% warm-up,
# gradient checkpointing, and evaluation + checkpointing once per epoch.
training_args = TrainingArguments(
    output_dir="./output/",
    #report_to="wandb",  # this tells the Trainer to log the metrics to W&B
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,  # evaluation uses half the training batch size
    fp16=True,
    bf16=False,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio = 0.1,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    evaluation_strategy="epoch",
    num_train_epochs=num_train_epochs,
    # logging strategies
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="epoch", # saving is done at the end of each epoch
)


trainer = SFTTrainer(
    args=training_args,
    # Hand over the PeftModel already built with get_peft_model(). Passing the bare
    # `model` together with `peft_config` made the trainer prepare and adapt a model
    # that had already been LoRA-injected in place, i.e. the adapters were set up twice.
    model=lora_model,
    train_dataset=ds_splits['train'],
    eval_dataset=ds_splits['valid'],
    formatting_func=formatting_func_with_response,
    tokenizer=tokenizer,
    max_seq_length=300,
    packing=True
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Rebind `lora_model` to the object returned by merge_and_unload().
# NOTE(review): presumably this folds the LoRA weights into the base model and drops
# the adapter wrapper — confirm against the PEFT docs. After this line the name
# `lora_model` no longer describes what it holds.
lora_model = lora_model.merge_and_unload()

# Test SFTTrainer

In [None]:
# One hand-picked example: hex-encoded input text and the output we hope to see.
# NOTE: `input` shadows the Python builtin; the name is kept in case other cells read it.
input = "5468697320697320616d617a696e672d6c6f6f6b696e67206d6f766965"
target = "789c0bc9c82c5600a2c4dcc4aaccbc74dd9cfcfc6c20ad909b5f96990a00a1320afc"

# Build the prompt from the training template so the model sees exactly the layout it
# was fine-tuned on, cut right after the response marker. The earlier ad-hoc prompt
# ("Give me your best shot ... ###Answer:") never appeared in the training data.
response_marker = "### Response:"
prompt = formatting_func_with_response({"text_hex": input, "deflate_hex": ""})
prompt = prompt[: prompt.index(response_marker) + len(response_marker)]

batch = tokenizer(prompt, return_tensors='pt').to("cuda")

# Switch to inference mode so dropout layers are inactive while sampling.
model.eval()

# torch.autocast replaces the deprecated torch.cuda.amp.autocast() context manager.
with torch.autocast(device_type="cuda"):
  output = model.generate(
      **batch,
      max_new_tokens=100,
      top_p=1.0,
      top_k=30,
      temperature=1.0,
      do_sample=True,
  )

# The decoded text contains the prompt followed by the generated continuation.
for seq in output:
  print(tokenizer.decode(seq, skip_special_tokens=True), "\n")

print(f"###Target:{target}")

N3q8ryccAAQsJ4nazQAAAAAAAAAUAAAAAAAAAERiyQHgAFgAU10AJ5mABjO28vX8oV2n3Q/j3+6ZzN3nF1h2Xh5bWwLs4x+f/Dt0H5x3k5sY4aZjL1FZ2vY1bxSZ3h6q7hv7cZj7XuY/aUe/qEZ/8uQN8hk/4r0h+Jj+AuXQQvKxwAA4AB6AGpdAACBMweuD9NPX71Au5RkHH7rYmP2uZeSKt2d+G+S4DsKQQEiO0sLnQe9Dc0f1+8hXZWJjXBhEtD8f6X2+gZJz8QX4f0fQQrSgVkK0Q/7bX9yvXnx2FpNj5dH1n4j+W/gFc3nW3/4mw1gPfM+AAAAFwZbAQlyAAcLAQ

N3q8ryccAAQ37taQ6gAAAAAAAAAUAAAAAAAAABYHCbbgAHUAcF0AJIgKRlMq7IiH+reoV8P4Xq3LEPcxvUdTlfDjtQywLESHvwD4mJC43UY+sxdTHXq9UmtgozbdddMLqBNz2FDwSZzZSrnNin1cIlgTnm+67wt1nX7bVWrx1IcYBuK65zwehttG0vR6AW/tBlPogFHbAADgAHoAal0AAIEzB64P1MWUwUIQ+iJr+bpMmobtlEle4I7GBUx8KuDBj+wHyaCP2r74gOhBgxyUH14lAuNpJD1Vjx07/1mudj+hkswkfAKgQRlUsMdZ5sWZnTc8vTgx1+3n8cvlr3u/CMXCOfyLwkQAAAAXBngBCXIABwsBAAEhIQEYDHsAAA==