In [None]:
# Install the fine-tuning stack. peft, trl, accelerate and transformers are pinned;
# bitsandbytes and datasets are not (TODO: pin them too for reproducible runs).
! pip install -q -U bitsandbytes peft==0.8.2 trl==0.7.10 accelerate==0.27.1 datasets  transformers==4.38.0

In [None]:
import os
import torch
import transformers
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GemmaTokenizer
)

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): later cells also pass HF_TOKEN explicitly to from_pretrained,
# so this login may be redundant — confirm before removing.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Copy the "HF_TOKEN" Colab secret into the process environment so later cells can read it.
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

In [None]:
# Hub identifier of the base checkpoint that is loaded and fine-tuned in this notebook.
model_id = "google/gemma-2b"

In [None]:
# Quantization settings: 4-bit NF4 weights, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Read the access token once and reuse it for both downloads.
hf_token = os.environ["HF_TOKEN"]

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)

# Load the base model with the quantization config above; the whole model is
# mapped to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    token=hf_token,
)

In [None]:
# Baseline prompt, encoded before any training has been run.
text = "I need python regex for remove links in a text? this regex match only links"
device = "cuda:0"

# Tokenize to PyTorch tensors, then move the encoding to the target device.
encoded_prompt = tokenizer(text, return_tensors="pt")
inputs = encoded_prompt.to(device)
print(inputs)

In [None]:
# Generate at most 50 new tokens, then decode and print the first returned sequence.
outputs = model.generate(**inputs, max_new_tokens=50)
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)

In [None]:
# Turn off the Weights & Biases integration for the Trainer.
# WANDB_DISABLED only takes effect with a truthy string; "false" would leave
# W&B reporting on, and this notebook never logs in to W&B.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# LoRA adapter configuration: rank-8 adapters on the listed projection modules.
lora_config = LoraConfig(
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Must be spelled exactly "CAUSAL_LM": an unrecognised task type is not
    # mapped to PEFT's causal-LM model wrapper.
    task_type="CAUSAL_LM",
)

In [None]:
# Fetch the quotes dataset and, in batched mode, add the tokenizer's outputs
# for the "quote" column.
data = load_dataset("Abirate/english_quotes").map(
    lambda batch: tokenizer(batch["quote"]),
    batched=True,
)

In [None]:
def formatting_func(example):
  """Build one "Quote : ...\\nAuthor : ..." training string per dataset row.

  SFTTrainer applies this function to *batches*, where ``example["quote"]``
  and ``example["author"]`` are lists. Formatting those lists directly would
  squash a whole batch into a single string, so each row is formatted on its
  own. A single un-batched row (plain values) is still accepted.

  Args:
    example: Mapping with "quote" and "author" entries; each entry is either
      one value or a list/tuple of values of equal length.

  Returns:
    A list of formatted strings, one per row.
  """
  quotes, authors = example["quote"], example["author"]
  if not isinstance(quotes, (list, tuple)):
    # Single row: wrap so the comprehension below handles both shapes.
    quotes, authors = [quotes], [authors]
  return [
    f"Quote : {quote}\nAuthor : {author}"
    for quote, author in zip(quotes, authors)
  ]

In [None]:
# Optimisation hyper-parameters for the short fine-tuning run:
# effective batch of 4 (1 per device x 4 accumulation steps), 100 optimizer steps.
# NOTE(review): fp16 is enabled here while the quantization config computes in
# bfloat16 — confirm this mix is intended for the target GPU.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=100,
    warmup_steps=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=1,
)

# Supervised fine-tuning trainer: LoRA adapters on the quantized model,
# trained on the "train" split with texts produced by formatting_func.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    formatting_func=formatting_func,
    peft_config=lora_config,
)

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

In [None]:
# First post-training prompt: a quote in the same "Quote : ..." layout used for training.
text = "Quote : Be yourself; everyone else is already taken."
device = "cuda:0"

prompt_encoding = tokenizer(text, return_tensors="pt")
inputs = prompt_encoding.to(device)
print(inputs)

In [None]:
# Continue the prompt by at most 20 new tokens and print the decoded first sequence.
outputs = model.generate(**inputs, max_new_tokens=20)
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_text)

In [None]:
# Second post-training prompt, again in the "Quote : ..." layout.
text = "Quote : A room without books is like a body without a soul"
device = "cuda:0"

prompt_encoding = tokenizer(text, return_tensors="pt")
inputs = prompt_encoding.to(device)
print(inputs)

In [None]:
# Continue the prompt by at most 20 new tokens and print the decoded first sequence.
outputs = model.generate(**inputs, max_new_tokens=20)
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_text)