In [None]:
%%capture
!pip install peft
!pip install bitsandbytes
!pip install lightning
!pip install langdetect

In [None]:
import os
# Terminate the kernel process immediately with exit status 0 (`00` is the integer 0).
# NOTE(review): presumably here to force a kernel restart so the packages installed
# in the previous cell are picked up — confirm. It also ends a "Run All" at this cell.
os._exit(00)

In [4]:
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.optim import AdamW
import lightning as L
import gc
from lightning.pytorch.callbacks import ModelCheckpoint
import wandb
from pytorch_lightning.loggers import WandbLogger
from langdetect import detect


In [5]:

# JSONL file backing each split of the openassistant-guanaco dataset on the Hub.
splits = {'train': 'openassistant_best_replies_train.jsonl', 'test': 'openassistant_best_replies_eval.jsonl'}
dataset_root = "hf://datasets/timdettmers/openassistant-guanaco/"

# Read both splits (train first, then test) as line-delimited JSON.
df_train, df_test = (
    pd.read_json(dataset_root + splits[split_name], lines=True)
    for split_name in ("train", "test")
)

# Show the raw text of the first training sample as a sanity check.
first_row = df_train.iloc[0]
print("Text originale:")
print(first_row["text"])

Text originale:
### Human: Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.### Assistant: "Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.

Recent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face low wages, limited benefits, and reduced bargainin

In [6]:
# Remove non-English text

def label_language(text):
    """Return the language code detected for ``text``, or ``"error"`` on failure.

    Args:
        text: The sample to classify. Values that are not strings, or that are
            empty / whitespace-only, are reported as ``"error"`` without calling
            the detector.

    Returns:
        Whatever ``langdetect.detect`` returns for ``text``, or the string
        ``"error"`` when the input is unusable or the detector raises.
    """
    # Nothing to classify: skip the detector call entirely.
    if not isinstance(text, str) or not text.strip():
        return "error"
    try:
        return detect(text)
    except Exception:
        # Only detector failures are mapped to "error". A bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit, making a long `.apply` impossible
        # to interrupt.
        return "error"
    
# Tag every sample with its detected language (train first, then test) ...
df_train = df_train.assign(lang=df_train["text"].apply(label_language))
df_test = df_test.assign(lang=df_test["text"].apply(label_language))

# ... and keep only the rows labelled as English.
df_train = df_train.loc[df_train["lang"].eq("en")]
df_test = df_test.loc[df_test["lang"].eq("en")]


In [7]:
# Drop the helper 'lang' column. errors="ignore" keeps this cell re-runnable once the
# column is already gone (a plain drop raises KeyError on the second run).
df_train = df_train.drop(columns=["lang"], errors="ignore")
# The language filter leaves a non-contiguous index; preserve_index=False stops it
# from being carried into the Dataset as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(df_train, preserve_index=False)


In [18]:
# Checkpoint to fine-tune; the tokenizer is loaded from the same repo id.
base_model_id = "PY007/TinyLlama-1.1B-Chat-v0.1"

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def collate_fn(batch):
    """Tokenize a batch of samples and build labels with the prompt part overwritten.

    Args:
        batch: Sequence of samples, each indexable by ``'text'``.

    Returns:
        ``(inputs, labels)``, both moved to the notebook-level ``device``:
        ``inputs`` is a dict of the tokenizer's tensors (texts tokenized together,
        padded to the longest sample, truncated at 400 tokens); ``labels`` is a copy
        of ``input_ids`` where, for every sample containing ``"### Assistant:"``,
        the leading positions are overwritten with ``tokenizer.pad_token_id``. The
        number of overwritten positions is the token count of the text that
        precedes the marker, tokenized on its own (capped at 450 tokens).

    NOTE(review): the masked positions carry the pad token id, not -100, and
    ``F.cross_entropy`` skips only its ``ignore_index`` (-100 by default), so the
    code computing the loss has to account for this marker value.
    """
    texts = [sample['text'] for sample in batch]

    # Tokenize the whole batch in one call.
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=400,
    )

    # Labels start out as a copy of the input ids.
    labels = inputs.input_ids.clone()
    mask_value = tokenizer.pad_token_id

    # Overwrite everything that precedes the assistant marker, sample by sample.
    for row, text in enumerate(texts):
        marker_pos = text.find("### Assistant:")
        if marker_pos == -1:
            # No marker in this sample: its labels are left untouched.
            continue
        # Token count of the text before the marker gives the mask boundary.
        prompt_ids = tokenizer(
            text[:marker_pos],
            truncation=True,
            max_length=450,
            return_tensors="pt",
        )["input_ids"]
        labels[row, :prompt_ids.size(1)] = mask_value

    # Move every tensor to the target device.
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    return inputs, labels.to(device)


In [19]:
# Run collate_fn on a single hand-written sample and inspect what it produces.
text_test = "### Human: What is AI? ### Assistant: Ai is me"
inputs, labels = collate_fn([{"text": text_test}])

print("######## INPUT ########")
print(inputs)
print(tokenizer.decode(inputs["input_ids"][0]))

print("######## LABELS ########")
print(labels)

# Decode only the label ids that were not overwritten with the pad token id.
visible_ids = [tok for tok in labels[0].tolist() if tok != tokenizer.pad_token_id]
decoded_labels = tokenizer.decode(visible_ids)
print("######## DECODED LABELS ########")
print(decoded_labels)


######## INPUT ########
{'input_ids': tensor([[    1,   835, 12968, 29901,  1724,   338,   319, 29902, 29973,   835,
          4007, 22137, 29901,   319, 29875,   338,   592]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
<s> ### Human: What is AI? ### Assistant: Ai is me
######## LABELS ########
tensor([[    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
          4007, 22137, 29901,   319, 29875,   338,   592]])
######## DECODED LABELS ########
Assistant: Ai is me


In [None]:

# Per-step batch size, shared by the wandb run config and the DataLoader so the
# logged hyperparameters describe what actually runs (the config used to say 8
# while the DataLoader used 10).
train_batch_size = 10

# ####################################
# STEP 1 Experiment tracking
# ####################################
wandb.init(
    project="anime_Lama",
    name="anime_lama_1",
    config={
        "learning_rate": 1e-4,
        "batch_size": train_batch_size,
        "epochs": 1
    }
)

# ####################################
# STEP 2 Quantization Configuration
# And Model and Tokenizer Loading
# ####################################

# 4-bit NF4 weights with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config
)

model.gradient_checkpointing_enable()


# ####################################
# STEP 3  LoRA
# ####################################
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


# Prepare the quantized model for training, then wrap it with the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# ####################################
# STEP 4  Dataset
# ####################################


train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=train_batch_size,
    num_workers=0
)



# ####################################
# STEP 5 Lightning Wrapper
# ####################################

class LightningWrapper(L.LightningModule):
    """Lightning module that fine-tunes a causal LM with a next-token loss.

    Args:
        model: Causal language model; called as ``model(**inputs)`` and expected to
            return an object exposing ``.logits``.
        lr: Learning rate handed to AdamW.
        mask_token_id: Label value marking positions to exclude from the loss.
            Defaults to the notebook-level ``tokenizer.pad_token_id``, which is the
            value ``collate_fn`` writes over the prompt and which also fills padding.
    """

    def __init__(self, model, lr=1e-4, mask_token_id=None):
        super().__init__()
        self.model = model
        self.lr = lr
        # Resolved once, up front, so a missing tokenizer fails here and not mid-epoch.
        self.mask_token_id = tokenizer.pad_token_id if mask_token_id is None else mask_token_id

    def configure_optimizers(self):
        """Return an AdamW optimizer over this module's parameters."""
        return AdamW(self.parameters(), lr=self.lr)

    def training_step(self, batch, batch_idx):
        """Compute the next-token loss for one ``(inputs, labels)`` batch and log it.

        Returns:
            Mean cross-entropy over the label positions that are not masked.
        """
        inputs, labels = batch
        outputs = self.model(**inputs)

        # Shift so that the logits at position t are scored against token t + 1.
        logits = outputs.logits[..., :-1, :].contiguous()
        labels = labels[..., 1:].contiguous()

        # collate_fn marks prompt (and padding) positions with the pad token id, but
        # F.cross_entropy only skips its `ignore_index` (-100 by default). Without
        # this mapping those positions are trained on as ordinary targets. Labels
        # that already use -100 pass through unchanged.
        # NOTE: the pad token is the EOS token in this notebook, so genuine EOS
        # labels (if any) are excluded as well.
        labels = labels.masked_fill(labels == self.mask_token_id, -100)

        # Sum over the supervised tokens and divide by their count, clamped to 1 so
        # that a batch with no supervised token yields 0 instead of NaN.
        summed_loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            ignore_index=-100,
            reduction="sum",
        )
        supervised_tokens = (labels != -100).sum().clamp(min=1)
        loss = summed_loss / supervised_tokens
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # Log loss to wandb
        wandb.log({"train_loss": loss.item()})

        return loss

# Take the logger from the same `lightning` namespace as the Trainer and the
# ModelCheckpoint callback; mixing in `pytorch_lightning` classes is discouraged
# by Lightning.
from lightning.pytorch.loggers import WandbLogger

lightning_model = LightningWrapper(model)


# ####################################
# STEP 6: Trainer + Train
# ####################################

# Keep every checkpoint (save_top_k=-1) plus a `last` one; file names carry the
# epoch and the logged train_loss.
checkpoint_callback = ModelCheckpoint(
    dirpath="./checkpoints",
    filename="finetuned_model-{epoch:02d}-{train_loss:.2f}",
    save_top_k=-1,
    save_last=True,
    monitor="train_loss",
    mode="min"
)


# NOTE(review): a wandb run is already started by wandb.init(...) earlier in this
# cell; the logger presumably attaches to that run, in which case `project` here
# has no effect — confirm.
wandb_logger = WandbLogger(
    project="finetuned_en_model",
    log_model=True
)

trainer = L.Trainer(
    logger=wandb_logger,  # attach the wandb logger
    accumulate_grad_batches=8,
    precision="bf16-mixed",
    gradient_clip_val=1.0,
    max_epochs=1,
    callbacks=[checkpoint_callback],
)


# Free whatever host/GPU memory can be reclaimed before training starts.
gc.collect()
torch.cuda.empty_cache()
trainer.fit(lightning_model, train_dataloaders=train_loader)


# ####################################
# STEP 7: Save the Fine-tuned Model
# ####################################

model.save_pretrained("./finetuned_en_model")
tokenizer.save_pretrained("./finetuned_en_model")
wandb.finish()