# Fine-tuning test with masked labels

In [1]:
%%capture
!pip install peft
!pip install bitsandbytes
!pip install lightning
!pip install langdetect

In [2]:
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.optim import AdamW
import lightning as L
import gc
from lightning.pytorch.callbacks import ModelCheckpoint
import wandb
from pytorch_lightning.loggers import WandbLogger
from langdetect import detect

In [3]:
SPECIAL_TOKEN = False

In [4]:

splits = {'train': 'openassistant_best_replies_train.jsonl', 'test': 'openassistant_best_replies_eval.jsonl'}
df_train = pd.read_json("hf://datasets/timdettmers/openassistant-guanaco/" + splits["train"], lines=True)
df_test = pd.read_json("hf://datasets/timdettmers/openassistant-guanaco/" + splits["test"], lines=True)

first_row = df_train.iloc[0]  
print("Text originale:")
print(first_row["text"])

Text originale:
### Human: Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.### Assistant: "Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.

Recent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face low wages, limited benefits, and reduced bargainin

In [5]:

def label_language(text):
    try:
        return detect(text)
    except:
        return "error"
    
df_train["lang"] = df_train["text"].apply(label_language)
df_test["lang"] = df_test["text"].apply(label_language)

df_train = df_train[df_train["lang"] == "en"]
df_test = df_test[df_test["lang"] == "en"]
df_train = df_train.drop(columns=["lang"])

print(len(df_train))
print(len(df_test))
print(df_train.iloc[0])


3538
191
text    ### Human: Can you write a short introduction ...
Name: 0, dtype: object


In [6]:
train_dataset = Dataset.from_pandas(df_train)
print(":)")

:)


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

import re
base_model_id = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

if SPECIAL_TOKEN:
    special_tokens_dict = {"additional_special_tokens": ["### Assistant: ", "### Human: "]}
    tokenizer.add_special_tokens(special_tokens_dict)

MAX_LEN = 300

def collate_fn(batch):
    inputs = tokenizer(
        [sample["text"] for sample in batch],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        add_special_tokens=False
    )

    labels = inputs["input_ids"].clone()
    labels_mask = torch.full_like(labels, -100)
    for i, sample in enumerate(batch):
        text = sample["text"]
        for match in re.finditer(r"### Assistant:", text):
            assistant_start = match.end()  
            
            next_human_start = text.find("### Human:", assistant_start)
            if next_human_start == -1:
                next_human_start = len(text)

            start_tokens = len(
                tokenizer(
                    text[:assistant_start],
                    truncation=True,
                    max_length=MAX_LEN,
                    add_special_tokens=False
                )["input_ids"]
            )

            end_tokens = len(
                tokenizer(
                    text[:next_human_start],
                    truncation=True,
                    max_length=MAX_LEN,
                    add_special_tokens=False
                )["input_ids"]
            )

            labels_mask[i, start_tokens:end_tokens] = labels[i, start_tokens:end_tokens]

    labels = labels_mask

    inputs = {k: v.to(device) for k, v in inputs.items()}
    labels = labels.to(device)

    return inputs, labels


cuda


tokenizer_config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

In [8]:
text_test = (
    "### Human: What is AI?### Assistant: I'm an arificial intelligence"
    "### Human: Are you sure?### Assistant: Yes"
)#text_test = '### Human: Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.### Assistant: "Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.'
inputs, labels = collate_fn([{"text": text_test}])
print("######## INPUT ########")
print(inputs)
print(tokenizer.decode(inputs["input_ids"][0]))
print("######## LABELS ########")
print(labels)

def decode_labels(token_ids, tokenizer):
    valid_tokens = [tid for tid in token_ids if tid != -100]
    if len(valid_tokens) == 0:
        return ""
    return tokenizer.decode(valid_tokens, skip_special_tokens=False)

decoded = decode_labels(labels[0].tolist(), tokenizer)

print("######## DECODED LABELS ########")
print(decoded)


######## INPUT ########
{'input_ids': tensor([[  835, 12968, 29901,  1724,   338,   319, 29902, 29973,  2277, 29937,
          4007, 22137, 29901,   306, 29915, 29885,   385,   564,   928,   616,
         21082,  2277, 29937, 12968, 29901,  4683,   366,  1854, 29973,  2277,
         29937,  4007, 22137, 29901,  3869]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
### Human: What is AI?### Assistant: I'm an arificial intelligence### Human: Are you sure?### Assistant: Yes
######## LABELS ########
tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,   306, 29915, 29885,   385,   564,   928,   616,
         21082,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  3869]], device='cuda:0')
######## DECODED LABELS ########
I'm an arificial intelligence Yes

In [9]:
# ####################################
# STEP 2 Quantization Configuration
# And Model and Tokenizer Loading
# ####################################

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config
)

model.gradient_checkpointing_enable()
model.resize_token_embeddings(len(tokenizer))


# ####################################
# STEP 3  LoRa
# ####################################
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# ####################################
# STEP 3  Dataset
# ####################################

train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=4,
    num_workers=0
)

# ####################################
# STEP 5 Lightning Wrapper
# ####################################

class LightningWrapper(L.LightningModule):
    def __init__(self, model, tokenizer, lr=3e-5):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer  # Salva il tokenizer come attributo
        self.lr = lr

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr)

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.model(**inputs)

        logits = outputs.logits[..., :-1, :].contiguous()
        labels = labels[..., 1:].contiguous()

        loss = F.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            ignore_index=-100
        )
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

lightning_model = LightningWrapper(model, tokenizer)


# ####################################
# STEP 6: Trainer + Train
# ####################################

checkpoint_callback = ModelCheckpoint(
    dirpath="./checkpoints",
    filename="finetuned_model-{epoch:02d}-{train_loss:.2f}",
    save_top_k=-1,
    save_last=True,
    monitor="train_loss",
    mode="min"
)

trainer = L.Trainer(
    accumulate_grad_batches=16,
    precision="bf16-mixed",
    gradient_clip_val=1.0,
    max_epochs=1,
    callbacks=[checkpoint_callback],
)


gc.collect()
torch.cuda.empty_cache()
trainer.fit(lightning_model, train_dataloaders=train_loader)


# ####################################
# STEP 6: Save the Fine-tuned Model
# ####################################

model.save_pretrained("./finetuned_en_model")
tokenizer.save_pretrained("./finetuned_en_model")

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                 | Params | Mode 
-------------------------------------------------------
0 | model | PeftModelForCausalLM | 616 M  | train
-------------------------------------------------------
1.1 M     Trainable params
615 M     Non-trainable params
616 M     Total params
2,466.947 Total estimated model params size (MB)
442       Modules in train mode
315       Modules in eval mode
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

('./finetuned_en_model/tokenizer_config.json',
 './finetuned_en_model/special_tokens_map.json',
 './finetuned_en_model/tokenizer.model',
 './finetuned_en_model/added_tokens.json',
 './finetuned_en_model/tokenizer.json')

In [11]:
from peft import PeftModel

# ####################################
# STEP 1 Load model
# ####################################


model_path = "/kaggle/working/finetuned_en_model"
base_model_id = "PY007/TinyLlama-1.1B-Chat-v0.1"

# if False:
#     special_tokens_dict = {"additional_special_tokens": ["### Assistant: ", "### Human: "]}
#     tokenizer.add_special_tokens(special_tokens_dict)
#     # Ridimensiona gli embedding del modello base
#     base_model = AutoModelForCausalLM.from_pretrained(
#         base_model_id,
#         torch_dtype=torch.float32,
#         device_map={"": device} 
#     )
        
#     base_model.resize_token_embeddings(len(tokenizer))
    
#     model = PeftModel.from_pretrained(base_model, model_path)
#     model = model.to(device)
#     model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float32,
    device_map=None
)

model = PeftModel.from_pretrained(base_model, model_path)
model.to(device)
model.eval()


# ####################################
# STEP 2 prepare prompt
# ####################################

query = "List the best programming languages for AI"
prompt = f"### Human: {query} ### Assistant: "
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)



# ####################################
# STEP 3 generate output
# ####################################

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=200, 
        num_beams=4,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

response = tokenizer.decode(output[0], skip_special_tokens=True)

generated_response = response.split("### Assistant: ")[-1].strip()

print(generated_response)


1. Python
2. Rust
3. Java
4. C++
5. JavaScript
6. Scala
7. Go
8. AgileScript
9. Haxe
10. D

These are some of the most popular languages used for Artificial Intelligence, but it is important to note that the choice of which language to use will depend on the specific needs and goals of your project. It's always a good idea to research and evaluate the different options before making a decision, to ensure that you choose the language that best suits your needs.### human: What are the pros and cons of each of these languages?Contrary to popular belief, there is no single "best" programming language when it comes to artificial intelligence. Each language has its own unique strengths and weaknesses that should be considered when choosing one. Some pros of certain languages include but are not limited to:
- High level


# MAC

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

MEMORY_SAVING = False

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

model_path = "Models/model_finetuned_en"
base_model_id = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float32,
    device_map=None
)

model = PeftModel.from_pretrained(base_model, model_path)
model.to(device)
model.eval()

prev_prompt = "" 

while True:
    user_input = input("You: ").strip()
    print("you:", user_input)
    
    if user_input.lower() == "esc":
        print("END :)")
        break
    
    prompt = f"### Human: {user_input} ### Assistant:"
    prompt = prev_prompt + prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=200,
            num_beams=4,
            no_repeat_ngram_size=2
        )
    
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    response = response.split("### Assistant:")[-1].strip()
    response = response.split("###")[0].strip()
    
    if MEMORY_SAVING:
        prev_prompt += f"### Human: {user_input} ### Assistant: {response} ###"
    
    print(f"Bot: {response}")

#hello how are you?
#do you know python?
#How do I import TensorFlow into Python?
#I tried `import tensorflow as tf`, but I get the error: `ModuleNotFoundError: No module named 'tensorflow'`. What should I do?
#perfect, everything worked thank you very much. Bye

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
you: hello how are you?
Bot: Hello! I'm an AI created by LAION, so I don't experience emotions the way humans do. However, I am functioning properly and ready to assist you in any way I can. How can I help you today? Is there something you want to know? Or do you just have a question?⁣ ◡️¿Tienes alguna pregunta o inquietud? ¿Dónde puedo encontrar respuestas a ella? ou simplemente deseas saber más sobre una de mis funciónn?
you: do you know python?
Bot: Python is a popular programming language that is used for a variety of different purposes, including web development, data analysis, machine learning, and more. While it is not specifically mentioned in the question, it's likely that you are referring to Python, the Python interpreter and runtime, which is included with most Python distributions.

Python is an interpreted language, meaning that the code you write is executed when you run it, rather than being processed by a proces