Since our dataset serves more to help the model ‘work’ better, we decided to test our approach but on another dataset

# Run this for kaggle (missing library)
the reset of kernel is because some thimes we have problem with BitsAndBytesConfig 

In [1]:
%%capture
!pip install peft
!pip install bitsandbytes
!pip install lightning

In [None]:
import os
os._exit(00)

In [1]:
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.optim import AdamW
import lightning as L
import gc
from lightning.pytorch.callbacks import ModelCheckpoint
import wandb
from pytorch_lightning.loggers import WandbLogger


  from .autonotebook import tqdm as notebook_tqdm


# Step 1
We need to extract and prepare the dataset and build alpaca. <br>
So we need to extract some information that we then want to get from the model i.e. Release Date, Gender and Description

In [2]:

DATASET_PATH = "/kaggle/input/animedata/anime-dataset-2023.csv"



anime_df = pd.read_csv(DATASET_PATH)


def prepare_alpaca_format_strict(df):
    tot_len = 0
    examples = []
    for _, row in df.iterrows():
        instruction = "Describe this anime"
        input_text = row['Name']
        release_date = row['Aired']
        genres = row['Genres']
        synopsis = row['Synopsis']

        if pd.isnull(input_text) or pd.isnull(release_date) or pd.isnull(genres) or pd.isnull(synopsis):
            continue

        synopsis = synopsis.replace("\n", " ")

        output_text = (f"This anime was released on {release_date}.\n"
                       f"Its genres are {genres}.\n"
                       f"The description of this anime is: {synopsis}")

        tot_len += len(output_text)

        example = {
            "instruction": instruction,
            "input": input_text,
            "output": output_text
        }
        examples.append(example)

    print("mean len", tot_len/len(examples))
    return examples

alpaca_dataset_name_only = prepare_alpaca_format_strict(anime_df)
print(alpaca_dataset_name_only[0]["output"])

mean len 434.1848624774142
This anime was released on Apr 3, 1998 to Apr 24, 1999.
Its genres are Action, Award Winning, Sci-Fi.
The description of this anime is: Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.  Spike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, odd child Edward Wong Hau Pepelu Tivrusky IV, and Ein, a bioengineered Welsh Corgi.  While developing bonds and working to catch a colorful cast of criminals, the Bebop crew's lives are disrupted

# Train
To train the model, we did QLoRa as for part 2, masked the prompt part in the label and trained the model to understand the structure of the response. 
<br><br>
After training the loss did not change any more so we guessed that the model is not able to learn the descriptions of the cores. This is due to the fact that 1 the model is relatively small and obviously has not been trained to understand the content of the cores. This is why we decided to carry out one more step. 

In [3]:


wandb.init(
    project="anime_Lama", 
    name="anime_lama_1",
    config={
        "learning_rate": 1e-4,
        "batch_size": 8,
        "epochs": 1
    }
)


alpaca_data = alpaca_dataset_name_only

if torch.cuda.is_available():
    device = torch.device("cuda")  # Imposta il dispositivo su CUDA
    print(f"Using CUDA device: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")  # Usa la CPU se CUDA non è disponibile
    print("CUDA not available, using CPU.")
    
train_formatted = [f"### Instruction:\n{sample['instruction']}\n\n### Input:\n{sample['input']}\n\n### Response:\n{sample['output']}" for sample in alpaca_data]
train_dataset = Dataset.from_dict({"text": train_formatted})

# ####################################
# STEP 2 Quantization Configuration
# And Model and Tokenizer Loading
# ####################################

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

base_model_id = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config
)

model.gradient_checkpointing_enable()

# ####################################
# STEP 3  LoRa
# ####################################
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# ####################################
# STEP 4: DataLoader
# ####################################



def collate_fn(batch):
    # Tokenizza il testo
    inputs = tokenizer(
        [sample['text'] for sample in batch], 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=400
    )
    # Clona input_ids per usarli come etichette
    labels = inputs.input_ids.clone()
    
    # Applica maschera per ignorare la parte di istruzione e input
    for i, sample in enumerate(batch):
        text = sample['text']
        # Trova l'indice di inizio della risposta
        response_start = text.find("### Response:")
        if response_start != -1:
            response_start_token_idx = tokenizer(text[:response_start], truncation=True, max_length=450, return_tensors="pt")["input_ids"].size(1)
            labels[i, :response_start_token_idx] = -100  # Maschera l'istruzione e l'input
    
    # Trasferisci i tensori sul dispositivo
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    labels = labels.to(device)
    return inputs, labels


train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=10,
    num_workers=0
)

# AT this point with an input 

# Cowboy Bebop
# ### Response:
# This anime was released on Apr 3, 1998 to Apr 24, 1999.
# Its genres are Action, Award Winning, Sci-Fi.
# The description of this ...

# We have a label that is

# </s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>### Response:
# This anime was released on Apr 3, 1998 to Apr 24, 1999.
# Its genres are Action, Award Winning, Sci-Fi.
# The description of this ...


# ####################################
# STEP 5 Lightning Wrapper
# ####################################

class LightningWrapper(L.LightningModule):
    def __init__(self, model, lr=1e-4):
        super().__init__()
        self.model = model
        self.lr = lr

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr)

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.model(**inputs)

        # Shift logits and labels
        logits = outputs.logits[..., :-1, :].contiguous()
        labels = labels[..., 1:].contiguous()

        # Compute LM loss
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        # Log loss to wandb
        wandb.log({"train_loss": loss.item()})

        return loss

lightning_model = LightningWrapper(model)

# ####################################
# STEP 6: Trainer + Train
# ####################################

checkpoint_callback = ModelCheckpoint(
    dirpath="./checkpoints",
    filename="finetuned_model-{epoch:02d}-{train_loss:.2f}",
    save_top_k=-1,
    save_last=True,
    monitor="train_loss",
    mode="min"
)


# Configura il logger di wandb
wandb_logger = WandbLogger(
    project="nome_progetto",  # Nome del progetto wandb
    log_model=True  # Salva i checkpoint del modello su wandb
)

trainer = L.Trainer(
    logger=wandb_logger,  # Aggiungi il logger
    accumulate_grad_batches=8,
    precision="bf16-mixed",
    gradient_clip_val=1.0,
    max_epochs=1,
    callbacks=[checkpoint_callback],
)


gc.collect()
torch.cuda.empty_cache()
trainer.fit(lightning_model, train_dataloaders=train_loader)


# ####################################
# STEP 6: Save the Fine-tuned Model
# ####################################

model.save_pretrained("./finetuned_qlora_model")
tokenizer.save_pretrained("./finetuned_qlora_model")
wandb.finish()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Using CUDA device: Tesla P100-PCIE-16GB


tokenizer_config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                 | Params | Mode 
-------------------------------------------------------
0 | model | PeftModelForCausalLM | 616 M  | train
-------------------------------------------------------
1.1 M     Trainable params
615 M     Non-trainable params
616 M     Total params
2,466.947 Total estimated model params size (MB)
442       Modules in train mode
315       Modules in eval mode
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLo

Training: |          | 0/? [00:00<?, ?it/s]

VBox(children=(Label(value='910.221 MB of 990.619 MB uploaded\r'), FloatProgress(value=0.9188401720308857, max…

0,1
epoch,▁▁▁▁▁▁▁
train_loss,█▅▅▄▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_epoch,▁
train_loss_step,▄▇▇▇█▁
trainer/global_step,▁▂▄▅▆██

0,1
epoch,0.0
train_loss,1.00491
train_loss_epoch,1.50064
train_loss_step,0.81289
trainer/global_step,311.0


# Train 2
We train the model with longer descriptions but only for famous anime, to see if the approach is able to teach something about anime

In [3]:

DATASET_PATH = "/kaggle/input/animedata/anime-dataset-2023.csv"
anime_df = pd.read_csv(DATASET_PATH)
anime_df = anime_df[anime_df['Score'] != '']
anime_df = anime_df[anime_df['Score'] != 'UNKNOWN']
anime_df = anime_df[anime_df['Score'].apply(lambda x: float(x) > 8.2)]
print(f"We use only a total of: {len(anime_df)} Anime to teach te model some information about anime")
alpaca_dataset_name_only = prepare_alpaca_format_strict(anime_df)


We use only a total of: 343 Anime to teach te model some information about anime
mean len 1009.1341107871721


In [4]:
wandb.init(
    project="anime_Lama", 
    name="anime_lama_2",  
    config={
        "learning_rate": 1e-4,
        "batch_size": 10,
        "epochs": 1
    }
)

# ####################################
# STEP 2: Configurazione del Dispositivo
# ####################################
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using CUDA device: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("CUDA not available, using CPU.")

# ####################################
# STEP 3: Caricamento Nuovo Dataset
# ####################################
alpaca_data = alpaca_dataset_name_only  # Nuovo dataset già pronto

train_formatted = [
    f"### Instruction:\n{sample['instruction']}\n\n### Input:\n{sample['input']}\n\n### Response:\n{sample['output']}"
    for sample in alpaca_data
]
train_dataset = Dataset.from_dict({"text": train_formatted})

# ####################################
# STEP 4: DataLoader
# ####################################
def collate_fn(batch):
    inputs = tokenizer(
        [sample['text'] for sample in batch], 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=800
    )
    labels = inputs.input_ids.clone()
    for i, sample in enumerate(batch):
        text = sample['text']
        response_start = text.find("### Response:")
        if response_start != -1:
            response_start_token_idx = tokenizer(
                text[:response_start], truncation=True, max_length=400, return_tensors="pt"
            )["input_ids"].size(1)
            labels[i, :response_start_token_idx] = -100
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    labels = labels.to(device)
    return inputs, labels

train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=5,
    num_workers=0
)

# ####################################
# STEP 5: Caricamento del Checkpoint Manualmente
# ####################################
import torch

base_model_id = "PY007/TinyLlama-1.1B-Chat-v0.1"


tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Caricamento del modello e setup originale
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Caricamento manuale dello stato dal checkpoint
checkpoint_path = "./checkpoints/last.ckpt"
checkpoint = torch.load(checkpoint_path, map_location="cuda")  # Carica il checkpoint
model.load_state_dict(checkpoint['state_dict'], strict=False)  # Carica lo stato del modello

# Wrappa il modello aggiornato in PyTorch Lightning
class LightningWrapper(L.LightningModule):
    def __init__(self, model, lr=1e-4):
        super().__init__()
        self.model = model
        self.lr = lr

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr)

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.model(**inputs)
        logits = outputs.logits[..., :-1, :].contiguous()
        labels = labels[..., 1:].contiguous()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        wandb.log({"train_loss": loss.item()})
        return loss

lightning_model = LightningWrapper(model)

# ####################################
# STEP 6: Trainer e Continuazione del Training
# ####################################
# checkpoint_callback = ModelCheckpoint(
#     dirpath="./checkpoints",
#     filename="finetuned_model-{epoch:02d}-{train_loss:.2f}",
#     save_top_k=-1,
#     save_last=True,
#     monitor="train_loss",
#     mode="min"
# )

wandb_logger = WandbLogger(
    project="anime_Lama", 
    name="anime_lama_continued", 
    log_model=True
)

trainer = L.Trainer(
    logger=wandb_logger,
    accumulate_grad_batches=8,
    precision="bf16-mixed",
    gradient_clip_val=1.0,
    max_epochs=40,
    # callbacks=[checkpoint_callback],
)

gc.collect()
torch.cuda.empty_cache()

# Continua il training dal checkpoint
trainer.fit(lightning_model, train_dataloaders=train_loader)

# ####################################
# STEP 7: Salvataggio Finale del Modello
# ####################################
model.save_pretrained("./finetuned_qlora_model_updated")
tokenizer.save_pretrained("./finetuned_qlora_model_updated")
wandb.finish()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Using CUDA device: Tesla P100-PCIE-16GB


tokenizer_config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

  checkpoint = torch.load(checkpoint_path, map_location="cuda")  # Carica il checkpoint
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                 | Params | Mode 
-------------------------------------------------------
0 | model | PeftModelForCausalLM | 616 M  | train
-------------------------------------------------------
1.1 M     Trainable params
615 M     Non-trainable params
616 M     Total params
2,466.947 Total estimated model params size (MB)
442       Modules in train mode
315       Modules in eval mode
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consid

Training: |          | 0/? [00:00<?, ?it/s]

VBox(children=(Label(value='132.425 MB of 990.620 MB uploaded\r'), FloatProgress(value=0.13367852264420008, ma…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▅▃▃▂▂▃▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▁▂▂▂▂▁▂▂▂▂▂▁▂▁▁▂▁▁
train_loss_epoch,█▇▆▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▆▆▂█▁▁▄
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
epoch,39.0
train_loss,1.77236
train_loss_epoch,1.6586
train_loss_step,1.66346
trainer/global_step,359.0


# Testing
On the mac we can test the trained models and see how they perform and see if there are any improvements
<br>
Spoiler: yes, the trained model for the second time was able to understand more things although the description is still inaccurate. We saw that it occasionally recognises characters and themes but it would need much more training

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
torch.cuda.empty_cache() 
torch.mps.empty_cache() 

device = torch.device("mps") 

model_path = "Models/Anime_step_1"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

def generate_response(instruction, query, model, tokenizer, max_new_tokens=150, temperature=0.7, top_p=0.95):
    # Build prompt
    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{query}\n\n### Response:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Output:
    # for this we use the sampling method this to have non deterministic results
    # we have (whit current parameter) more real result and less random result
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens, 
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,             
        temperature=temperature,   
        top_p=top_p,                
        repetition_penalty=1.2,     
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("### Response:")[-1].strip()

instruction = "Describe this anime"
query = "Neon Genesis Evangelion"

response = generate_response(instruction, query, model, tokenizer)
print("out:", response)

out: This anime was released on Apr 4, 1997 to Oct 25, 1997.
Its genres are Action, Sci-Fi.
The description of this anime is: In the year 2039, a group of scientists at Japan's Ibuki Research Center discover a mysterious device that can create sentient life in any form. The scientist who created it, Shinji Ikada, is given a mission by the government to destroy humanity and create a new world dominated by machines as their own country begins to fall apart. This mission will have dire consequences for those who oppose him, including his former students Kōsuke


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
torch.cuda.empty_cache() 
torch.mps.empty_cache() 

device = torch.device("mps") 

model_path = "Models/Anime_Model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

def generate_response(instruction, query, model, tokenizer, max_new_tokens=100, temperature=0.7, top_p=0.95):
    # Build prompt
    prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{query}\n\n### Response:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Output:
    # for this we use the sampling method this to have non deterministic results
    # we have (whit current parameter) more real result and less random result
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens, 
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,             
        temperature=temperature,   
        top_p=top_p,                
        repetition_penalty=1.2,     
    )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("### Response:")[-1].strip()

instruction = "Describe this anime"
query = "Neon Genesis Evangelion"

response = generate_response(instruction, query, model, tokenizer)
print("out:", response)


out: This anime was released on Jan 23, 1995 to Mar 07, 1996.
Its genres are Action, Adventure, Sci-Fi.
The description of this anime is: The young and powerful Shinji Ikari has been given a mission by the mysterious Rebuilding Council—to destroy humanity and eradicate all life in order to prevent mankind from becoming evil. With his brother G
