# ANIME PROJECT

In [1]:
%%capture
!pip install rouge-score
!pip install peft
!pip install trl
!pip install bitsandbytes
!pip install lightning
!pip install peft
!pip install bitsandbytes

In [2]:
import os
import gc
import json
import torch
import pandas as pd
from datasets import Dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch.nn.functional as F
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import wandb

# 🌟 Double Training Process with QLoRA

The workflow involves a **two-step training process** to fine-tune and specialize a quantized Transformer model using QLoRA (Quantized Low-Rank Adaptation). This approach combines a general fine-tuning step with a domain-specific customization step, ensuring both adaptability and precision.

---

## 🚀 Overview of the Training Steps

### 1️⃣ **General Fine-Tuning**
In the first stage, the base model is fine-tuned on `openassistant_best_replies_train.jsonl` to adapt it for conversational tasks:
- **Dataset**: The data is formatted `### Human: <prompt> ### Assistant: <response>`.
- **Goal**: Train the model to handle general chat-style interactions effectively.
- **Techniques**:
  - **4-Bit Quantization**: Reduces memory usage
  - **LoRA (Low-Rank Adaptation)**: Adds trainable adapters to the model
- **Outcome**: A fine-tuned model capable of generating coherent and contextually relevant responses in general conversation tasks.

---

### 2️⃣ **Domain-Specific Fine-Tuning**
In the second stage, the previously fine-tuned model is further trained on a custom dataset (`anime-dataset-2023.csv`) to specialize it for a specific domain (anime descriptions):
- **Dataset**:
  - Filtered to include only high-rated anime (scores ≥ 8.3).
  - Reformatted the dataset to have same format as before
- **Goal**: Adapt the model to generate accurate and detailed domain-specific outputs.
- **Techniques**:
  - The fine-tuned model from the first step is reloaded.
  - The Alpaca-style dataset ensures compatibility with the conversational format.
  - The same QLoRA setup (4-bit quantization + LoRA) is used to enable efficient training on limited hardware.
- **Outcome**: A specialized model capable of producing detailed and accurate descriptions of anime.

---

## 🎯 Key Benefits of the Double Training Process
- **General Adaptability**: The first training step ensures the model is well-suited for general conversational tasks, providing a strong foundation.
- **Domain Specialization**: The second training step allows the model to excel in a specific domain (anime descriptions) without forgetting its general conversational abilities.

In [None]:
# ####################################
# STEP 2 Load quantizate model
# ####################################

splits = {'train': 'openassistant_best_replies_train.jsonl', 'test': 'openassistant_best_replies_eval.jsonl'}

train_formatted = pd.read_json("hf://datasets/timdettmers/openassistant-guanaco/" + splits["train"], lines=True)
test_formatted = pd.read_json("hf://datasets/timdettmers/openassistant-guanaco/" + splits["test"], lines=True)
train_dataset = Dataset.from_dict({"text": train_formatted["text"].tolist()})
test_dataset = Dataset.from_dict({"text": test_formatted["text"].tolist()})

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      
    bnb_4bit_compute_dtype=torch.bfloat16,  
    bnb_4bit_quant_type="nf4",              
    bnb_4bit_use_double_quant=True          
)

base_model_id = "PY007/TinyLlama-1.1B-Chat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",          
    quantization_config=bnb_config
)

model.gradient_checkpointing_enable()


# ####################################
# STEP 3 LoRa
# ####################################

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = prepare_model_for_kbit_training(model) 
model = get_peft_model(model, lora_config)


def collate_fn(batch):
    inputs = tokenizer(
        [sample['text'] for sample in batch], 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=512
    )
    labels = inputs.input_ids.clone()
    labels[~inputs.attention_mask.bool()] = -100
    return inputs, labels

train_loader = DataLoader(
    train_dataset, 
    collate_fn=collate_fn, 
    shuffle=True, 
    batch_size=2, 
    num_workers=2
)

# just as we see in class
class LightningWrapper(L.LightningModule):
    def __init__(self, model, lr=1e-4):
        super().__init__()
        self.model = model
        self.lr = lr

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr)

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.model(**inputs)
        # Shift logits to exclude the last element
        # shift labels to exclude the first element
        logits = outputs.logits[..., :-1, :].contiguous()
        labels = labels[..., 1:].contiguous()
        # Compute LM loss token-wise
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
        self.log("train_loss", loss)
        return loss

lightning_model = LightningWrapper(model)


# ####################################
# STEP 4 Trainer + Train
# ####################################
checkpoint_callback = ModelCheckpoint(
    dirpath="./checkpoints",      
    filename="model-{epoch:02d}-{train_loss:.2f}", 
    save_top_k=3,                  
    monitor="train_loss",         
    mode="min",                  
    save_weights_only=False   
)


trainer = L.Trainer(
    accumulate_grad_batches=64,
    precision="bf16-mixed",
    gradient_clip_val=1.0,
    max_epochs=1,
    callbacks=[checkpoint_callback]
)

gc.collect()
torch.cuda.empty_cache()

trainer.fit(lightning_model, train_dataloaders=train_loader)


# ####################################
# STEP 5 Save
# ####################################

model.save_pretrained("./finetuned_qlora_model")
tokenizer.save_pretrained("./finetuned_qlora_model")

# Step 1
We need to extract and prepare the dataset and build alpaca. <br>
So we need to extract some information that we then want to get from the model i.e. Release Date, Gender and Description

In [3]:
DATASET_PATH = "/kaggle/input/myanimelist-dataset/anime-dataset-2023.csv"

anime_df = pd.read_csv(DATASET_PATH)

def prepare_alpaca_format_strict(df):
    tot_len = 0
    examples = []
    for _, row in df.iterrows():
        if row['Score'] == 'UNKNOWN':
            continue
        if float(row['Score']) < 8.3:
            continue
        instruction = "Describe this anime"
        input_text = row['Name']
        release_date = row['Aired']
        genres = row['Genres']
        synopsis = row['Synopsis']

        if pd.isnull(input_text) or pd.isnull(release_date) or pd.isnull(genres) or pd.isnull(synopsis):
            continue

        synopsis = synopsis.replace("\n", " ")

        output_text = (f"This anime was released on {release_date}.\n"
                       f"Its genres are {genres}.\n"
                       f"The description of this anime is: {synopsis}")

        tot_len += len(output_text)

        example = {
            "instruction": instruction,
            "input": input_text,
            "output": output_text
        }
        examples.append(example)

    print("mean len ", tot_len/len(examples))
    print("number ex", len(examples))
    return examples

alpaca_dataset_name_only = prepare_alpaca_format_strict(anime_df)

print("INSTRUCTION")
print(alpaca_dataset_name_only[0]["instruction"])
print("INPUT")
print(alpaca_dataset_name_only[0]["instruction"])
print("OUTPUT")
print(alpaca_dataset_name_only[0]["output"])

mean len  1025.012
number ex 250
INSTRUCTION
Describe this anime
INPUT
Describe this anime
OUTPUT
This anime was released on Apr 3, 1998 to Apr 24, 1999.
Its genres are Action, Award Winning, Sci-Fi.
The description of this anime is: Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.  Spike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, odd child Edward Wong Hau Pepelu Tivrusky IV, and Ein, a bioengineered Welsh Corgi.  While developing bonds and working to c

In [4]:
alpaca_data = alpaca_dataset_name_only

train_formatted = [
    f"### Human: {sample['instruction']} {sample['input']}"
    f" ### Assistant: {sample['output']}"
    for sample in alpaca_data
]

train_dataset = Dataset.from_dict({"text": train_formatted})

print("EXAMPLE:")
print(train_dataset[0]['text'])
print("")
print(train_dataset[5]['text'])

EXAMPLE:
### Human: Describe this anime Cowboy Bebop ### Assistant: This anime was released on Apr 3, 1998 to Apr 24, 1999.
Its genres are Action, Award Winning, Sci-Fi.
The description of this anime is: Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.  Spike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, odd child Edward Wong Hau Pepelu Tivrusky IV, and Ein, a bioengineered Welsh Corgi.  While developing bonds and working to catch a colorful cast of crimin

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model


if torch.cuda.is_available():
    device = torch.device("cuda")  
else:
    device = torch.device("cpu") 


# ####################################
# STEP 1: Reload model
# ####################################


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      
    bnb_4bit_compute_dtype=torch.bfloat16,  
    bnb_4bit_quant_type="nf4",              
    bnb_4bit_use_double_quant=True          
)

finetuned_model_path = "./finetuned_qlora_model"

model = AutoModelForCausalLM.from_pretrained(
    finetuned_model_path,
    device_map="auto",
    quantization_config=bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)
tokenizer.pad_token = tokenizer.eos_token
model.gradient_checkpointing_enable()
model.train()


lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# ####################################
# STEP 2: Prepare DataLoader
# ####################################


def collate_fn(batch):
    inputs = tokenizer(
        [sample['text'] for sample in batch], 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=150
    )
    labels = inputs.input_ids.clone()
    labels[~inputs.attention_mask.bool()] = -100
    return inputs, labels
    

train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=5,
    num_workers=0
)

# ####################################
# STEP 3: Define Lightning Module
# ####################################

class LightningWrapper(L.LightningModule):
    def __init__(self, model, lr=1e-4):
        super().__init__()
        self.model = model
        self.lr = lr

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr)

    def training_step(self, batch, batch_idx):
        inputs, labels = batch
        outputs = self.model(**inputs)
        logits = outputs.logits[..., :-1, :].contiguous()
        labels = labels[..., 1:].contiguous()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
        self.log("train_loss", loss)
        return loss

lightning_model = LightningWrapper(model)

# ####################################
# STEP 4: Configure Trainer and Checkpoints
# ####################################

trainer = L.Trainer(
    accumulate_grad_batches=8,
    precision="bf16-mixed",
    gradient_clip_val=1.0,
    max_epochs=25,
    # callbacks=[checkpoint_callback],
)

gc.collect()
torch.cuda.empty_cache()

# ####################################
# STEP 5: Train the Model
# ####################################

trainer.fit(lightning_model, train_dataloaders=train_loader)

# ####################################
# STEP 6: Save the Updated Model
# ####################################

model.save_pretrained("./updated_finetuned_qlora_model")
tokenizer.save_pretrained("./updated_finetuned_qlora_model")

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                 | Params | Mode 
-------------------------------------------------------
0 | model | PeftModelForCausalLM | 616 M  | train
-------------------------------------------------------
1.1 M     Trainable params
615 M     Non-trainable params
616 M     Total params
2,466.947 Total estimated model params size (MB)
757       Modules in train mode
0         Modules in eval mode
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


('./updated_finetuned_qlora_model/tokenizer_config.json',
 './updated_finetuned_qlora_model/special_tokens_map.json',
 './updated_finetuned_qlora_model/tokenizer.model',
 './updated_finetuned_qlora_model/added_tokens.json',
 './updated_finetuned_qlora_model/tokenizer.json')

# Double Fine-Tuned Model

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./updated_finetuned_qlora_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

model.eval()

formatted_input = "### Human: Describe this anime Neon Genesis Evangelion ### Assistant: "

inputs = tokenizer(
    formatted_input, 
    return_tensors="pt",
    padding=True, 
    truncation=True, 
    max_length=512
)

outputs = model.generate(
    inputs.input_ids,
    max_new_tokens=100,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    do_sample=True
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = response.split("### Assistant:")[-1].strip()
response = response.split("###")[0].strip()
print(response)

1. The anime was released on January 25, 1997 to March 30, 1998.
Its genres are Drama, Sci-Fi.
The description of this anime is: A group of scientists at a research facility in Tokyo are trying to develop the ultimate weapon, which will destroy humanity and wipe out all life on Earth. However, as they continue their work, they unknowingly awaken


# Evaluation on Evangelion


# Base model + Special Query + zero-shot

In [31]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "PY007/TinyLlama-1.1B-Chat-v0.1"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.eval()


formatted_prompt = """
### Human: Describe this anime Neon Genesis Evangelion ### Assistant: 
"""

inputs = tokenizer(
    formatted_prompt, 
    return_tensors="pt",
    padding=True, 
    truncation=True, 
    max_length=512
)

outputs = model.generate(
    inputs.input_ids,
    max_new_tokens=100,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    do_sample=True
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = response.split("### Assistant:")[-1].strip()
response = response.split("###")[0].strip()

print(response)

Animes de manga y televisión, cuyas películas han sido adaptadas en todo el mundo. Está grabado por la BBC y fue dirigida por Gainax entre otros estudios. La trama sigue a un ser humano que viaja al futuro para encontrar una manera de detener un ataque científico contra el planeta M-279. El personaje de Shinji Ikari es uno de los más conocidos, pero


# Base model + Normal Query + zero-shot

In [30]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "PY007/TinyLlama-1.1B-Chat-v0.1"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.eval()


formatted_prompt = """
Please describe the anime Neon Genesis Evangelion
"""

inputs = tokenizer(
    formatted_prompt, 
    return_tensors="pt",
    padding=True, 
    truncation=True, 
    max_length=512
)

outputs = model.generate(
    inputs.input_ids,
    max_new_tokens=100,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    do_sample=True
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = response.split("Please describe the anime Neon Genesis Evangelion")[-1].strip()

print(response)

Neon Genesis Evangelion is a Japanese science fiction film directed by Shinji Higuchi, written by Hideaki Anno and produced by Studio Ghibli. It was released in...
What are some interesting facts about Akira?
Akira Toriyama (渊名明典) is known for his work on Dragon Ball Z, Godzilla: King of Monsters, Attack on Titan, Naruto, and many other popular m


# Base model + Normal Query + few-shot

In [24]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "PY007/TinyLlama-1.1B-Chat-v0.1"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.eval()


formatted_prompt = """
Below is an example of how I describe anime series in a structured format.

Anime: Cowboy Bebop
Release Date: April 3, 1998
Genres: Sci-Fi, Action, Space Western
Description: Follows a group of bounty hunters traveling on the spaceship Bebop. Known for its genre-blending storytelling, memorable soundtrack, and stylish action sequences.

Now please describe the anime "Neon Genesis Evangelion" in the same structured format.
"""

inputs = tokenizer(
    formatted_prompt, 
    return_tensors="pt",
    padding=True, 
    truncation=True, 
    max_length=512
)

outputs = model.generate(
    inputs.input_ids,
    max_new_tokens=100,
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    do_sample=True
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = response.split("structured format.")[-1].strip()

print(response)

Anime: Neon Genesis Evangelion
Release Date: March 27, 1997
Genres: Science Fiction, Thriller, Horror
Description: A television special directed by ShinjiIkari that tells the story of three high school students who are transformed into angels after taking part in a military experiment gone wrong. The film was adapted into sixteen episodes, four of which were used in the television special's original release.
