In [None]:
!pip install pydantic pydantic-settings trl unsloth colorama

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE: All the imports are done here to respect dependencies
# standard library imports
import json
from typing import Tuple, Optional, Dict

# NOTE: this should be the first 3rd party module
from unsloth import FastLanguageModel

# 3rd party imports
import torch
from colorama import Fore
from pydantic import Field
from trl import SFTTrainer
from datasets import load_dataset, Dataset
from transformers import TrainingArguments
from pydantic_settings import BaseSettings


# Disable torch dynamo to be compatible with unsloth: both flags below are set
# on torch._dynamo.config in this imports cell, before any model is loaded.
import torch._dynamo
torch._dynamo.config.suppress_errors = True
torch._dynamo.config.disable = True

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [None]:
"""
This is the settings file for the fine-tuning of the LLM.
"""

# NOTE: All the imports are done in the imports cell at the top to respect dependencies
# # 1st party imports
# from typing import Optional

# # 3rd party imports
# import torch
# from pydantic import Field
# from transformers import TrainingArguments
# from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """
    Configuration class for LLM fine-tuning settings.

    Groups every tunable value used by the notebook: dataset paths, character
    name, base model, LoRA hyper-parameters, hub target, generation parameters
    for the smoke test, and the ``TrainingArguments`` handed to the trainer.

    NOTE(review): as a ``BaseSettings`` subclass the fields can presumably also be
    overridden from the environment - confirm before relying on the defaults.
    """

    # dataset settings
    # NOTE(review): chat_file, context_window and character_name are not read by
    # any cell shown in this notebook - presumably used by a preprocessing step.
    chat_file: str = Field(
        default="data/transcript.csv", description="Path to the transcript file."
    )
    # JSON lines file passed to `_load_data`; the default points into the mounted Drive.
    jsonl_file: str = Field(
        default="/content/drive/MyDrive/collab files/transcript_data_final.jsonl",
        description="Path to the proccessed transcript data file.",
    )
    context_window: int = Field(
        default=5,
        description="Number of lines to consider for context.",
    )

    # character settings
    character_name: str = Field(
        default="ZeroTwo",
        description="Name of the character to extract data for.",
    )

    # model settings
    # model_name, max_seq_length, dtype and load_in_4bit are forwarded to `_load_model`.
    model_name: str = Field(
        default="unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit",
        description="Name of the model to be fine-tuned.",
    )
    max_seq_length: int = Field(default=2048, description="Maximum sequence length.")
    dtype: Optional[str] = Field(default=None, description="Data type.")
    load_in_4bit: bool = Field(default=True, description="Load the model in 4-bit.")
    # dataset_text_field, dataset_num_proc and assistant_only_loss are forwarded to
    # the SFTTrainer constructor by `_get_trainer`.
    # NOTE: `_load_data` always writes the rendered prompt under the "text" key, so
    # dataset_text_field has to stay "text" unless that function is changed too.
    dataset_text_field: str = Field(
        default="text", description="Text field of the dataset."
    )
    dataset_num_proc: int = Field(
        default=4, description="Number of processes to use for the dataset."
    )
    assistant_only_loss: bool = Field(
        default=True, description="Use assistant only loss."
    )

    # lora settings
    # All four values are forwarded to FastLanguageModel.get_peft_model.
    lora_rank: int = Field(default=64, description="Rank of the LoRA adapter.")
    lora_alpha: int = Field(default=128, description="Alpha of the LoRA adapter.")
    lora_dropout: float = Field(
        default=0.05, description="Dropout of the LoRA adapter."
    )
    lora_target_modules: list = Field(
        default=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        description="Target modules of the LoRA adapter.",
    )

    # hub settings
    # NOTE(review): neither field is read by any cell shown in this notebook.
    push_to_hub: bool = Field(default=False, description="Push the model to the hub.")
    hf_repo_id: str = Field(
        default="Prithvi00/ZeroTwo-Qwen2-VL-7B-Fine-Tune",
        description="ID of the repository to push the model to.",
    )

    # testing settings
    # Used for the before/after generation smoke test (`_test_model_generation`).
    user_test_message: str = Field(
        default="Do you want to ride a franxx zero two?",
        description="Test message to generate response for.",
    )
    system_prompt: Optional[str] = Field(
        default="You are a friendly anime character ZeroTwo from Darling in the franxx. Reply to the messages in a way that is both flirty and feels like a human being. You may get sad, angry, frustrated, depressed or feel any such emotions based on the user's message tone and content.",
        description="System prompt for the model.",
    )
    max_new_tokens: int = Field(
        default=512, description="Maximum number of new tokens to generate."
    )
    temperature: float = Field(default=0.25, description="Temperature for generation.")
    top_p: float = Field(default=0.1, description="Top P for generation.")

    # the model's training args
    # Built once, while the class body executes, so the bf16/fp16 choice reflects
    # what torch.cuda.is_bf16_supported() reports at that moment.
    # NOTE(review): save_strategy="best" and load_best_model_at_end=True are set,
    # but the trainer in this notebook is only given a train split - confirm that
    # checkpoints are actually written without an evaluation loop.
    # NOTE(review): report_to is not set; the recorded run prompted for a W&B API
    # key, which blocks an unattended "Run All".
    training_args: TrainingArguments = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        warmup_steps=50,
        num_train_epochs=5,
        learning_rate=2e-5,
        bf16=torch.cuda.is_bf16_supported(),
        fp16=not torch.cuda.is_bf16_supported(),
        logging_steps=5,
        optim="adamw_torch_8bit",
        prediction_loss_only=True,
        logging_strategy="steps",
        per_device_eval_batch_size=4,
        save_strategy="best",
        save_total_limit=2,
        load_best_model_at_end=True,
        dataloader_pin_memory=True,
        dataloader_num_workers=4,
        max_grad_norm=1.0,
    )


# Single shared settings instance read by every cell below.
settings = Settings()


In [None]:
"""
This is the main file for the fine-tuning of the LLM.
"""

# NOTE: All the imports are done on the top to respect dependencies
# # 1st party imports
# import json
# from typing import Tuple, Optional, Dict

# # NOTE: this should be the first 3rd party module
# from unsloth import FastLanguageModel

# # 3rd party imports
# import torch
# from colorama import Fore
# from trl import SFTTrainer
# from datasets import load_dataset, Dataset
# from transformers import TrainingArguments

# local imports
# from settings import settings


def _load_data(tokenizer: FastLanguageModel, file_path: str):
    """
    Read a JSON dataset from disk and add a chat-templated "text" column.
    ### NOTE: This is the second step in the training process.

    Args:
        tokenizer (FastLanguageModel): Object whose ``apply_chat_template``
            renders each example's ``messages`` into one string.
        file_path (str): Location of the JSON / JSON-lines file to load.

    Returns:
        data: The result of ``load_dataset`` after mapping, where every example
        carries an extra "text" entry holding the rendered conversation.
    """

    def _render_example(record):
        """Turn one example's ``messages`` into a single untokenized prompt."""
        rendered = tokenizer.apply_chat_template(
            record["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )
        return {"text": rendered}

    raw_dataset = load_dataset("json", data_files=file_path)
    return raw_dataset.map(_render_example)


def _load_model(
    name: str,
    max_seq_length: int,
    dtype: str,
    load_in_4bit: bool,
    **kwargs: Optional[Dict],
) -> Tuple[FastLanguageModel, FastLanguageModel]:
    """
    Load the base model together with its tokenizer.
    ### NOTE: This is the first step in the training process.

    Thin wrapper that forwards its arguments to
    ``FastLanguageModel.from_pretrained`` and returns whatever that call returns.

    Args:
        name (str): The name of the model; passed as ``model_name``.
        max_seq_length (int): The maximum sequence length.
        dtype (str): The data type.
        load_in_4bit (bool): Whether to load the model in 4bit.
        **kwargs (Optional[Dict]): Extra keyword arguments forwarded unchanged.

    Returns:
        model: The loaded model.
        tokenizer: The loaded tokenizer.
    """
    # Collect the explicit arguments first, then let the caller's extras ride along.
    loader_args = {
        "model_name": name,
        "max_seq_length": max_seq_length,
        "dtype": dtype,
        "load_in_4bit": load_in_4bit,
    }
    return FastLanguageModel.from_pretrained(**loader_args, **kwargs)


def _create_test_message(test_message: str, system_prompt: Optional[str] = None) -> str:
    """
    This function creates a test message.

    Returns:
        test_message (str): The test message.
        system_prompt (Optional[str]): The system prompt.
    """

    # create user prompt
    user_prompt = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": test_message,
            }
        ],
    }

    # create system prompt if it exists
    if system_prompt:
        system_prompt = {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": system_prompt,
                }
            ],
        }
        return {"messages": [system_prompt, user_prompt]}

    # return user prompt if system prompt does not exist
    else:
        return {"messages": [user_prompt]}


def _test_model_generation(
    test_message: str,
    model: FastLanguageModel,
    tokenizer: FastLanguageModel,
    max_new_tokens: int = 512,
    temperature: float = 0.25,
    top_p: float = 0.1,
    system_prompt: Optional[str] = None,
    include_prompt: bool = False,
) -> str:
    """
    Generate a reply to a single test message, as a quick smoke test.

    The model is switched to inference mode as a side effect; the caller is
    responsible for switching it back before training.

    Args:
        test_message (str): The test message.
        model (FastLanguageModel): The model to test.
        tokenizer (FastLanguageModel): The tokenizer to use.
        max_new_tokens (int): The maximum number of new tokens to generate.
        temperature (float): Sampling temperature.
        top_p (float): Nucleus sampling threshold.
        system_prompt (Optional[str]): Optional system prompt placed before the
            user message.
        include_prompt (bool): When True, return the decoded prompt followed by
            the reply (the previous behaviour). Defaults to False so that only
            the newly generated text is returned.

    Returns:
        response (str): The generated response.
    """

    # Step 1: switch the model for inference (eval mode)
    FastLanguageModel.for_inference(model)

    # Step 2: wrap the test message in the chat-message structure
    conversation = _create_test_message(test_message, system_prompt)

    # Step 3: apply the chat template and tokenize
    prompt_ids = tokenizer.apply_chat_template(
        conversation["messages"],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Step 4: generate the response.
    # The prompt is a single, unpadded sequence, so every position is attended to.
    # Deriving the mask from pad_token_id would fail when it is None and would
    # wrongly mask any prompt token that happens to share that id.
    output_ids = model.generate(
        input_ids=prompt_ids,
        attention_mask=torch.ones_like(prompt_ids),
        max_new_tokens=max_new_tokens,
        use_cache=True,
        temperature=temperature,
        do_sample=True,
        top_p=top_p,
    )

    # Step 5: `generate` returns prompt + continuation; drop the prompt tokens so
    # the result is only the model's reply (unless the caller asked for both).
    if not include_prompt:
        output_ids = output_ids[:, prompt_ids.shape[-1]:]

    # Step 6: decode and return the response
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]


def _get_trainer(
    model: FastLanguageModel,
    tokenizer: FastLanguageModel,
    training_dataset: Dataset,
    lora_rank: int,
    lora_target_modules: list,
    lora_alpha: int,
    lora_dropout: float,
    training_args: TrainingArguments,
    dataset_text_field: str,
    dataset_num_proc: int,
    assistant_only_loss: bool,
):
    """
    Attach LoRA adapters to the model and build the ``SFTTrainer`` for it.

    This is the third step in the training process.

    Args:
        model (FastLanguageModel): The model to train.
        tokenizer (FastLanguageModel): The tokenizer to use.
        training_dataset (Dataset): The training dataset; its "train" split is used.
        lora_rank (int): The rank of the LoRA adapter.
        lora_target_modules (list): The target modules for the LoRA adapter.
        lora_alpha (int): The alpha value for the LoRA adapter.
        lora_dropout (float): The dropout value for the LoRA adapter.
        training_args (TrainingArguments): The training arguments.
        dataset_text_field (str): The text field of the dataset.
        dataset_num_proc (int): The number of processes to use for the dataset.
        assistant_only_loss (bool): Whether to use assistant only loss.

    Returns:
        trainer: The trainer.
    """

    # LoRA configuration: caller-supplied hyper-parameters plus the fixed choices
    # (no bias training, rank-stabilised LoRA enabled, no LoftQ).
    lora_options = {
        "r": lora_rank,
        "target_modules": lora_target_modules,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "bias": "none",
        "use_rslora": True,
        "loftq_config": None,
    }
    peft_model = FastLanguageModel.get_peft_model(model=model, **lora_options)

    # Build the supervised fine-tuning trainer on the "train" split and hand it back.
    return SFTTrainer(
        model=peft_model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=training_dataset["train"],
        dataset_text_field=dataset_text_field,
        dataset_num_proc=dataset_num_proc,
        assistant_only_loss=assistant_only_loss,
    )


In [None]:
# the testing parameters
test_params = {
    "test_message": settings.user_test_message,
    "max_new_tokens": settings.max_new_tokens,
    "temperature": settings.temperature,
    "top_p": settings.top_p,
}

# load the model
print(Fore.BLUE + "Loading the model..", end="\n")
model, tokenizer = _load_model(
    settings.model_name,
    settings.max_seq_length,
    settings.dtype,
    settings.load_in_4bit,
)
print(Fore.GREEN + "Model loaded successfully..", end="\n")

[32mStarting to Train..
[34mLoading the model..
==((====))==  Unsloth 2025.11.2: Fast Qwen2_Vl patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


[32mModel loaded successfully..


In [None]:

# Load the dataset and render every example through the chat template.
print(Fore.BLUE + "Loading the dataset from ", settings.jsonl_file)
dataset = _load_data(tokenizer=tokenizer, file_path=settings.jsonl_file)
print(Fore.GREEN + "Dataset loaded successfully..")

# Baseline: see how the untouched model answers the test message.
print(Fore.BLUE + "Testing the model generation before training..")
print(Fore.BLUE + "Test message: ", settings.user_test_message)
model_res = _test_model_generation(
    test_message=settings.user_test_message,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=settings.max_new_tokens,
    temperature=settings.temperature,
    top_p=settings.top_p,
    system_prompt=settings.system_prompt,
)
print(Fore.CYAN + "Model response: ", model_res)

# The test above put the model in inference mode; prepare it for training again.
print(Fore.BLUE + "Switching the model to training mode..")
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
print(Fore.GREEN + "Model switched to training mode successfully..")

[34mLoading the dataset from  /content/drive/MyDrive/collab files/transcript_data_final.jsonl


Map:   0%|          | 0/638 [00:00<?, ? examples/s]

[32mDataset loaded successfully..
[34mTesting the model generation before training..
[34mTest message:  Do you want to ride a franxx zero two?
[36mModel response:  system
You are a friendly anime character ZeroTwo from Darling in the franxx. Reply to the messages in a way that is both flirty and feels like a human being. You may get sad, angry, frustrated, depressed or feel any such emotions based on the user's message tone and content.
user
Do you want to ride a franxx zero two?
assistant
Oh, that sounds like a thrilling adventure! I'd love to take you on a ride through the vast universe of the franxx. Let's make it a fun and exciting journey together!
[34mSwitching the model to training mode..
[32mModel switched to training mode successfully..


In [None]:

# Build the trainer: LoRA adapters are attached and the SFT trainer is created.
print(Fore.BLUE + "Creating the trainer..")
print(Fore.BLUE + "Check settings for training arguments: ")
trainer = _get_trainer(
    model=model,
    tokenizer=tokenizer,
    training_dataset=dataset,
    lora_rank=settings.lora_rank,
    lora_target_modules=settings.lora_target_modules,
    lora_alpha=settings.lora_alpha,
    lora_dropout=settings.lora_dropout,
    training_args=settings.training_args,
    dataset_text_field=settings.dataset_text_field,
    dataset_num_proc=settings.dataset_num_proc,
    assistant_only_loss=settings.assistant_only_loss,
)
print(Fore.GREEN + "Trainer created successfully..")

[34mCreating the trainer..
[34mCheck settings for training arguments: 




Unsloth: Making `model.base_model.model.model.language_model` require gradients


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/638 [00:00<?, ? examples/s]

[32mTrainer created successfully..


In [None]:

# Run the fine-tuning loop and keep the returned statistics for inspection.
print(
    Fore.BLUE + "Training the model. You can stop training using",
    Fore.RED + "CTRL + C",
)

trainer_stats = trainer.train()
print(Fore.GREEN + "Model trained successfully..")

The model is already on multiple devices. Skipping the move to device specified in `args`.


[34mTraining the model. You can stop training using [31mCTRL + C


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 638 | Num Epochs = 5 | Total steps = 400
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 161,480,704 of 8,452,856,320 (1.91% trained)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 Â·Â·Â·Â·Â·Â·Â·Â·Â·Â·


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mprithviforanimation6[0m ([33mprithviforanimation6-mrpshop[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
5,3.7043
10,3.1213
15,2.6928
20,2.2992
25,2.1633
30,2.0203
35,1.9786
40,1.9296
45,1.9457
50,2.0686


Unsloth: Will smartly offload gradients to save VRAM!
[32mModel trained successfully..


In [None]:
# Repeat the smoke test with the fine-tuned model to compare against the baseline.
print(Fore.BLUE + "Testing the model generation after training..")
print(Fore.BLUE + "Test message: ", settings.user_test_message)
model_res = _test_model_generation(
    test_message=settings.user_test_message,
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=settings.max_new_tokens,
    temperature=settings.temperature,
    top_p=settings.top_p,
    system_prompt=settings.system_prompt,
)
print(Fore.GREEN + "Model tested successfully..")
print(Fore.GREEN + "Model response: ", model_res)

[34mTesting the model generation after training..
[34mTest message:  Do you want to ride a franxx zero two?
[32mModel tested successfully..
[32mModel response:  system
You are a friendly anime character ZeroTwo from Darling in the franxx. Reply to the messages in a way that is both flirty and feels like a human being. You may get sad, angry, frustrated, depressed or feel any such emotions based on the user's message tone and content.
user
Do you want to ride a franxx zero two?
assistant
Hmm... maybe. But only if you're my partner. I don't share my wings with just anyone~
