In [1]:
# CELL 0 ‚Äì FIXED DEPENDENCIES (Run this first ‚Üí then RESTART SESSION)
!pip install -q --no-deps bitsandbytes transformers accelerate peft
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -q --no-deps trl==0.24.0
!pip install -q portalocker sacrebleu rouge_score

print("Dependencies installed! Click 'Restart Session' now, then run the next cells.")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m962.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m506.8/506.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m284.4/284.4 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚

In [2]:
# CELL 1 – Imports & Full OOP + Strategy Pattern (FIXED)
import torch
import pandas as pd
import json
from datetime import datetime
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
from rouge_score import rouge_scorer
import sacrebleu
import math

# Strategy Pattern
class FineTuningStrategy:
    """Interface for objects that turn a model, tokenizer and dataset into a trainer.

    Subclasses override :meth:`apply`. The base implementation raises instead of
    silently returning ``None``, so a missing override fails at the call site
    rather than later when the caller tries to use the result.
    """

    def apply(self, model, tokenizer, dataset, args):
        """Build and return a trainer for the given inputs.

        Args:
            model: Model to be fine-tuned.
            tokenizer: Tokenizer that belongs to ``model``.
            dataset: Dataset container holding the training data.
            args: Training arguments forwarded to the trainer.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement apply(model, tokenizer, dataset, args)"
        )

class UnslothStrategy(FineTuningStrategy):
    """Strategy that wires the model and data splits into an ``SFTTrainer``."""

    def apply(self, model, tokenizer, dataset, args):
        """Create an ``SFTTrainer`` over the ``"train"``/``"validation"`` splits.

        Args:
            model: Model handed to the trainer.
            tokenizer: Tokenizer handed to the trainer.
            dataset: Mapping-like object indexed with ``"train"`` and ``"validation"``.
            args: Training arguments passed through unchanged.

        Returns:
            The constructed ``SFTTrainer`` (not yet trained).
        """
        trainer_kwargs = {
            "model": model,
            "tokenizer": tokenizer,
            "train_dataset": dataset["train"],
            "eval_dataset": dataset["validation"],
            # Each record carries its full prompt/response under the "text" key.
            "dataset_text_field": "text",
            "max_seq_length": 4096,
            "args": args,
            "packing": False,
        }
        return SFTTrainer(**trainer_kwargs)

# Dataset Processor
class DatasetProcessor:
    """Converts the question/answer CSV into train/validation JSONL files."""

    def preprocess(self, path):
        """Read the CSV, build the prompt text and write a 90/10 train/validation split.

        Rows whose ``Questions`` or ``Answers`` cell is missing are skipped;
        otherwise ``astype(str)`` would turn the missing value into the literal
        text ``"nan"`` inside a training sample.

        Args:
            path: Location of a CSV file with ``Questions`` and ``Answers`` columns.

        Returns:
            A tuple ``(dataset, val)``: the result of ``load_dataset`` over
            ``train.jsonl``/``val.jsonl`` (splits ``"train"`` and ``"validation"``),
            and the validation rows as a DataFrame.

        Raises:
            KeyError: If the ``Questions`` or ``Answers`` column is absent.
        """
        df = pd.read_csv(path, encoding='utf-8-sig')
        print(f"Loaded {len(df)} Bengali conversations")

        # pandas reads empty CSV cells as NaN; drop those rows before formatting.
        complete = df.dropna(subset=['Questions', 'Answers']).copy()
        skipped = len(df) - len(complete)
        if skipped:
            print(f"Skipped {skipped} rows with a missing question or answer")

        complete['text'] = (
            "User: " + complete['Questions'].astype(str)
            + "\nAssistant: " + complete['Answers'].astype(str)
        )

        # Fixed seed keeps the split reproducible across runs.
        train = complete.sample(frac=0.9, random_state=42)
        val = complete.drop(train.index)

        # force_ascii=False keeps the Bengali text readable instead of \u-escaped.
        train[['text']].to_json("train.jsonl", orient="records", lines=True, force_ascii=False)
        val[['text']].to_json("val.jsonl", orient="records", lines=True, force_ascii=False)
        return load_dataset("json", data_files={"train": "train.jsonl", "validation": "val.jsonl"}), val

# Fine-Tuner Class (Fixed __init__)
class LLAMAFineTuner:
    """Loads a 4-bit Llama checkpoint, attaches LoRA adapters and trains it.

    The trainer itself is built by the injected strategy object, so this class
    only owns model loading, the LoRA configuration and the training arguments.
    All hyperparameters default to the values that were previously hard-coded,
    so ``LLAMAFineTuner(strategy)`` behaves as before.
    """

    def __init__(
        self,
        strategy,
        model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        lora_r=16,
        lora_alpha=32,
        target_modules=("q_proj", "v_proj"),
    ):
        """Load the base model and wrap it with LoRA adapters.

        Args:
            strategy: Object with ``apply(model, tokenizer, dataset, args)`` that
                returns a trainer exposing ``train()``.
            model_name: Checkpoint identifier passed to
                ``FastLanguageModel.from_pretrained``.
            lora_r: LoRA rank.
            lora_alpha: LoRA alpha.
            target_modules: Names of the modules that receive adapters.
        """
        self.strategy = strategy
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name,
            max_seq_length=4096,
            load_in_4bit=True,
            dtype=None,  # Auto detection
            device_map="auto",
        )
        self.model = FastLanguageModel.get_peft_model(
            model,
            r=lora_r,
            lora_alpha=lora_alpha,
            # Copy into a list so a tuple default is never shared or mutated.
            target_modules=list(target_modules),
            lora_dropout=0,
            bias="none",
            use_gradient_checkpointing="unsloth",
            random_state=42,
        )
        self.tokenizer = tokenizer

    def fine_tune(self, dataset, **training_overrides):
        """Train the LoRA-wrapped model and return the trainer.

        Args:
            dataset: Dataset container forwarded to the strategy.
            **training_overrides: Optional ``TrainingArguments`` keyword arguments
                that replace the defaults below (for example ``max_steps=300``).

        Returns:
            The trainer produced by the strategy, after ``train()`` has run.
        """
        # Query once; fp16 and bf16 must be exact opposites of each other.
        bf16_available = torch.cuda.is_bf16_supported()
        settings = dict(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=8,
            warmup_steps=10,
            max_steps=120,
            learning_rate=2e-4,
            fp16=not bf16_available,
            bf16=bf16_available,
            logging_steps=5,
            output_dir="bengali_llama_finetuned",
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            report_to="none",
            save_steps=60,
            save_total_limit=2,
            seed=42,
        )
        settings.update(training_overrides)
        args = TrainingArguments(**settings)
        trainer = self.strategy.apply(self.model, self.tokenizer, dataset, args)
        trainer.train()
        return trainer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-11-29 19:05:14.740453: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764443115.104005      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764443115.221828      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# CELL 2 – Load dataset
# NOTE: the space before ".csv" is part of the path; the recorded run loaded
# 38233 rows from exactly this location, so do not "tidy" it.
CORPUS_CSV = "/kaggle/input/bengaliempatheticconversationscorpus/BengaliEmpatheticConversationsCorpus .csv"

processor = DatasetProcessor()
# `dataset` feeds the training cell; `val_df` keeps the held-out rows as a DataFrame.
dataset, val_df = processor.preprocess(CORPUS_CSV)
print("Dataset ready ‚Äì starting training in next cell...")

Loaded 38233 Bengali conversations


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset ready ‚Äì starting training in next cell...


In [4]:
# CELL 3 – Train the model (Transformers 4.57-style arguments, larger LoRA configuration)
# The three imports below repeat CELL 1; `torch` and `dataset` come from earlier cells.
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

# Base model: 4-bit Llama 3.1 8B Instruct.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=4096,
    load_in_4bit=True,
    dtype=None,  # auto-detect (fp16/bf16)
    device_map="auto",
)

# LoRA adapters on every attention and MLP projection. The LLAMAFineTuner class
# in CELL 1 defaults to q_proj/v_proj only, with r=16 and lora_alpha=32.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=64,                  # rank raised from 16
    target_modules=lora_target_modules,
    lora_alpha=64,         # raised from 32; note alpha/r is 1 here versus 2 for 32/16
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
)

# Exactly one of fp16/bf16 is enabled, depending on what the GPU supports.
bf16_supported = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    warmup_steps=20,               # raised from 10
    max_steps=300,                 # raised from 120 (author's estimate: ~5-6 hours on 2x T4)
    learning_rate=2e-4,
    fp16=not bf16_supported,
    bf16=bf16_supported,
    logging_steps=10,
    eval_strategy="steps",         # named eval_strategy, not evaluation_strategy
    eval_steps=100,
    save_steps=150,
    output_dir="bengali_llama_finetuned",
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",    # changed from linear
    report_to="none",
    seed=42,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",
    max_seq_length=4096,
    args=training_args,
    packing=False,
)

print("Starting Bengali-optimized fine-tuning (300 steps)...")
trainer.train()

# Persist the trained LoRA adapter and the tokenizer. This is NOT a merged
# full-weight model: the recorded listing of final_model/ contains
# adapter_config.json and adapter_model.safetensors plus tokenizer files.
trainer.save_model("final_model")
tokenizer.save_pretrained("final_model")
print("Training complete! Model saved to 'final_model'")

==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth 2025.11.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/34410 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/3823 [00:00<?, ? examples/s]

Starting Bengali-optimized fine-tuning (300 steps)...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 2
   \\   /|    Num examples = 34,410 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 167,772,160 of 8,198,033,408 (2.05% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
100,0.7019,0.717911
200,0.6841,0.682775
300,0.6551,0.671973


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Training complete! Model saved to 'final_model'


In [5]:
# Sanity check: list the files written to final_model/ by the training cell.
# In the recorded run this shows the LoRA adapter (adapter_config.json,
# adapter_model.safetensors) and the tokenizer files.
!ls -la final_model/

total 672320
drwxr-xr-x 2 root root      4096 Nov 29 21:42 .
drwxr-xr-x 6 root root      4096 Nov 29 21:42 ..
-rw-r--r-- 1 root root      1060 Nov 29 21:42 adapter_config.json
-rw-r--r-- 1 root root 671149168 Nov 29 21:42 adapter_model.safetensors
-rw-r--r-- 1 root root      4614 Nov 29 21:42 chat_template.jinja
-rw-r--r-- 1 root root      5262 Nov 29 21:42 README.md
-rw-r--r-- 1 root root       454 Nov 29 21:42 special_tokens_map.json
-rw-r--r-- 1 root root     50641 Nov 29 21:42 tokenizer_config.json
-rw-r--r-- 1 root root  17209920 Nov 29 21:42 tokenizer.json
-rw-r--r-- 1 root root      5816 Nov 29 21:42 training_args.bin
