In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session  

/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1A_dev_test.tsv
/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1A_dev.tsv
/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1A_train.tsv


In [2]:
# প্রয়োজনীয় প্যাকেজ (Kaggle সেশনে চালাও)
!pip install -q transformers accelerate datasets peft bitsandbytes


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.

In [4]:
"""
qwen3_1.7b_lora_finetune_and_submit.py

Full pipeline using unsloth/Qwen3-1.7B:
- LoRA (PEFT) fine-tune on train/dev
- Save adapter
- Few-shot inference on dev_test
- Save submission TSV (id, label, model)

**WARNING**: Qwen3-1.7b is large. Ensure your GPU has enough memory (>=~48GB recommended).
If OOM: switch MODEL_NAME to a smaller model or reduce MAX_LEN/BATCH_SIZE/LORA_R.
"""

import os
import pandas as pd
import torch
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    GenerationConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel

# ---------------------------
# Config (edit if needed)
# ---------------------------
TRAIN_PATH = "/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1A_train.tsv"
DEV_PATH = "/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1A_dev.tsv"
DEV_TEST_PATH = "/kaggle/input/blp25-task1/blp25_hatespeech_subtask_1A_dev_test.tsv"

MODEL_NAME = "unsloth/Qwen3-1.7B"       # requested model
OUTPUT_DIR = "./qwen3_1.7b_lora_adapter"
OUTPUT_SUBMISSION = "submission_qwen3_1.7b_lora.tsv"

# Training hyperparams (tune to your hardware)
MAX_LEN = 256            # prompt + label max tokens
BATCH_SIZE = 1           # per-device batch size (1 recommended for large models)
EPOCHS = 2               # number of epochs (keep small)
LEARNING_RATE = 2e-4
LORA_R = 8               # reduce to 4 if OOM
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
LABEL_LIST = ["Abusive", "Sexism", "Religious Hate", "Political Hate", "Profane", "None"]

print(f"Device: {DEVICE}  Model: {MODEL_NAME}")

# ---------------------------
# Helpers
# ---------------------------
def make_prompt_base(text):
    return (
        "Instruction: Classify the following Bangla YouTube comment into one of "
        f"{LABEL_LIST}.\n\nComment:\n{text}\n\nLabel:"
    )

def load_tsv(path):
    return pd.read_csv(path, sep="\t", quoting=3, engine="python")

# ---------------------------
# Load datasets
# ---------------------------
train_df = load_tsv(TRAIN_PATH)
dev_df = load_tsv(DEV_PATH)
dev_test_df = load_tsv(DEV_TEST_PATH)

print("Sizes -> train:", len(train_df), "dev:", len(dev_df), "dev_test:", len(dev_test_df))

# ---------------------------
# Tokenizer
# ---------------------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ---------------------------
# Encode for causal LM training
# ---------------------------
def encode_full(prompt_text, label_text, max_len=MAX_LEN):
    full = prompt_text + " " + label_text
    enc_full = tokenizer(full, truncation=True, max_length=max_len, padding="max_length")
    prompt_ids = tokenizer(prompt_text, truncation=True, max_length=max_len)["input_ids"]
    prompt_len = len(prompt_ids)
    labels = [-100] * prompt_len + enc_full["input_ids"][prompt_len:]
    labels = labels[:max_len] + [-100] * max(0, max_len - len(labels))
    return enc_full["input_ids"], enc_full["attention_mask"], labels

def build_examples(df, include_label=True):
    examples = []
    for _, row in df.iterrows():
        text = str(row["text"])
        prompt_text = make_prompt_base(text)
        if include_label:
            label_text = str(row["label"])
            input_ids, att_mask, labels = encode_full(prompt_text, label_text)
            examples.append({"input_ids": input_ids, "attention_mask": att_mask, "labels": labels})
        else:
            enc = tokenizer(prompt_text, truncation=True, max_length=MAX_LEN, padding="max_length")
            examples.append({"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]})
    return examples

print("Encoding datasets (this can be slow)...")
train_examples = build_examples(train_df, include_label=True)
dev_examples = build_examples(dev_df, include_label=True)
dev_test_examples = build_examples(dev_test_df, include_label=False)

train_ds = Dataset.from_list(train_examples).with_format(type="torch", columns=["input_ids","attention_mask","labels"])
dev_ds = Dataset.from_list(dev_examples).with_format(type="torch", columns=["input_ids","attention_mask","labels"])
dev_test_ds = Dataset.from_list(dev_test_examples).with_format(type="torch", columns=["input_ids","attention_mask"])

# ---------------------------
# Load base model
# ---------------------------
print("Loading base model (attempt 8-bit to save memory)...")
base_model = None
try:
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        load_in_8bit=True,    # requires bitsandbytes
        device_map="auto"
    )
    print("Loaded base model in 8-bit.")
    try:
        base_model = prepare_model_for_kbit_training(base_model)
    except Exception:
        pass
except Exception as e:
    print("8-bit attempt failed:", e)
    print("Falling back to fp16 load with device_map auto.")
    base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")

# ---------------------------
# Apply LoRA (PEFT)
# ---------------------------
print("Applying LoRA adapter (PEFT)...")
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(base_model, lora_config)
print("LoRA applied. Trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))

# ---------------------------
# Trainer setup
# ---------------------------
def collate_fn(batch):
    import torch
    input_ids = torch.tensor([b["input_ids"] for b in batch], dtype=torch.long)
    attention_mask = torch.tensor([b["attention_mask"] for b in batch], dtype=torch.long)
    if "labels" in batch[0]:
        labels = torch.tensor([b["labels"] for b in batch], dtype=torch.long)
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
    else:
        return {"input_ids": input_ids, "attention_mask": attention_mask}

training_args = TrainingArguments(
    output_dir="./qwen3_1.7b_lora_training",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=torch.cuda.is_available(),
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    data_collator=collate_fn,
    tokenizer=tokenizer
)

# ---------------------------
# Train LoRA adapter
# ---------------------------
print("Starting LoRA fine-tuning (monitor for OOM)...")
try:
    trainer.train()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("Saved LoRA adapter to:", os.path.abspath(OUTPUT_DIR))
except Exception as e:
    print("Training failed or interrupted:", e)
    print("If OOM: try MAX_LEN=128, BATCH_SIZE=1, LORA_R=4 or use smaller MODEL_NAME.")
    # attempt to save adapter if possible
    try:
        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
        print("Adapter saved to:", os.path.abspath(OUTPUT_DIR))
    except Exception as e2:
        print("Could not save adapter:", e2)

# ---------------------------
# Few-shot examples (customize if desired)
# ---------------------------
few_shot_examples = [
    ("ইন্ডিয়া কি মাছ ধরা বন্ধ রাখছে এক নদীতে দুইনীতি কেমনে হয়", "Political Hate"),
    ("লক্ষ টাকা ঘুষ দিয়ে অযোগ্য আর দায়িত্বহীন মানসিকতার মানুষ গুলো সরকারি চাকরিতে কাজ করেন", "Abusive"),
    ("শালার ব্যাটা খুব বকা দিতে পারে ইচ্ছা করে না নির্লজ্জ", "Profane"),
    ("হামাস তুমি সারাজীবন ইসলামের দৃষ্টিতে থাকবে", "None"),
    ("মহিলা দিয়ে দেশ চালালে এর থেকে বেশি কি আশা করা যায়", "Sexism"),
    ("মুসলিম মুসলিম যুদ্ধ করে কেনো", "Religious Hate"),
]

def make_few_shot_prompt(text):
    prompt = (
        f"Instruction: Classify the following Bangla YouTube comment into one of {LABEL_LIST}.\n\n"
        "Here are some examples:\n\n"
    )
    for ex_text, ex_label in few_shot_examples:
        prompt += f"Comment: {ex_text}\nLabel: {ex_label}\n\n"
    prompt += f"Comment: {text}\nLabel:"
    return prompt

# ---------------------------
# Inference: load base + adapter and predict on dev_test
# ---------------------------
print("Loading base model for inference and attaching adapter...")
# load base model for inference (fp16)
base_for_infer = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
adapter_path = os.path.abspath(OUTPUT_DIR)
if not os.path.isdir(adapter_path):
    raise FileNotFoundError(f"Adapter path not found: {adapter_path}. Ensure training saved the adapter.")

model_peft = PeftModel.from_pretrained(base_for_infer, adapter_path)
model_peft.eval()

# tokenizer from adapter (so special tokens preserved)
tokenizer = AutoTokenizer.from_pretrained(adapter_path if os.path.isdir(adapter_path) else MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Generation config
gen_conf = GenerationConfig(max_new_tokens=12, do_sample=False) 

# Batch inference
print("Running few-shot inference on dev_test...")
preds = []
batch_size_inf = 4 if torch.cuda.is_available() else 1
for i in tqdm(range(0, len(dev_test_df), batch_size_inf)):
    batch_texts = dev_test_df["text"].iloc[i:i+batch_size_inf].tolist()
    batch_prompts = [make_few_shot_prompt(t) for t in batch_texts]
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LEN).to(model_peft.device)
    with torch.no_grad():
        outs = model_peft.generate(**inputs, generation_config=gen_conf)
    decs = tokenizer.batch_decode(outs, skip_special_tokens=True)
    for txt in decs:
        if "Label:" in txt:
            pred = txt.split("Label:")[-1].strip().split()[0]
        else:
            pred = txt.strip().split()[-1]
        if pred not in LABEL_LIST:
            pred = "None"
        preds.append(pred)

# pad if needed
while len(preds) < len(dev_test_df):
    preds.append("None")

# Save submission
submission_df = pd.DataFrame({"id": dev_test_df["id"], "label": preds, "model": "qwen3_1.7b_lora_fewshot"})
submission_df.to_csv(OUTPUT_SUBMISSION, sep="\t", index=False)
print("Saved submission to", OUTPUT_SUBMISSION)


Device: cuda  Model: unsloth/Qwen3-1.7B
Sizes -> train: 35522 dev: 2512 dev_test: 2512
Loading tokenizer...
Encoding datasets (this can be slow)...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading base model (attempt 8-bit to save memory)...


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Loaded base model in 8-bit.
Applying LoRA adapter (PEFT)...
LoRA applied. Trainable params: 1605632
Starting LoRA fine-tuning (monitor for OOM)...
Training failed or interrupted: only integer tensors of a single element can be converted to an index
If OOM: try MAX_LEN=128, BATCH_SIZE=1, LORA_R=4 or use smaller MODEL_NAME.
Adapter saved to: /kaggle/working/qwen3_1.7b_lora_adapter
Loading base model for inference and attaching adapter...
Running few-shot inference on dev_test...


  0%|          | 0/628 [00:00<?, ?it/s]

`generation_config` default values have been modified to match model-specific defaults: {'max_length': 40960, 'do_sample': True, 'temperature': 0.6, 'top_k': 20, 'top_p': 0.95, 'pad_token_id': 151654, 'bos_token_id': 151643, 'eos_token_id': [151645, 151643]}. If this is not desired, please set these values explicitly.


Saved submission to submission_qwen3_1.7b_lora.tsv
