In [1]:
!pip install -q -U bitsandbytes transformers peft datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m110.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m85.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install -q -U sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import warnings

# Tell the Hugging Face integrations not to start a Weights & Biases run.
os.environ.update({"WANDB_DISABLED": "true"})
# Install a blanket "ignore" filter for Python warnings.
warnings.simplefilter('ignore')

# ------------------------------------------
#  Configuration
# ------------------------------------------
# Locations: base model weights, training CSV, and where the adapter is saved.
BASE_MODEL_PATH = "/kaggle/input/qwen2.5/transformers/7b-instruct/1"
TRAIN_DATA_PATH = "/kaggle/input/social-media-extremism-detection-challenge/train.csv"
OUTPUT_DIR = "./qwen-binary-strict-v2"

# LoRA adapter size: rank and scaling factor passed to LoraConfig.
LORA_R, LORA_ALPHA = 32, 64

# Optimisation schedule. One optimizer step covers
# BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 16 sequences per device.
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
NUM_EPOCHS = 2
LEARNING_RATE = 2e-4

# Every training example is truncated or padded to exactly this many tokens.
MAX_SEQ_LENGTH = 200

# ==========================================
#           1. DATA PREPARATION
# ==========================================
train_df = pd.read_csv(TRAIN_DATA_PATH)
# Missing posts become empty strings so the chat template always gets a str.
train_df['Original_Message'] = train_df['Original_Message'].fillna("").astype(str)

# Map labels to the single-character targets '0' and '1'.
# Labels are stripped and upper-cased first so that stray whitespace or a
# different casing does not make a valid label fall through the mapping.
label_map = {'NON_EXTREMIST': '0', 'NON-EXTREMIST': '0', 'EXTREMIST': '1'}
train_df['target'] = train_df['Extremism_Label'].str.strip().str.upper().map(label_map)

# Anything that did not match label_map is NaN here. Left in place, such rows
# would be trained as class '1' downstream (every non-'0' target is treated as
# '1'), so report and drop them instead of silently mislabelling them.
unmapped_rows = train_df['target'].isna()
if unmapped_rows.any():
    unexpected_labels = train_df.loc[unmapped_rows, 'Extremism_Label'].unique().tolist()
    print(
        f"Dropping {int(unmapped_rows.sum())} row(s) with unrecognised "
        f"Extremism_Label values: {unexpected_labels}"
    )
    train_df = train_df.loc[~unmapped_rows].reset_index(drop=True)

if train_df.empty:
    raise ValueError("No training rows left after label mapping; check label_map against the CSV.")

train_dataset = Dataset.from_pandas(train_df[['Original_Message', 'target']])

# ==========================================
#           2. TOKENIZER SETUP
# ==========================================
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, trust_remote_code=True)
# EOS doubles as the padding token. Padded positions are excluded from both
# attention (mask 0) and the loss (label -100) when examples are built.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# --- CRITICAL: IDENTIFY EXACT TOKEN IDS ---
# The training target is exactly one of these two token ids.
token_0_id = tokenizer.convert_tokens_to_ids("0")
token_1_id = tokenizer.convert_tokens_to_ids("1")
print(f"Token ID for '0': {token_0_id}")
print(f"Token ID for '1': {token_1_id}")

# Fail fast instead of relying on someone reading the printout: a token that is
# missing from the vocabulary resolves to None or to the unknown-token id, and
# training on that would be meaningless.
_unresolved_class_tokens = [
    token
    for token, token_id in (("0", token_0_id), ("1", token_1_id))
    if token_id is None or token_id == tokenizer.unk_token_id
]
if _unresolved_class_tokens:
    raise ValueError(
        f"Class token(s) {_unresolved_class_tokens} are not single vocabulary entries of this tokenizer."
    )
if token_0_id == token_1_id:
    raise ValueError("Class tokens '0' and '1' resolved to the same token id.")

# ==========================================
#      3. STRICT MASKING FUNCTION (UPDATED)
# ==========================================
def _shared_suffix_len(left, right):
    """Return how many trailing elements two sequences have in common."""
    count = 0
    for a, b in zip(reversed(left), reversed(right)):
        if a != b:
            break
        count += 1
    return count


def format_and_strict_mask(example):
    """Turn one labelled post into a fixed-length, strictly masked training example.

    The post is wrapped in the tokenizer's chat template (system instruction,
    user message, opening of the assistant turn), followed by the single answer
    token and EOS. Only the answer token carries a label; every other position
    (prompt, EOS, padding) is set to -100 so it is ignored by the loss.

    Over-long posts are shortened *inside the user message*: the start of the
    prompt and the closing part of the template are kept, so the answer token
    and its label are never cut off.

    Args:
        example: mapping with 'Original_Message' (str) and 'target' ('0' or '1').

    Returns:
        dict with 'input_ids', 'labels' and 'attention_mask', each a list of
        exactly MAX_SEQ_LENGTH ints.

    Raises:
        ValueError: if 'target' is neither '0' nor '1', or if MAX_SEQ_LENGTH is
            too small to hold the answer token and EOS.
    """
    if MAX_SEQ_LENGTH < 2:
        raise ValueError("MAX_SEQ_LENGTH must be at least 2 (answer token + EOS).")

    system_message = {
        "role": "system",
        "content": "Classify the following post as '0' (Non-Extremist) or '1' (Extremist).",
    }
    messages = [
        system_message,
        {"role": "user", "content": example['Original_Message']},
    ]

    # add_generation_prompt=True appends the opening of the assistant turn, so
    # the very next token the model has to produce is the answer.
    prompt_ids = list(tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True
    ))

    # Be explicit about the label: an unknown or missing target must not be
    # silently trained as class '1'.
    target = example['target']
    if target == '0':
        target_id = token_0_id
    elif target == '1':
        target_id = token_1_id
    else:
        raise ValueError(f"Unexpected target {target!r}; expected '0' or '1'.")

    # Two positions are reserved for the answer token and EOS.
    max_prompt_len = MAX_SEQ_LENGTH - 2
    if len(prompt_ids) > max_prompt_len:
        # The same template rendered with an empty user message shares its
        # trailing tokens (end of user turn + assistant opener) with the real
        # prompt. Keep that tail and drop tokens from the end of the post.
        skeleton_ids = list(tokenizer.apply_chat_template(
            [system_message, {"role": "user", "content": ""}],
            tokenize=True,
            add_generation_prompt=True
        ))
        tail_len = min(_shared_suffix_len(prompt_ids, skeleton_ids), max_prompt_len)
        head_len = max_prompt_len - tail_len
        prompt_ids = prompt_ids[:head_len] + prompt_ids[len(prompt_ids) - tail_len:]

    input_ids = prompt_ids + [target_id, tokenizer.eos_token_id]
    labels = [-100] * len(prompt_ids) + [target_id, -100]
    attention_mask = [1] * len(input_ids)

    # Right-pad to the fixed length; padding is masked out of attention and loss.
    pad_len = MAX_SEQ_LENGTH - len(input_ids)
    input_ids += [tokenizer.pad_token_id] * pad_len
    labels += [-100] * pad_len
    attention_mask += [0] * pad_len

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask
    }

# Tokenise every row; the raw text columns are dropped so only model inputs remain.
tokenized_train = train_dataset.map(format_and_strict_mask, remove_columns=train_dataset.column_names)

# ==========================================
#           4. MODEL & TRAINING
# ==========================================
# 4-bit NF4 weights with double quantisation and a float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

# Seed torch before the adapters are created so their initial weights are the
# same on every run. 42 is chosen to match the default `seed` of
# TrainingArguments (NOTE(review): confirm against the installed version).
torch.manual_seed(42)

# LoRA adapters on all attention and MLP projection layers.
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, lora_config)

def data_collator(features):
    """Stack pre-padded examples into a batch of int64 tensors.

    Args:
        features: list of dicts, each holding equally long lists under
            'input_ids', 'attention_mask' and 'labels'.

    Returns:
        dict with the same three keys, each mapped to a torch.long tensor
        built from the corresponding lists of all features.
    """
    columns = ("input_ids", "attention_mask", "labels")
    return {
        column: torch.tensor([feature[column] for feature in features], dtype=torch.long)
        for column in columns
    }

# Training schedule and bookkeeping options, built separately from the Trainer.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    save_strategy="no",  # no intermediate checkpoints; the model is saved once below
    report_to="none",  # no experiment trackers, which avoids the W&B error
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)

print("Starting STRICT Binary Training...")
trainer.train()

# Write the trained model and the tokenizer side by side for the inference step.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Training Complete. Proceed to Inference.")

2026-01-01 19:06:48.662044: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767294408.870516      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767294408.927331      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Token ID for '0': 15
Token ID for '1': 16


Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting STRICT Binary Training...


Step,Training Loss
10,2.4966
20,0.5963
30,0.4717
40,0.5474
50,0.5123
60,0.5387
70,0.3396
80,0.6805
90,0.5609
100,0.5561


Training Complete. Proceed to Inference.
