In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate trl datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m108.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.4/517.4 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import warnings

# Switch off Weights & Biases via its environment flag and silence Python
# warnings for the rest of the session.
os.environ.update({"WANDB_DISABLED": "true"})
warnings.filterwarnings("ignore")

# ==========================================
#        CONFIGURATION
# ==========================================
# Input locations (base checkpoint, training CSV) and the adapter output folder.
BASE_MODEL_PATH = "/kaggle/input/qwen2.5/transformers/7b-instruct/1"
TRAIN_DATA_PATH = "/kaggle/input/social-media-extremism-detection-challenge/train.csv"
OUTPUT_DIR = "./qwen-binary-strict-v2"

# HYPERPARAMETERS
# LoRA rank and scaling factor.
LORA_R, LORA_ALPHA = 32, 64
# Per-device batch size and the number of batches accumulated per optimiser step.
BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS = 4, 4
LEARNING_RATE = 2e-4
NUM_EPOCHS = 2
# Every training example is padded or cut to exactly this many tokens.
MAX_SEQ_LENGTH = 200

# ==========================================
#           1. DATA PREPARATION
# ==========================================
# Load the labelled posts. Missing messages become "" so the prompt template
# always receives a string.
train_df = pd.read_csv(TRAIN_DATA_PATH)
train_df['Original_Message'] = train_df['Original_Message'].fillna("").astype(str)

# Map labels to 0 and 1
label_map = {'NON_EXTREMIST': '0', 'NON-EXTREMIST': '0', 'EXTREMIST': '1'}
# Normalise case and surrounding whitespace before the lookup so cosmetic
# variants of a known label still map.
normalized_labels = train_df['Extremism_Label'].astype(str).str.strip().str.upper()
train_df['target'] = normalized_labels.map(label_map)

# A label missing from label_map maps to NaN. The masking function treats every
# target other than '0' as class '1', so an unmapped label would be trained as
# EXTREMIST without any warning. Fail loudly instead.
unmapped_rows = train_df['target'].isna()
if unmapped_rows.any():
    unknown_values = sorted(normalized_labels[unmapped_rows].unique().tolist())
    raise ValueError(
        f"{int(unmapped_rows.sum())} training rows have labels outside label_map: {unknown_values}"
    )

train_dataset = Dataset.from_pandas(train_df[['Original_Message', 'target']])

# ==========================================
#           2. TOKENIZER SETUP
# ==========================================
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, trust_remote_code=True)
# Padding reuses the EOS token; sequences are padded on the right for training.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# --- CRITICAL: IDENTIFY EXACT TOKEN IDS ---
# These two ids are the only supervised targets during training and the only
# logits read at inference.
token_0_id = tokenizer.convert_tokens_to_ids("0")
token_1_id = tokenizer.convert_tokens_to_ids("1")

# Guard against a vocabulary where "0"/"1" are not standalone tokens: a missing
# id (None) or both labels resolving to the same id (e.g. an unknown-token id)
# would make the two classes impossible to tell apart.
if token_0_id is None or token_1_id is None or token_0_id == token_1_id:
    raise ValueError(
        f"Tokenizer has no distinct single tokens for '0' and '1' "
        f"(got {token_0_id!r} and {token_1_id!r})"
    )

print(f"Token ID for '0': {token_0_id}")
print(f"Token ID for '1': {token_1_id}")

# ==========================================
#      3. STRICT MASKING FUNCTION
# ==========================================
def format_and_strict_mask(example):
    """Build one fixed-length training example that supervises only the label token.

    The sequence is ``prompt + label token + EOS``, padded to ``MAX_SEQ_LENGTH``.
    Every position except the label token carries ``-100`` in ``labels`` so it
    is excluded from the loss.

    When the prompt is too long, the *post text* is shortened rather than the
    tail of the sequence. Cutting the tail would drop the
    "### Assistant:\\nLabel: " suffix and the label token itself, leaving an
    example with no supervised position at all.

    Args:
        example: Mapping with ``'Original_Message'`` (the post text) and
            ``'target'`` (``'0'`` or ``'1'``).

    Returns:
        Dict with ``input_ids``, ``labels`` and ``attention_mask``, each a list
        of exactly ``MAX_SEQ_LENGTH`` ints.

    Raises:
        ValueError: If ``target`` is neither ``'0'`` nor ``'1'``, or if
            ``MAX_SEQ_LENGTH`` cannot hold the prompt template plus label and EOS.
    """
    prefix = "### System:\nClassify the following post as '0' (Non-Extremist) or '1' (Extremist).\n\n### Human:\n"
    suffix = "\n\n### Assistant:\nLabel: "
    message = str(example['Original_Message'])

    target = example['target']
    if target not in ('0', '1'):
        # Previously anything other than '0' was silently trained as class '1'.
        raise ValueError(f"Unexpected target {target!r}; expected '0' or '1'")
    target_id = token_0_id if target == '0' else token_1_id

    # Two slots are reserved for the label token and the EOS token.
    max_prompt_len = MAX_SEQ_LENGTH - 2

    prompt_ids = tokenizer.encode(prefix + message + suffix, add_special_tokens=True)
    if len(prompt_ids) > max_prompt_len:
        # Re-assemble from separately encoded pieces so that only the post is
        # shortened and the answer suffix is always intact.
        # NOTE(review): assumes the tokenizer adds no trailing special tokens
        # when encoding the prefix — confirm for the model in use.
        prefix_ids = tokenizer.encode(prefix, add_special_tokens=True)
        suffix_ids = tokenizer.encode(suffix, add_special_tokens=False)
        message_ids = tokenizer.encode(message, add_special_tokens=False)
        budget = max_prompt_len - len(prefix_ids) - len(suffix_ids)
        if budget < 0:
            raise ValueError(
                f"MAX_SEQ_LENGTH={MAX_SEQ_LENGTH} is too small for the prompt template"
            )
        prompt_ids = prefix_ids + message_ids[:budget] + suffix_ids

    input_ids = prompt_ids + [target_id, tokenizer.eos_token_id]
    # Loss is computed on the label token only; prompt and EOS are masked out.
    labels = [-100] * len(prompt_ids) + [target_id, -100]

    pad_len = MAX_SEQ_LENGTH - len(input_ids)
    attention_mask = [1] * len(input_ids) + [0] * pad_len
    input_ids += [tokenizer.pad_token_id] * pad_len
    labels += [-100] * pad_len

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask
    }

# Tokenise every row and drop the original columns so that only the
# model inputs produced by the masking function remain.
tokenized_train = train_dataset.map(
    format_and_strict_mask,
    remove_columns=train_dataset.column_names,
)

# ==========================================
#           4. MODEL & TRAINING
# ==========================================
# 4-bit NF4 quantisation with double quantisation and fp16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention and MLP projection layers listed below.
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)
model = get_peft_model(model, lora_config)

def data_collator(features):
    """Stack a list of equally sized feature dicts into int64 batch tensors.

    Args:
        features: List of dicts, each holding ``input_ids``,
            ``attention_mask`` and ``labels`` as equal-length lists of ints.

    Returns:
        Dict with the same three keys, each a ``torch.long`` tensor whose
        first dimension indexes the examples.
    """
    return {
        field: torch.tensor([item[field] for item in features], dtype=torch.long)
        for field in ("input_ids", "attention_mask", "labels")
    }

# Training schedule: fp16, a log line every 10 steps, no intermediate
# checkpoints, and no external experiment tracker.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    save_strategy="no",
    report_to="none",  # keeps Trainer from reporting to W&B
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)

print("Starting STRICT Binary Training...")
trainer.train()

# Write the trained model and the tokenizer into the same folder so the
# inference cell can load both from OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Training Complete. Proceed to Inference.")

2025-12-18 12:21:04.533170: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766060464.916037      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766060465.024534      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Token ID for '0': 15
Token ID for '1': 16


Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting STRICT Binary Training...


Step,Training Loss
10,1.2808
20,0.5101
30,0.4455
40,0.5442
50,0.7772
60,0.6312
70,0.43
80,0.5192
90,0.477
100,0.4581


Training Complete. Proceed to Inference.


In [3]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from tqdm import tqdm

# PATHS
BASE_MODEL_PATH = "/kaggle/input/qwen2.5/transformers/7b-instruct/1" 
ADAPTER_PATH = "./qwen-binary-strict-v2"
TEST_DATA_PATH = "/kaggle/input/social-media-extremism-detection-challenge/test.csv"

# LOAD
print("Loading Model...")
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)
# Left padding puts the final prompt token in the last position of every row,
# which is where the prediction logits are read.
tokenizer.padding_side = "left" 

# GET IDs AGAIN (Must match training)
token_0_id = tokenizer.convert_tokens_to_ids("0")
token_1_id = tokenizer.convert_tokens_to_ids("1")
print(f"Constraint IDs -> 0: {token_0_id}, 1: {token_1_id}")

# The adapter was trained on a base model quantised with NF4 + double
# quantisation. Without bnb_4bit_quant_type the library default ("fp4") is used,
# which dequantises to different base weights than the adapter was fitted
# against. Use the same quantisation settings as the training cell.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    ),
    device_map="auto"
)
model = PeftModel.from_pretrained(model, ADAPTER_PATH)
model.eval()

# PREDICTION LOOP
# Number of posts scored per forward pass, and the running list of
# per-post probabilities that the inference loop appends to.
BATCH_SIZE = 8
all_probs = []

# Read the test set; missing messages become empty strings and every
# message is coerced to str.
test_df = pd.read_csv(TEST_DATA_PATH).assign(
    Original_Message=lambda frame: frame['Original_Message'].fillna("").astype(str)
)

def get_constrained_probs(texts, max_length=200):
    """Return P(label == '1') for each post, restricted to the tokens '0' and '1'.

    Each post is wrapped in the same prompt template used for training. The
    logits of the last position are read for the two label tokens only, and a
    two-way softmax turns them into a probability.

    Posts that would exceed ``max_length`` tokens have their *text* shortened.
    Truncating the finished prompt on the right would remove the
    "### Assistant:\\nLabel: " suffix, so the logits would be read in the
    middle of the post instead of at the answer position.

    Args:
        texts: List of post strings.
        max_length: Maximum number of prompt tokens per post.

    Returns:
        List of floats, one per input text, each the probability of '1'.
    """
    if not texts:
        return []

    # Same prompt structure as training
    prefix = "### System:\nClassify the following post as '0' (Non-Extremist) or '1' (Extremist).\n\n### Human:\n"
    suffix = "\n\n### Assistant:\nLabel: "

    # NOTE(review): assumes the tokenizer adds no trailing special tokens when
    # encoding the prefix — confirm for the model in use.
    prefix_ids = tokenizer.encode(prefix, add_special_tokens=True)
    suffix_ids = tokenizer.encode(suffix, add_special_tokens=False)
    text_budget = max(max_length - len(prefix_ids) - len(suffix_ids), 0)

    encoded = []
    for text in texts:
        text = str(text)
        ids = tokenizer.encode(prefix + text + suffix, add_special_tokens=True)
        if len(ids) > max_length:
            # Shorten only the post so the answer suffix stays at the end.
            text_ids = tokenizer.encode(text, add_special_tokens=False)[:text_budget]
            ids = prefix_ids + text_ids + suffix_ids
        encoded.append(ids)

    # Pads according to tokenizer.padding_side. Reading position -1 below is
    # only correct with left padding, which the caller is expected to have set.
    inputs = tokenizer.pad(
        {"input_ids": encoded}, padding=True, return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Grab logits of the last token for the two label ids only.
    last_token_logits = logits[:, -1, :]
    pair_logits = torch.stack(
        [last_token_logits[:, token_0_id], last_token_logits[:, token_1_id]],
        dim=-1,
    ).float()

    # Softmax over ONLY these two options. torch.softmax avoids the overflow
    # that a hand-written exp(a) / (exp(a) + exp(b)) hits on large logits.
    prob_1 = torch.softmax(pair_logits, dim=-1)[:, 1]
    return prob_1.cpu().tolist()

print("Running Constrained Inference...")
# Materialise the message list once instead of rebuilding it on every batch.
messages = test_df['Original_Message'].tolist()
for start in tqdm(range(0, len(messages), BATCH_SIZE)):
    all_probs.extend(get_constrained_probs(messages[start:start + BATCH_SIZE]))

# Decision threshold fixed at 0.5.
# NOTE(review): this presumes roughly balanced training classes — confirm
# against the label distribution of the training data.
predictions = ["EXTREMIST" if p > 0.5 else "NON_EXTREMIST" for p in all_probs]

submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Extremism_Prob': all_probs,
    'Extremism_Label': predictions
})
# Only the ID and the hard label are written; the probability column stays in
# memory for inspection.
submission[['ID', 'Extremism_Label']].to_csv('submission_strict_v2.csv', index=False)
print("Saved submission_strict_v2.csv")

Loading Model...
Constraint IDs -> 0: 15, 1: 16


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Running Constrained Inference...


100%|██████████| 94/94 [01:17<00:00,  1.21it/s]

Saved submission_strict_v2.csv



