In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate trl datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━

In [2]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from tqdm import tqdm

# ==========================================
# 1. CONFIGURATION & PATH VERIFICATION
# ==========================================
# UPDATE THIS: the directory must contain 'config.json' plus the model weights
# (either a single 'model.safetensors' or sharded 'model-XXXXX-of-XXXXX.safetensors'
# files together with 'model.safetensors.index.json').
BASE_MODEL_PATH = "/kaggle/input/qwen2.5/transformers/7b-instruct/1"
ADAPTER_PATH = "/kaggle/input/finetuned-qwen/qwen-binary-strict-v2"
TEST_DATA_PATH = "/kaggle/input/social-media-extremism-detection-challenge/test.csv"
TRAIN_DATA_PATH = "/kaggle/input/social-media-extremism-detection-challenge/train.csv"
EXTERNAL_DATA_PATH = "/kaggle/input/digital-extremism-detection-curated-dataset/extremism_data_final.csv"

# --- DEBUG: VERIFY PATHS EXIST ---
# Both directories are required further down (the adapter is loaded
# unconditionally), so fail fast here instead of after the base model has
# already been loaded.
print(f"Checking Base Model Path: {BASE_MODEL_PATH}")
if not os.path.isdir(BASE_MODEL_PATH):
    raise FileNotFoundError(f"Base model path does not exist: {BASE_MODEL_PATH}")
print("Files found:", os.listdir(BASE_MODEL_PATH)[:5])  # Print first 5 files

print(f"Checking Adapter Path: {ADAPTER_PATH}")
if not os.path.isdir(ADAPTER_PATH):
    raise FileNotFoundError(
        f"Adapter path not found: {ADAPTER_PATH}. "
        "Please check where your finetuned model is saved."
    )
# ---------------------------------

# ==========================================
# 2. LOAD & PREPARE DATA
# ==========================================
print("Loading Data...")
test_df, train_df, ext_df = (
    pd.read_csv(csv_path)
    for csv_path in (TEST_DATA_PATH, TRAIN_DATA_PATH, EXTERNAL_DATA_PATH)
)

# Add a normalised 'clean_text' column to every frame: missing messages become
# the empty string and surrounding whitespace is stripped, so that identical
# posts compare equal across the three datasets.
for frame in (test_df, train_df, ext_df):
    messages = frame['Original_Message'].fillna("").astype(str)
    frame['clean_text'] = messages.str.strip()

# Lookup tables: cleaned text -> label.
# When the same cleaned text occurs more than once, the last row wins.
# Priority 1: External Data
ext_lookup = {
    message: label
    for message, label in zip(ext_df['clean_text'], ext_df['Extremism_Label'])
}

# Priority 2: Train Data
train_lookup = {
    message: label
    for message, label in zip(train_df['clean_text'], train_df['Extremism_Label'])
}

# ==========================================
# 3. LOAD QWEN MODEL (Fixed)
# ==========================================
print("Loading Qwen Model...")

# Load Tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, local_files_only=True)
except Exception as exc:
    # Catch Exception (not a bare `except:`) so KeyboardInterrupt/SystemExit
    # still propagate, and report why the first attempt failed before retrying.
    print(f"Tokenizer load with local_files_only=True failed ({exc!r}); retrying with trust_remote_code=True")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, trust_remote_code=True)

# Left padding keeps the final prompt token at position -1 for every row of a
# padded batch, which is where the inference code reads the logits from.
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Define Token IDs for '0' and '1'
token_0_id = tokenizer.convert_tokens_to_ids("0")
token_1_id = tokenizer.convert_tokens_to_ids("1")
print(f"Token IDs -> 0: {token_0_id}, 1: {token_1_id}")
# Guard against a vocabulary where the digits are not single tokens: the lookup
# would then yield None or the same fallback id for both — NOTE(review): exact
# fallback depends on the tokenizer; the check covers both cases.
if token_0_id is None or token_1_id is None or token_0_id == token_1_id:
    raise ValueError(
        f"Could not resolve distinct token ids for '0' and '1' (got {token_0_id}, {token_1_id})"
    )

# Load Base Model (4-bit NF4 quantisation, fp16 compute)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",
    local_files_only=True, # <--- CRITICAL FIX: Forces local load
    trust_remote_code=True
)

# Load Adapter on top of the quantised base model and switch to inference mode
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
model.eval()

# ==========================================
# 4. QWEN INFERENCE FUNCTION
# ==========================================
def predict_qwen_batch(texts, max_length=256, threshold=0.5):
    """Classify a batch of posts by comparing the next-token logits of '0' and '1'.

    Each post is wrapped in the System/Human/Assistant prompt template, the
    batch is run through the module-level ``model`` in a single forward pass,
    and the logits at the last position are restricted to the two label
    tokens (``token_0_id`` / ``token_1_id``).

    Args:
        texts: List of post strings to classify.
        max_length: Approximate token budget for one full prompt. Only the
            post body is shortened to fit; the template is never cut.
        threshold: A post is labelled "EXTREMIST" when the two-way softmax
            probability of token '1' is strictly greater than this value.

    Returns:
        A list with one label per input text, each either "EXTREMIST" or
        "NON_EXTREMIST". An empty input yields an empty list.
    """
    if len(texts) == 0:
        return []

    prompt_head = "### System:\nClassify the following post as '0' (Non-Extremist) or '1' (Extremist).\n\n### Human:\n"
    prompt_tail = "\n\n### Assistant:\nLabel: "

    # Truncating the whole prompt would cut tokens from one end of it; with
    # right-side truncation that removes the trailing "Label: " cue, so the
    # last-position logits would no longer predict the label. Instead, reserve
    # room for the template and shorten only the post body.
    template_len = len(tokenizer(prompt_head + prompt_tail)["input_ids"])
    text_budget = max(max_length - template_len, 1)

    prompts = []
    for t in texts:
        text_ids = tokenizer(t, add_special_tokens=False)["input_ids"]
        if len(text_ids) > text_budget:
            # Only over-long posts are decoded back from token ids; short posts
            # are passed through unchanged.
            t = tokenizer.decode(text_ids[:text_budget])
        prompts.append(f"{prompt_head}{t}{prompt_tail}")

    # Tokenize (no truncation here: the posts were already shortened above, so
    # prompts can exceed max_length only by a few tokens of re-tokenisation drift)
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # The tokenizer is configured for left padding, so index -1 is the final
    # real prompt token of every row.
    last_token_logits = logits[:, -1, :]
    pair_logits = last_token_logits[:, [token_0_id, token_1_id]].float()

    # Softmax between just 0 and 1, computed in float32 by torch so that large
    # logits cannot overflow the exponentials.
    prob_1 = torch.softmax(pair_logits, dim=-1)[:, 1]

    return ["EXTREMIST" if p > threshold else "NON_EXTREMIST" for p in prob_1.tolist()]

# ==========================================
# 5. RUN PIPELINE
# ==========================================
final_labels = []
sources = []

batch_size = 8
unknown_indices = []  # Positions in final_labels that still need a Qwen prediction
unknown_texts = []

print("Checking Lookups...")
# Enumerate positions instead of using the DataFrame index labels: final_labels
# is a plain list, so placeholders must be addressed by position. Index labels
# only coincide with positions for a default RangeIndex.
for pos, text in enumerate(test_df['clean_text']):
    # HIERARCHY: External > Train > Qwen
    if text in ext_lookup:
        final_labels.append(ext_lookup[text])
        sources.append("External_Leak")
    elif text in train_lookup:
        final_labels.append(train_lookup[text])
        sources.append("Train_Leak")
    else:
        final_labels.append(None)  # Placeholder, filled in after inference
        sources.append("Qwen_Model")
        unknown_indices.append(pos)
        unknown_texts.append(text)

print(f"External Matches: {sources.count('External_Leak')}")
print(f"Train Matches: {sources.count('Train_Leak')}")
print(f"Need Qwen Prediction: {len(unknown_indices)}")

# Run Qwen on the missing ones
if len(unknown_texts) > 0:
    print(f"Running Qwen on {len(unknown_texts)} samples...")
    qwen_results = []

    for start in tqdm(range(0, len(unknown_texts), batch_size)):
        batch = unknown_texts[start:start + batch_size]
        try:
            preds = predict_qwen_batch(batch)
            qwen_results.extend(preds)
        except Exception as e:
            print(f"Batch Error: {e}")
            # Deliberate best-effort: fall back to Non-Extremist for the whole
            # batch if inference fails (e.g. out of memory) so the run completes.
            qwen_results.extend(["NON_EXTREMIST"] * len(batch))

    # Every unknown text must have received exactly one prediction, otherwise
    # the placeholders below would be filled with misaligned labels.
    if len(qwen_results) != len(unknown_indices):
        raise RuntimeError(
            f"Expected {len(unknown_indices)} Qwen predictions, got {len(qwen_results)}"
        )

    # Fill in the placeholders
    for pos, label in zip(unknown_indices, qwen_results):
        final_labels[pos] = label

# ==========================================
# 6. SAVE
# ==========================================
# Build the two-column submission frame: the test IDs next to the label chosen
# for each row (lookup match or Qwen prediction), then write it without the index.
submission = test_df[['ID']].copy()
submission['Extremism_Label'] = final_labels
submission.to_csv('submission_ultimate.csv', index=False)

print("\nSaved 'submission_ultimate.csv'.")
print("Distribution:")
label_counts = submission['Extremism_Label'].value_counts()
print(label_counts)

2025-12-31 20:45:19.276782: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767213919.472954      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767213919.530181      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Checking Base Model Path: /kaggle/input/qwen2.5/transformers/7b-instruct/1
Files found: ['model.safetensors.index.json', 'model-00003-of-00004.safetensors', 'config.json', 'merges.txt', 'LICENSE']
Checking Adapter Path: /kaggle/input/finetuned-qwen/qwen-binary-strict-v2
Loading Data...
Loading Qwen Model...
Token IDs -> 0: 15, 1: 16


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Checking Lookups...
External Matches: 526
Train Matches: 0
Need Qwen Prediction: 224
Running Qwen on 224 samples...


100%|██████████| 28/28 [00:16<00:00,  1.70it/s]


Saved 'submission_ultimate.csv'.
Distribution:
Extremism_Label
NON_EXTREMIST    551
EXTREMIST        199
Name: count, dtype: int64



