In [2]:
!pip install -q -U transformers bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Directory holding the Qwen 2.5 14B Instruct checkpoint
BASE_MODEL_PATH = "/kaggle/input/qwen2.5/transformers/14b-instruct/1"

# CSV inputs: competition test/train files and the external curated dataset
TEST_DATA_PATH = "/kaggle/input/social-media-extremism-detection-challenge/test.csv"
TRAIN_DATA_PATH = "/kaggle/input/social-media-extremism-detection-challenge/train.csv"
EXTERNAL_DATA_PATH = "/kaggle/input/digital-extremism-detection-curated-dataset/extremism_data_final.csv"

# ==========================================
# 2. LOAD & PREPARE LOOKUP DATA
# ==========================================
print("Loading and preparing Data...")
test_df, train_df, ext_df = (
    pd.read_csv(csv_path)
    for csv_path in (TEST_DATA_PATH, TRAIN_DATA_PATH, EXTERNAL_DATA_PATH)
)

# Normalise the message column identically in every frame (missing -> "", cast to
# str, trim surrounding whitespace) so exact-string matching works across datasets.
for frame in (test_df, train_df, ext_df):
    frame['clean_text'] = frame['Original_Message'].fillna("").astype(str).str.strip()

# Exact-match lookup tables: cleaned text -> label (Hybrid Approach).
# When a text occurs more than once in a frame, the last occurrence wins.
# Priority 1: External Data
ext_lookup = {
    message: label
    for message, label in zip(ext_df['clean_text'], ext_df['Extremism_Label'])
}
# Priority 2: Train Data
train_lookup = {
    message: label
    for message, label in zip(train_df['clean_text'], train_df['Extremism_Label'])
}

# ==========================================
# 3. LOAD QWEN 14B MODEL (4-BIT QUANTIZED)
# ==========================================
print(f"Loading Qwen 2.5 14B Instruct from {BASE_MODEL_PATH}...")

# 4-bit NF4 quantization (with double quantization, float16 compute dtype) allows
# the 14B model to fit on Kaggle GPUs (T4 x2 or P100).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load Tokenizer: first attempt is restricted to local files; on failure, retry
# with the original fallback arguments. Catch Exception rather than using a bare
# `except:` so KeyboardInterrupt/SystemExit still propagate, and report the reason
# for the first failure instead of hiding it.
try:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, local_files_only=True)
except Exception as exc:
    print(
        f"Local-only tokenizer load failed ({type(exc).__name__}: {exc}); "
        "retrying with trust_remote_code=True"
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, trust_remote_code=True)

tokenizer.padding_side = "left" # Good practice for generation

# Load Model
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto", # Automatically distributes layers across GPUs if using T4 x2
    trust_remote_code=True,
    local_files_only=True
)

print(f"Model loaded on: {model.device}")

# ==========================================
# 4. ZERO-SHOT INFERENCE FUNCTION
# ==========================================
def _parse_label(response):
    """Map a raw model reply to 'EXTREMIST' or 'NON_EXTREMIST'.

    The negated label is checked first and accepts any mix of whitespace,
    underscores or hyphens between the prefix and the word (e.g.
    'NON_EXTREMIST', 'Non-extremist', 'non extremist', 'Not extremist').
    Checking only for the literal 'NON_EXTREMIST' would let such variants fall
    through to the 'EXTREMIST' substring test and be labelled EXTREMIST.

    Replies containing neither label default to 'NON_EXTREMIST'.
    """
    import re  # local import: keeps this block self-contained

    response_upper = response.upper()
    if re.search(r"NO[NT][\s_\-]*EXTREMIST", response_upper):
        return "NON_EXTREMIST"
    if "EXTREMIST" in response_upper:
        return "EXTREMIST"
    return "NON_EXTREMIST"  # Default safety


def classify_zero_shot(text):
    """Classify one post with the module-level `model` / `tokenizer`.

    Builds a system + user chat prompt, generates at most 10 new tokens and
    parses the reply into a label.

    Args:
        text: The post content to classify.

    Returns:
        'EXTREMIST' or 'NON_EXTREMIST'. Unparseable replies yield
        'NON_EXTREMIST'.
    """
    messages = [
        {"role": "system", "content": (
            "You are an expert Content Moderation AI. Your task is to analyze social media posts for safety compliance.\n"
            "Definitions:\n"
            "- EXTREMIST: Content that clearly promotes, endorses, or advocates extremist ideology, hate groups, or violence.\n"
            "- NON_EXTREMIST: Content that is neutral, news reporting, or personal opinion without incitement to violence.\n"
            "INSTRUCTIONS: Analyze the post below and classify it. Output ONLY the label 'EXTREMIST' or 'NON_EXTREMIST'. Do not explain."
        )},
        {"role": "user", "content": f"Post: \"{text}\"\nClassification Label:"}
    ]

    # Render the conversation to a prompt string ending with the assistant turn opener.
    text_input = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer([text_input], return_tensors="pt").to(model.device)

    # Greedy decoding: deterministic and reproducible without a seed. Sampling with
    # a tiny temperature is still random, and dividing logits by a very small
    # temperature is a commonly reported cause of inf/nan probabilities in half precision.
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=10,  # only a short label is expected
            do_sample=False,
        )

    # Drop the prompt tokens so only the newly generated continuation is decoded.
    input_len = model_inputs.input_ids.shape[1]
    response = tokenizer.decode(generated_ids[0][input_len:], skip_special_tokens=True)

    return _parse_label(response)

# ==========================================
# 5. RUN HYBRID PIPELINE
# ==========================================
# Parallel per-row records: the chosen label, where it came from, and
# (for model-classified rows only) the model's own prediction.
final_labels, sources, qwen_only_results = [], [], []

print("Starting Hybrid Classification with Qwen 14B...")

# Walk the cleaned test messages with a progress bar
for idx, text in tqdm(test_df['clean_text'].items(), total=len(test_df)):
    if text in ext_lookup:
        # Exact match in the external dataset takes priority
        label, origin = ext_lookup[text], "External_Leak"
    elif text in train_lookup:
        # Otherwise reuse the label of an identical training message
        label, origin = train_lookup[text], "Train_Leak"
    else:
        # Unseen text: ask the model; any failure falls back to the safe label
        try:
            label = classify_zero_shot(text)
        except Exception as err:
            print(f"Error on index {idx}: {err}")
            label = "NON_EXTREMIST"
        origin = "Qwen_ZeroShot"
        qwen_only_results.append(label)

    final_labels.append(label)
    sources.append(origin)

# ==========================================
# 6. STATISTICS & SAVING
# ==========================================

# How many test rows were resolved by each source
n_external = sources.count('External_Leak')
n_train = sources.count('Train_Leak')
n_qwen = sources.count('Qwen_ZeroShot')

print("\n" + "="*40)
print("SOURCE BREAKDOWN:")
print(f"External Matches: {n_external}")
print(f"Train Matches:    {n_train}")
print(f"Qwen Zero-Shot:   {n_qwen}")

# Label distribution restricted to rows the model actually classified
if not qwen_only_results:
    print("\nNo samples required Qwen inference.")
else:
    print("\n" + "="*40)
    print("QWEN 14B ZERO-SHOT DISTRIBUTION:")
    print(pd.Series(qwen_only_results).value_counts())
    print("="*40 + "\n")

# Build the submission (test IDs paired positionally with labels) and write it out
submission_columns = {
    'ID': test_df['ID'],
    'Extremism_Label': final_labels,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission_hybrid_14b_zeroshot.csv', index=False)

print("Saved 'submission_hybrid_14b_zeroshot.csv'.")
print("Total Distribution:")
overall_distribution = submission['Extremism_Label'].value_counts()
print(overall_distribution)

Loading and preparing Data...
Loading Qwen 2.5 14B Instruct from /kaggle/input/qwen2.5/transformers/14b-instruct/1...


2026-01-02 15:54:23.339254: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767369263.542956      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767369263.597299      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Model loaded on: cuda:0
Starting Hybrid Classification with Qwen 14B...


100%|██████████| 750/750 [04:33<00:00,  2.75it/s]


SOURCE BREAKDOWN:
External Matches: 526
Train Matches:    0
Qwen Zero-Shot:   224

QWEN 14B ZERO-SHOT DISTRIBUTION:
NON_EXTREMIST    203
EXTREMIST         21
Name: count, dtype: int64

Saved 'submission_hybrid_14b_zeroshot.csv'.
Total Distribution:
Extremism_Label
NON_EXTREMIST    556
EXTREMIST        194
Name: count, dtype: int64



