In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModel
from sklearn.preprocessing import StandardScaler
import os
import re
import sys
from tqdm.autonotebook import tqdm

# Handle safetensors import.
# The package is optional: .bin checkpoints are read with torch.load instead.
try:
    from safetensors.torch import load_file as safe_load_file
except ImportError:
    print("Warning: 'safetensors' library not found. Will attempt to use torch.load if .bin file exists.")

    def safe_load_file(*args, **kwargs):
        """Stand-in used when safetensors is not installed.

        Keeps the name bound so that code reaching a .safetensors checkpoint
        fails with an actionable ImportError instead of a bare NameError.
        """
        raise ImportError(
            "The 'safetensors' package is required to load .safetensors weights; "
            "install it or provide a .bin checkpoint instead."
        )

In [None]:
# ============================================================
# 1. CONFIGURATION
# ============================================================
class Config:
    """Static settings for Subtask 2a inference: paths, model and batching parameters.

    All values are class attributes evaluated once, when the class body runs.
    """

    # Resolve the project folder. `__file__` exists when this code runs as a
    # script; in an interactive kernel it is undefined, so fall back to the
    # current directory (descending into a "Subtask2a" child folder if present).
    try:
        BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        BASE_DIR = os.getcwd()
        if "Subtask2a" not in BASE_DIR and os.path.exists(os.path.join(BASE_DIR, "Subtask2a")):
            BASE_DIR = os.path.join(BASE_DIR, "Subtask2a")

    print(f"Working Directory: {BASE_DIR}")

    WEIGHTS_DIR = os.path.join(BASE_DIR, "weights")
    DATA_DIR = os.path.join(BASE_DIR, "data")

    # File Output
    OUTPUT_FILE = os.path.join(BASE_DIR, "submission.csv")

    # --- MODEL PARAMETERS ---
    base_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    window_size = 8          # number of consecutive texts joined into one model input
    max_seq_length = 512     # tokenizer truncation / padding length
    batch_size = 32
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # MoE Config
    num_experts = 4
    # Number of experts mixed per sample. predict() reads Config.top_k, which
    # was previously undefined; 2 matches the default of the model classes.
    top_k = 2

Working Directory: /Users/theson/Documents/SEMEVAL2026FT/Subtask2a


In [None]:
# ============================================================
# 2. MODEL DEFINITION 
# ============================================================
class AttentionPooling(nn.Module):
    """Reduce a sequence of hidden states to one vector via learned attention scores."""

    def __init__(self, in_dim):
        super().__init__()
        # Small scoring network: one scalar score per position.
        scorer_layers = [
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-weighted sum of `last_hidden_state` over dim 1.

        Positions where `attention_mask == 0` get a score of -1e4 before the
        softmax, so their weight is effectively zero.
        Presumably `last_hidden_state` is (batch, seq_len, in_dim) and
        `attention_mask` is (batch, seq_len) -- confirm against the caller.
        """
        scores = self.attention(last_hidden_state).squeeze(-1)
        is_padding = attention_mask == 0
        scores = scores.masked_fill(is_padding, -1e4)
        token_weights = torch.softmax(scores, dim=1)
        return (token_weights.unsqueeze(-1) * last_hidden_state).sum(dim=1)

class Expert(nn.Module):
    """Feed-forward expert: Linear -> LayerNorm -> GELU -> Dropout -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim, dropout_prob=0.2):
        super().__init__()
        stages = (
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, output_dim),
        )
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the expert network to `x` and return its output."""
        return self.net(x)

class SparseMoELayer(nn.Module):
    """Mixture-of-experts head that blends the `top_k` highest-gated experts.

    Every expert is evaluated on every sample; the gate then selects, per
    sample, the `top_k` experts with the largest softmax probability and
    returns their outputs combined with the renormalised gate weights.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_experts=4, top_k=2):
        super().__init__()
        if top_k > num_experts:
            raise ValueError(
                f"top_k ({top_k}) cannot exceed num_experts ({num_experts})"
            )
        self.num_experts = num_experts
        self.top_k = top_k
        self.gate = nn.Linear(input_dim, num_experts)
        self.experts = nn.ModuleList([Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)])

    def forward(self, x):
        """Return the gated mixture for `x` of shape (batch, input_dim).

        The result has shape (batch, output_dim) and the dtype/device of the
        expert outputs.
        """
        gate_logits = self.gate(x)
        gate_probs = F.softmax(gate_logits, dim=-1)
        topk_weights, topk_indices = torch.topk(gate_probs, self.top_k, dim=-1)
        # Renormalise so the selected experts' weights sum to 1 per sample.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

        # (batch, num_experts, output_dim)
        all_expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)

        # Pick the chosen experts' outputs in one gather: (batch, top_k, output_dim).
        # The output width comes from the experts themselves rather than being
        # assumed to be 1, so the layer works for any output_dim.
        gather_index = topk_indices.unsqueeze(-1).expand(-1, -1, all_expert_outputs.size(-1))
        selected = all_expert_outputs.gather(1, gather_index)
        return (topk_weights.unsqueeze(-1) * selected).sum(dim=1)

class Subtask2aModel(nn.Module):
    """Pretrained text backbone + attention pooling + two MoE heads.

    The pooled text embedding is concatenated with the numerical features and
    fed to one `SparseMoELayer` for valence and another for arousal.
    """

    def __init__(self, model_name, num_experts=4, top_k=2):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.backbone = AutoModel.from_pretrained(model_name)

        hidden_size = self.config.hidden_size
        self.pooler = AttentionPooling(hidden_size)

        # Heads take the pooled embedding plus 2 extra inputs, i.e. the
        # numerical features concatenated in forward().
        head_input_dim = hidden_size + 2
        self.valence_moe = SparseMoELayer(head_input_dim, 256, 1, num_experts, top_k)
        self.arousal_moe = SparseMoELayer(head_input_dim, 256, 1, num_experts, top_k)

    def forward(self, input_ids, attention_mask, numerical_features):
        """Return `(valence_output, arousal_output)` for one batch."""
        backbone_out = self.backbone(input_ids, attention_mask)
        pooled_text = self.pooler(backbone_out.last_hidden_state, attention_mask)
        fused = torch.cat((pooled_text, numerical_features), dim=1)
        valence_out = self.valence_moe(fused)
        arousal_out = self.arousal_moe(fused)
        return valence_out, arousal_out

# ============================================================
# 3. DATA PROCESSING
# ============================================================
def fix_spacing(text):
    """Return `text` as a string with normalised spacing.

    Applied in order: collapse whitespace runs to a single space, remove the
    whitespace around an apostrophe that has whitespace on both sides, remove
    whitespace before a period, then strip both ends.
    """
    substitutions = (
        (r"\s+", " "),
        (r"\s+'\s+", "'"),
        (r"\s+\.", "."),
    )
    cleaned = str(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def process_dataframe_for_inference(df, is_train=False, window_size=None):
    """Convert a raw per-entry frame into rows for scaler fitting or inference.

    The input frame is never modified; all work happens on a copy.

    Args:
        df: DataFrame with at least `user_id`, `valence`, `arousal` (and `text`
            when `is_train` is False). If a `timestamp` column exists, rows are
            ordered by (`user_id`, `timestamp`) first.
        is_train: When True, return one row per input entry holding only
            `numerical_features` = [valence, arousal] (used to fit the scaler).
            When False, return one row per prediction target with `user_id`,
            `input_text` and `numerical_features`.
        window_size: Number of consecutive texts (ending at the target entry)
            joined with " </s> " into `input_text`. Defaults to
            `Config.window_size`.

    Target selection per user when `is_train` is False:
        * with an `is_forecasting_user` column and a `state_change_valence`
          column: flagged rows whose `state_change_valence` is missing, or all
          flagged rows if there are none;
        * with only `is_forecasting_user`: the last flagged row, if any;
        * otherwise: the user's last row.

    Returns:
        A DataFrame with the columns described above (present even when empty).

    Raises:
        KeyError: if a required column is missing.
    """
    required = ['user_id', 'valence', 'arousal'] + ([] if is_train else ['text'])
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"process_dataframe_for_inference: missing required column(s) {missing}")

    # Copy first: the timestamp/text rewrites below must not leak into the caller's frame.
    df = df.copy()
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)

    if is_train:
        # Only the numeric pair is needed here, so text cleaning is skipped.
        records = [
            {'numerical_features': [v, a]}
            for _, group in df.groupby('user_id')
            for v, a in zip(group['valence'].values, group['arousal'].values)
        ]
        return pd.DataFrame(records, columns=['numerical_features'])

    if window_size is None:
        window_size = Config.window_size

    df['text'] = df['text'].apply(fix_spacing)
    has_forecasting_marker = 'is_forecasting_user' in df.columns
    has_state_change = 'state_change_valence' in df.columns

    records = []
    for uid, group in df.groupby('user_id'):
        texts = group['text'].values
        curr_v = group['valence'].values
        curr_a = group['arousal'].values

        if has_forecasting_marker:
            flags = group['is_forecasting_user']
            # Coerce to a real boolean array; a missing flag counts as False so
            # object/float columns containing NaN do not break the masking.
            is_true = np.fromiter(
                (bool(flag) if pd.notna(flag) else False for flag in flags),
                dtype=bool,
                count=len(flags),
            )
            if has_state_change:
                # isna() works for any dtype, unlike np.isnan on object arrays.
                is_nan = group['state_change_valence'].isna().to_numpy()
                target_indices = np.where(is_true & is_nan)[0].tolist()
                if not target_indices:
                    target_indices = np.where(is_true)[0].tolist()
            else:
                true_indices = np.where(is_true)[0]
                target_indices = [int(true_indices[-1])] if len(true_indices) > 0 else []
        else:
            target_indices = [len(texts) - 1]

        for idx in target_indices:
            # Up to `window_size` texts ending at (and including) the target entry.
            start = max(0, idx - window_size + 1)
            window_texts = [str(t) for t in texts[start:idx + 1]]
            records.append({
                'user_id': uid,
                'input_text': " </s> ".join(window_texts),
                'numerical_features': [curr_v[idx], curr_a[idx]]
            })

    return pd.DataFrame(records, columns=['user_id', 'input_text', 'numerical_features'])

class InferenceDataset(Dataset):
    """Dataset that tokenizes `input_text` on access and pairs it with numeric features.

    Reads the `input_text` and `numerical_features` columns of `df`; the
    numeric features are stored as a float32 array.
    """

    def __init__(self, df, tokenizer):
        self.texts = df['input_text'].values
        self.nums = np.array(df['numerical_features'].tolist(), dtype=np.float32)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and numeric features of sample `idx` as tensors."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=Config.max_seq_length,
            return_tensors="pt",
        )
        # return_tensors="pt" is requested for a single text; flatten so each
        # field is 1-D.
        sample = {field: encoded[field].flatten() for field in ("input_ids", "attention_mask")}
        sample['numerical_features'] = torch.tensor(self.nums[idx], dtype=torch.float)
        return sample

In [None]:
# ============================================================
# 4. MAIN INFERENCE (AUTO-DETECT NESTED FOLDER)
# ============================================================
_WEIGHT_SUFFIXES = (".safetensors", ".bin")


def _list_weight_files(folder):
    """Return weight-file names in `folder`: .safetensors first, then .bin, each alphabetical."""
    names = [f for f in os.listdir(folder) if f.endswith(_WEIGHT_SUFFIXES)]
    return sorted(names, key=lambda f: (not f.endswith(".safetensors"), f))


def _resolve_weights_dir(weights_dir):
    """Return the folder that holds the weights (`weights_dir` or a child), or None after printing an error."""
    if not os.path.isdir(weights_dir):
        print(f" ERROR: 'weights' folder missing at {weights_dir}")
        return None
    if _list_weight_files(weights_dir):
        return weights_dir

    subfolders = sorted(
        f for f in os.listdir(weights_dir) if os.path.isdir(os.path.join(weights_dir, f))
    )
    if not subfolders:
        print(" ERROR: Weights folder exists but contains no model files or subfolders!")
        return None

    # Prefer a subfolder that really contains weights over an arbitrary first
    # entry (which could be e.g. a hidden checkpoint folder).
    chosen = next(
        (s for s in subfolders if _list_weight_files(os.path.join(weights_dir, s))),
        subfolders[0],
    )
    print(f" Detected nested folder. Going into: {chosen}")
    return os.path.join(weights_dir, chosen)


def _prefer_csv(names):
    """Sort file names alphabetically with .csv files first, for a deterministic pick."""
    return sorted(names, key=lambda f: (not f.lower().endswith(".csv"), f))


def _load_state_dict(w_path):
    """Read a checkpoint from `w_path`; returns the state dict, or None after printing an error."""
    if w_path.endswith(".safetensors"):
        try:
            from safetensors.torch import load_file
        except ImportError:
            print("ERROR: 'safetensors' library is required to load .safetensors weights.")
            return None
        return load_file(w_path)
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only load .bin checkpoints from a trusted source.
    return torch.load(w_path, map_location="cpu")


def predict():
    """Run the full Subtask 2a inference pipeline and write the submission CSV.

    Steps: locate the weights folder, locate train/test files under
    `Config.DATA_DIR`, fit a StandardScaler on the training valence/arousal
    values, build windowed test inputs, load tokenizer and model weights, run
    batched inference, and save predictions to `Config.OUTPUT_FILE`.

    Problems with the environment (missing folders/files/weights) are reported
    with a printed message and an early return; nothing is raised for them.

    Returns:
        None.
    """
    print("="*60)
    print("STARTING INFERENCE SUBTASK 2A")
    print(f"Working Directory: {Config.BASE_DIR}")
    print("="*60)

    actual_weights_path = _resolve_weights_dir(Config.WEIGHTS_DIR)
    if actual_weights_path is None:
        return
    print(f"Target Weights Path: {actual_weights_path}")

    # Fail fast: check for weight files before the slower data/model steps.
    w_files = _list_weight_files(actual_weights_path)
    if not w_files:
        print(f"ERROR: No weights found in {actual_weights_path}")
        return

    # --- [STEP 1] LOAD DATA & FIT SCALER ---
    print(">>> [1/5] Looking for Data Files...")
    if not os.path.exists(Config.DATA_DIR):
        print(f" ERROR: 'data' folder missing at {Config.DATA_DIR}")
        return

    all_files = os.listdir(Config.DATA_DIR)
    train_files = _prefer_csv(f for f in all_files if "train" in f.lower())
    test_files = _prefer_csv(
        f for f in all_files
        if ("forecasting" in f.lower() or "test" in f.lower()) and "train" not in f.lower()
    )

    if not train_files:
        print("ERROR: No training CSV found in 'data/' folder! (Required to fit Scaler)")
        return
    if not test_files:
        print("ERROR: No test/forecasting CSV found in 'data/' folder!")
        return

    train_path = os.path.join(Config.DATA_DIR, train_files[0])
    test_path = os.path.join(Config.DATA_DIR, test_files[0])

    print(f"Found Train File: {train_files[0]}")
    print(f"Found Test File:  {test_files[0]}")

    print(">>> [2/5] Fitting Scaler...")
    df_train = pd.read_csv(train_path)
    train_proc = process_dataframe_for_inference(df_train, is_train=True)
    if train_proc.empty:
        print("ERROR: Training file produced no rows to fit the Scaler!")
        return

    scaler = StandardScaler()
    scaler.fit(np.array(train_proc['numerical_features'].tolist()))
    print("✅ Scaler fitted successfully!")

    print(">>> [3/5] Processing Test Data...")
    df_test = pd.read_csv(test_path)
    test_proc = process_dataframe_for_inference(df_test, is_train=False)
    if test_proc.empty:
        print("ERROR: Test file produced no rows to predict!")
        return
    test_nums_scaled = scaler.transform(np.array(test_proc['numerical_features'].tolist()))
    test_proc['numerical_features'] = test_nums_scaled.tolist()

    # --- [STEP 2] LOAD MODEL ---
    print(">>> [4/5] Loading Model...")

    tokenizer = None
    try:
        print(f"   Attempting to load tokenizer from: {actual_weights_path}")
        tokenizer = AutoTokenizer.from_pretrained(actual_weights_path, local_files_only=True, use_fast=False)
        print(" Loaded tokenizer from local weights (Slow mode).")
    except Exception as e:
        # Deliberate best-effort: any local failure falls back to the base tokenizer.
        print(f"Local tokenizer failed: {e}")
        print("Downloading base tokenizer from HuggingFace...")
        tokenizer = AutoTokenizer.from_pretrained(Config.base_model_name, use_fast=False)

    # Config may not define `top_k`; fall back to 2, the default of Subtask2aModel.
    top_k = getattr(Config, "top_k", 2)
    model = Subtask2aModel(Config.base_model_name, num_experts=Config.num_experts, top_k=top_k)

    w_path = os.path.join(actual_weights_path, w_files[0])
    print(f"Loading weights from: {w_files[0]}")

    state_dict = _load_state_dict(w_path)
    if state_dict is None:
        return

    # strict=False tolerates key mismatches, so surface them: parameters that
    # are missing from the checkpoint silently keep their initial values.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys:
        print(f"WARNING: {len(load_result.missing_keys)} model parameter(s) not found in checkpoint, "
              f"e.g. {load_result.missing_keys[:5]}")
    if load_result.unexpected_keys:
        print(f"WARNING: {len(load_result.unexpected_keys)} checkpoint key(s) not used by the model, "
              f"e.g. {load_result.unexpected_keys[:5]}")

    model.to(Config.device)
    model.eval()

    # --- [STEP 3] PREDICT ---
    print(">>> [5/5] Running Inference...")
    test_ds = InferenceDataset(test_proc, tokenizer)
    test_loader = DataLoader(test_ds, batch_size=Config.batch_size, shuffle=False)

    val_preds, aro_preds = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(Config.device)
            attention_mask = batch['attention_mask'].to(Config.device)
            numerical_features = batch['numerical_features'].to(Config.device)

            p_val, p_aro = model(input_ids, attention_mask, numerical_features)
            val_preds.extend(p_val.cpu().numpy().flatten())
            aro_preds.extend(p_aro.cpu().numpy().flatten())

    # --- [STEP 4] SAVE ---
    submission = pd.DataFrame({
        'user_id': test_proc['user_id'],
        'pred_state_change_valence': val_preds,
        'pred_state_change_arousal': aro_preds
    })

    submission.to_csv(Config.OUTPUT_FILE, index=False)
    print(f"DONE! Submission saved to: {Config.OUTPUT_FILE}")
    print(submission.head())

# Entry point: run the full inference pipeline when this code executes as the main module.
if __name__ == "__main__":
    predict()