In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModel
from sklearn.preprocessing import StandardScaler
import os
import re
import sys
from tqdm.autonotebook import tqdm

# Handle safetensors import.
# `safetensors` is optional: without it only `.bin` checkpoints can be loaded.
try:
    from safetensors.torch import load_file as safe_load_file
except ImportError:
    print("Warning: 'safetensors' library not found. Will attempt to use torch.load if .bin file exists.")

    def safe_load_file(*args, **kwargs):
        """Placeholder bound when `safetensors` is not installed.

        Keeps the name `safe_load_file` defined so that a later attempt to read a
        `.safetensors` checkpoint fails with an actionable ImportError instead of
        an unrelated NameError.

        Raises:
            ImportError: always.
        """
        raise ImportError(
            "Cannot load a .safetensors checkpoint: the 'safetensors' package is not installed. "
            "Install it (pip install safetensors) or provide a .bin checkpoint instead."
        )

In [7]:
# ====================================================
# 1. SMART CONFIGURATION
# ====================================================
class Config:
    """Static settings for Subtask 2b inference: filesystem layout and model parameters.

    The class body is executed once, at definition time. It resolves ``BASE_DIR``
    and prints it as a side effect; every other path is derived from ``BASE_DIR``.
    """

    # Locate the project root. When `__file__` is not defined, fall back to the
    # current working directory.
    try:
        BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        BASE_DIR = os.getcwd()
        # If the working directory is not already inside "Subtask2b" but contains
        # such an entry, use that entry as the root.
        if "Subtask2b" not in BASE_DIR and os.path.exists(os.path.join(BASE_DIR, "Subtask2b")):
            BASE_DIR = os.path.join(BASE_DIR, "Subtask2b")

    print(f"Working Directory: {BASE_DIR}")

    # --- FILESYSTEM LAYOUT (relative to BASE_DIR) ---
    WEIGHTS_DIR = os.path.join(BASE_DIR, "weights")        # checkpoint folder
    DATA_DIR = os.path.join(BASE_DIR, "data")              # train / test CSV folder
    OUTPUT_FILE = os.path.join(BASE_DIR, "submission.csv")  # prediction file written by predict()

    # --- MODEL PARAMETERS ---
    base_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    max_seq_length = 512
    batch_size = 16

    # Use the GPU when one is visible to torch, otherwise the CPU.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # Mixture-of-experts head: number of experts and how many are combined per sample.
    num_experts = 4
    top_k = 2

Working Directory: /Users/theson/Documents/SEMEVAL2026FT/Subtask2b


In [8]:
# ====================================================
# 2. HELPER FUNCTIONS
# ====================================================
def clean_text(texts):
    """Normalise whitespace in every item of ``texts``.

    Each item is converted with ``str()``, every run of whitespace is replaced by
    a single space, and leading/trailing whitespace is stripped.

    Args:
        texts: iterable of values to clean.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    normalised = []
    for raw in texts:
        collapsed = re.sub(r"\s+", " ", str(raw))
        normalised.append(collapsed.strip())
    return normalised

def calculate_user_features(df):
    """Compute each user's mean valence and arousal over their group-1 rows.

    Args:
        df: DataFrame with 'user_id', 'valence' and 'arousal' columns and,
            optionally, a 'group' column.

    Returns:
        DataFrame indexed by user id (in order of first appearance in ``df``) with
        columns 'mean_valence_half1' and 'mean_arousal_half1'. Users that have no
        contributing rows get 0.0 for both features.
    """
    if 'group' in df.columns:
        # Only the first half (group 1) of each user's history contributes.
        source_rows = df[df['group'] == 1]
    else:
        # Fallback for files without a 'group' column (rare): use all rows.
        print("   ⚠️ Warning: 'group' column missing. Using mean of all user rows.")
        source_rows = df

    per_user_mean = source_rows.groupby('user_id')[['valence', 'arousal']].mean()
    per_user_mean.columns = ['mean_valence_half1', 'mean_arousal_half1']

    # Make sure every user in the input has a row, zero-filled when absent.
    every_user = df['user_id'].unique()
    return per_user_mean.reindex(every_user).fillna(0.0)

# ====================================================
# 3. MODEL ARCHITECTURE (MEAN POOLING - DIFFERS FROM 2A)
# ====================================================
class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, ignoring masked-out positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        Args:
            last_hidden_state: tensor whose dim 1 is the axis being averaged.
            attention_mask: tensor with one entry per position of the first two
                dims; a trailing axis is added and it is broadcast to the shape
                of ``last_hidden_state``.

        Returns:
            Tensor with dim 1 reduced away. Rows whose mask is all zero yield
            zeros, because the divisor is clamped to at least 1e-9.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_total = (last_hidden_state * weights).sum(1)
        # Clamp so that a fully masked row does not divide by zero.
        weight_total = weights.sum(1).clamp(min=1e-9)
        return weighted_total / weight_total

class Expert(nn.Module):
    """Two-layer feed-forward network: Linear -> LayerNorm -> GELU -> Dropout(0.3) -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Build the network.

        Args:
            input_dim: size of the input feature vector.
            hidden_dim: width of the hidden layer.
            output_dim: size of the output vector.
        """
        super().__init__()
        # The position of each layer inside the Sequential determines its
        # state-dict key ("net.0", "net.1", "net.4"), so the order must not change.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to ``x`` and return the result."""
        return self.net(x)

class SparseMoELayer(nn.Module):
    """Mixture-of-experts head that combines the ``top_k`` highest-gated experts per sample.

    A linear gate scores every expert; for each sample the ``top_k`` best scores
    are renormalised to sum to 1 and used to weight the outputs of the
    corresponding experts. Every expert is still evaluated on every sample; only
    the combination step is sparse.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_experts=4, top_k=2):
        """Create the gate and the experts.

        Args:
            input_dim: size of the input feature vector.
            hidden_dim: hidden width of each expert.
            output_dim: output size of each expert (and of this layer).
            num_experts: number of experts.
            top_k: number of experts combined per sample.
        """
        super().__init__()
        self.num_experts, self.top_k = num_experts, top_k
        self.gate = nn.Linear(input_dim, num_experts)
        self.experts = nn.ModuleList([Expert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)])

    def forward(self, x):
        """Return the gate-weighted combination of the selected experts.

        Args:
            x: tensor of shape (batch, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) with the same dtype and device as
            the expert outputs.
        """
        gate_probs = F.softmax(self.gate(x), dim=-1)
        topk_weights, topk_indices = torch.topk(gate_probs, self.top_k, dim=-1)
        # Renormalise so that the selected weights sum to 1 for every sample.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

        # (batch, num_experts, output_dim)
        all_expert_outputs = torch.stack([expert(x) for expert in self.experts], dim=1)
        output_dim = all_expert_outputs.size(-1)

        # Select the top-k expert outputs per sample in a single gather:
        # index tensor is (batch, top_k, output_dim).
        gather_index = topk_indices.unsqueeze(-1).expand(-1, -1, output_dim)
        selected_outputs = all_expert_outputs.gather(1, gather_index)

        # The weighted sum inherits dtype/device from the expert outputs, so a
        # half- or double-precision model is not silently cast to float32.
        return (topk_weights.unsqueeze(-1) * selected_outputs).sum(dim=1)

class Subtask2bModel(nn.Module):
    """Transformer backbone with mean pooling and two mixture-of-experts regression heads.

    The pooled text embedding is concatenated with two numerical features and fed
    to separate heads for valence and arousal.
    """

    def __init__(self, model_name, num_experts=4, top_k=2):
        """Load the backbone and build the heads.

        Args:
            model_name: identifier or path passed to ``AutoConfig`` / ``AutoModel``.
            num_experts: number of experts in each head.
            top_k: number of experts combined per sample in each head.
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.backbone = AutoModel.from_pretrained(model_name)
        self.pooler = MeanPooling()

        # Each head sees the text embedding plus the two numerical features.
        head_input_dim = self.config.hidden_size + 2
        self.valence_head = SparseMoELayer(head_input_dim, 256, 1, num_experts, top_k)
        self.arousal_head = SparseMoELayer(head_input_dim, 256, 1, num_experts, top_k)

    def forward(self, input_ids, attention_mask, numerical_features):
        """Predict valence and arousal for a batch.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: mask passed to the backbone and to the pooler.
            numerical_features: per-sample features concatenated to the pooled
                text embedding along dim 1.

        Returns:
            Tensor with the valence head output followed by the arousal head
            output, concatenated along dim 1.
        """
        hidden_states = self.backbone(input_ids, attention_mask).last_hidden_state
        pooled_text = self.pooler(hidden_states, attention_mask)
        fused = torch.cat((pooled_text, numerical_features), dim=1)

        valence = self.valence_head(fused)
        arousal = self.arousal_head(fused)
        return torch.cat((valence, arousal), dim=1)

# ====================================================
# 4. INFERENCE DATASET
# ====================================================
class InferenceDataset(Dataset):
    """One inference sample per user: two text windows plus the user's numerical features.

    For every user the last five texts of group 1 and of group 2 (ordered by
    'timestamp') are joined into two strings. ``__getitem__`` tokenizes both and
    packs them as ``<cls> g1 <sep> <sep> g2 <sep>``, padded to a fixed length.
    ``user_ids`` lists the users in the same order as the samples.
    """

    def __init__(self, df, tokenizer, user_feature_map, max_seq_length=None):
        """Build the per-user samples.

        Args:
            df: DataFrame with 'user_id', 'timestamp' and a text column ('text',
                or one of the alternative names handled below); 'group' is optional.
            tokenizer: object providing ``encode`` and the ``cls_token_id``,
                ``sep_token_id`` and ``pad_token_id`` attributes.
            user_feature_map: mapping from user id to that user's numerical
                features; users missing from it get ``[0.0, 0.0]``.
            max_seq_length: fixed length of every encoded sample. Defaults to
                ``Config.max_seq_length``.
        """
        self.tokenizer = tokenizer
        self.data = []
        self.user_ids = []
        self.max_seq_length = Config.max_seq_length if max_seq_length is None else max_seq_length

        # Automatically normalise the name of the text column.
        possible_names = ['tweet', 'content', 'post', 'sentence', 'message']
        for col in possible_names:
            if col in df.columns and 'text' not in df.columns:
                print(f"   ⚠️ Renaming column '{col}' -> 'text'")
                df = df.rename(columns={col: 'text'})

        for uid, user_df in df.groupby('user_id'):
            self.user_ids.append(uid)
            if 'group' in user_df.columns:
                g1 = user_df[user_df['group'] == 1].sort_values('timestamp')
                g2 = user_df[user_df['group'] == 2].sort_values('timestamp')
            else:
                # Fallback when no group split is given: first half / second half.
                user_df = user_df.sort_values('timestamp')
                mid = len(user_df) // 2
                g1, g2 = user_df.iloc[:mid], user_df.iloc[mid:]

            # Keep only the five most recent texts of each group.
            text_g1 = " ".join(clean_text(g1['text'].tail(5).tolist()))
            text_g2 = " ".join(clean_text(g2['text'].tail(5).tolist()))
            num_feats = user_feature_map.get(uid, [0.0, 0.0])

            self.data.append({'text_g1': text_g1, 'text_g2': text_g2, 'num_feats': num_feats})

    def __len__(self):
        """Return the number of users (samples)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sample ``idx``.

        Returns:
            Dict with 'input_ids' and 'attention_mask' (long tensors of length
            ``max_seq_length``) and 'numerical_features' (float tensor).
        """
        item = self.data[idx]
        tokens_g1 = self.tokenizer.encode(item['text_g1'], add_special_tokens=False)
        tokens_g2 = self.tokenizer.encode(item['text_g2'], add_special_tokens=False)

        # Four special tokens are always emitted (<cls>, <sep>, <sep>, <sep>), so
        # each text gets half of what remains. Reserving only three would overflow
        # by one token whenever max_seq_length is odd.
        half_len = max((self.max_seq_length - 4) // 2, 0)
        # Keep the most recent tokens. `tokens[-0:]` would keep everything, hence
        # the explicit empty list when there is no budget at all.
        tokens_g1 = tokens_g1[-half_len:] if half_len > 0 else []
        tokens_g2 = tokens_g2[-half_len:] if half_len > 0 else []

        input_ids = ([self.tokenizer.cls_token_id] + tokens_g1 +
                     [self.tokenizer.sep_token_id, self.tokenizer.sep_token_id] +
                     tokens_g2 + [self.tokenizer.sep_token_id])

        attention_mask = [1] * len(input_ids)
        padding_len = self.max_seq_length - len(input_ids)
        if padding_len > 0:
            input_ids += [self.tokenizer.pad_token_id] * padding_len
            attention_mask += [0] * padding_len

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'numerical_features': torch.tensor(item['num_feats'], dtype=torch.float)
        }

In [None]:
# ====================================================
# 5. MAIN FUNCTION
# ====================================================
def _find_weight_file(folder):
    """Return the path of the model checkpoint inside ``folder``, or ``None``.

    Only ``.safetensors`` / ``.bin`` files are considered. The conventional names
    ``model.safetensors`` and ``pytorch_model.bin`` win over anything else so that
    auxiliary files sharing the extension (e.g. ``training_args.bin``) are not
    mistaken for the checkpoint; otherwise the alphabetically first match is
    used, which makes the choice independent of directory listing order.
    """
    if not os.path.isdir(folder):
        return None
    candidates = sorted(f for f in os.listdir(folder) if f.endswith((".safetensors", ".bin")))
    if not candidates:
        return None
    for preferred in ("model.safetensors", "pytorch_model.bin"):
        if preferred in candidates:
            return os.path.join(folder, preferred)
    return os.path.join(folder, candidates[0])


def _resolve_weights_dir(root):
    """Return ``root`` or its first sub-folder (alphabetically) that holds a checkpoint, else ``None``."""
    if _find_weight_file(root) is not None:
        return root
    if not os.path.isdir(root):
        return None
    for name in sorted(os.listdir(root)):
        candidate = os.path.join(root, name)
        if os.path.isdir(candidate) and _find_weight_file(candidate) is not None:
            print(f"Detected nested folder. Going into: {name}")
            return candidate
    return None


def _read_state_dict(w_path):
    """Read a checkpoint file into a state dict on the CPU.

    Raises:
        ImportError: if ``w_path`` is a ``.safetensors`` file and the
            ``safetensors`` loader is not available.
    """
    if w_path.endswith(".safetensors"):
        loader = globals().get("safe_load_file")
        if loader is None:
            raise ImportError(
                "Cannot load a .safetensors checkpoint: the 'safetensors' package is not installed."
            )
        return loader(w_path)
    # NOTE(security): torch.load unpickles the file, which can execute arbitrary
    # code. Only load checkpoints from a trusted source.
    return torch.load(w_path, map_location="cpu")


def predict():
    """Run Subtask 2b inference end to end and write the submission CSV.

    Steps: locate the checkpoint under ``Config.WEIGHTS_DIR``; locate the train
    and test files under ``Config.DATA_DIR``; fit a ``StandardScaler`` on the
    train-set user features; scale the test-set user features; load tokenizer and
    model; predict for every user in the test file; keep the forecasting users
    and save the result to ``Config.OUTPUT_FILE``.

    Progress and errors are reported with ``print``. When a required input is
    missing the function prints an error and returns early without writing
    anything.

    Returns:
        None.
    """
    print("="*60)
    print("STARTING INFERENCE SUBTASK 2B")
    print(f"Working Directory: {Config.BASE_DIR}")
    print("="*60)

    # --- [STEP 0] AUTO-DETECT WEIGHTS ---
    # Fail early, before any data or model is loaded, if there is no checkpoint.
    actual_weights_path = _resolve_weights_dir(Config.WEIGHTS_DIR)
    if actual_weights_path is None:
        print(f"ERROR: No .safetensors/.bin weights found under: {Config.WEIGHTS_DIR}")
        return
    w_path = _find_weight_file(actual_weights_path)
    print(f"Target Weights Path: {actual_weights_path}")

    # --- [STEP 1] LOAD DATA ---
    print(">>> [1/5] Looking for Data Files...")
    if not os.path.exists(Config.DATA_DIR):
        print("ERROR: No data folder!")
        return

    # Sorted so that the chosen file does not depend on directory listing order.
    all_files = sorted(os.listdir(Config.DATA_DIR))
    train_files = [f for f in all_files if "train" in f.lower()]
    test_files = [f for f in all_files if ("forecasting" in f.lower() or "test" in f.lower()) and f not in train_files]

    if not train_files:
        print(" ERROR: Missing train file (for Scaler)!")
        return
    if not test_files:
        print(" ERROR: Missing test file!")
        return

    print(f"   Train: {train_files[0]}")
    print(f"   Test:  {test_files[0]}")

    # --- [STEP 2] FIT SCALER ---
    # The scaler is fitted on the train-set user features and then applied to
    # the test-set features below.
    print(">>> [2/5] Fitting Scaler...")
    scaler = StandardScaler()
    df_train = pd.read_csv(os.path.join(Config.DATA_DIR, train_files[0]))
    train_feats = calculate_user_features(df_train)
    scaler.fit(train_feats.fillna(0.0))
    print(" Scaler fitted!")

    # --- [STEP 3] PROCESS TEST DATA ---
    print(">>> [3/5] Processing Test Data...")
    df_test = pd.read_csv(os.path.join(Config.DATA_DIR, test_files[0]))

    # Collect the users for whom a forecast must be submitted.
    if 'is_forecasting_user' in df_test.columns:
        forecasting_users = df_test[df_test['is_forecasting_user'] == True]['user_id'].unique()
        print(f" Found {len(forecasting_users)} forecasting users.")
    else:
        forecasting_users = df_test['user_id'].unique()

    test_feats = calculate_user_features(df_test)
    scaled_feats = scaler.transform(test_feats)
    user_feat_map = {uid: scaled_feats[i] for i, uid in enumerate(test_feats.index)}

    # --- [STEP 4] LOAD MODEL ---
    print(">>> [4/5] Loading Model...")
    tokenizer = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(actual_weights_path, local_files_only=True, use_fast=False)
        print(" Loaded tokenizer (Local/Slow).")
    except Exception as e:
        print(f" Local tokenizer failed: {e}. Downloading base...")
        tokenizer = AutoTokenizer.from_pretrained(Config.base_model_name, use_fast=False)

    model = Subtask2bModel(Config.base_model_name, num_experts=Config.num_experts, top_k=Config.top_k)

    print(f" Loading weights from: {w_path}")
    state_dict = _read_state_dict(w_path)
    # strict=False tolerates key mismatches; report them so that a checkpoint
    # that does not match the architecture cannot go unnoticed.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys:
        print(f" WARNING: {len(load_result.missing_keys)} model keys missing from checkpoint "
              f"(e.g. {load_result.missing_keys[:5]}).")
    if load_result.unexpected_keys:
        print(f" WARNING: {len(load_result.unexpected_keys)} unexpected keys in checkpoint "
              f"(e.g. {load_result.unexpected_keys[:5]}).")

    model.to(Config.device).eval()

    # --- [STEP 5] PREDICT & SAVE ---
    print(">>> [5/5] Running Inference...")
    test_ds = InferenceDataset(df_test, tokenizer, user_feat_map)
    if len(test_ds) == 0:
        print(" ERROR: Test file contains no users!")
        return
    # shuffle=False keeps predictions aligned with test_ds.user_ids.
    test_loader = DataLoader(test_ds, batch_size=Config.batch_size, shuffle=False)

    all_preds = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(Config.device)
            attention_mask = batch['attention_mask'].to(Config.device)
            num_feats = batch['numerical_features'].to(Config.device)

            outputs = model(input_ids, attention_mask, num_feats)
            all_preds.append(outputs.cpu().numpy())

    all_preds = np.concatenate(all_preds, axis=0)

    # Filter & Save: column 0 is the valence head, column 1 the arousal head.
    full_submission = pd.DataFrame({
        'user_id': test_ds.user_ids,
        'pred_dispo_change_valence': all_preds[:, 0],
        'pred_dispo_change_arousal': all_preds[:, 1]
    })

    final_submission = full_submission[full_submission['user_id'].isin(forecasting_users)]
    final_submission.to_csv(Config.OUTPUT_FILE, index=False)

    print(f"DONE! Saved to: {Config.OUTPUT_FILE}")
    print(final_submission.head())

# Run the inference pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    predict()