In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

# === CONFIGURATION ===
# Root folder of the released training data.
# The default below is the path hard-coded in the original notebook; set the
# NLP_PROJECT_DIR environment variable to point the notebook at another
# location without editing this cell.
BASE_DIR = Path(os.environ.get(
    "NLP_PROJECT_DIR",
    r"G:\.shortcut-targets-by-id\1MUpwBt8F3Rg0Vkg71wKmZ-cZdV-9kRr7\NLP Project\TRAIN_RELEASE_3SEP2025",
))
# Raw CSV that create_splits() reads, and the folder the split CSVs are written to.
RAW_DATA_PATH = BASE_DIR / "train_subtask1.csv"
SPLIT_OUTPUT_DIR = BASE_DIR / "splits_subtask1"

# Create the output directory if it doesn't exist
SPLIT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"üìÇ Output directory set to: {SPLIT_OUTPUT_DIR}")

# === SPLITTING LOGIC ===
def create_splits(raw_data_path=None, output_dir=None, unseen_frac=0.2, seed=42):
    """
    Split the raw CSV into train / val / test_seen / test_unseen files.

    Users are shuffled with a fixed seed. The first ``unseen_frac`` of them are
    held out completely (test_unseen). Every remaining ("seen") user has their
    rows sorted by timestamp and cut chronologically into 80% train, 10% val
    and 10% test_seen.

    Args:
        raw_data_path: CSV with at least ``user_id`` and ``timestamp`` columns.
            Defaults to the module-level ``RAW_DATA_PATH``.
        output_dir: Folder that receives the four CSV files (created if
            missing). Defaults to the module-level ``SPLIT_OUTPUT_DIR``.
        unseen_frac: Fraction of users that are held out entirely.
        seed: Seed passed to ``np.random.seed`` before shuffling the users.

    Returns:
        None. The splits are written to ``output_dir`` and a summary is printed.
    """
    raw_data_path = RAW_DATA_PATH if raw_data_path is None else Path(raw_data_path)
    output_dir = SPLIT_OUTPUT_DIR if output_dir is None else Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("‚è≥ Loading raw data...")
    df = pd.read_csv(raw_data_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # 1. Identify Users
    user_ids = df['user_id'].unique()
    np.random.seed(seed)  # Fixed seed for reproducibility
    np.random.shuffle(user_ids)

    # 2. Split Users: `unseen_frac` Unseen (Test B), the rest Seen
    n_unseen = int(len(user_ids) * unseen_frac)
    unseen_users = user_ids[:n_unseen]
    seen_users = user_ids[n_unseen:]

    print(f"Total Users: {len(user_ids)}")
    print(f"   - Seen Users: {len(seen_users)} (Will have Train/Val/Test history)")
    print(f"   - Unseen Users: {len(unseen_users)} (Completely held out)")

    # 3. Create 'Unseen' Test Set
    df_test_unseen = df[df['user_id'].isin(unseen_users)].copy()

    # 4. Process 'Seen' Users (Train -> Val -> Test A)
    # Group once instead of re-scanning the whole frame for every user.
    rows_by_user = {uid: rows for uid, rows in df.groupby('user_id', sort=False)}
    train_list, val_list, test_seen_list = [], [], []

    # Iterate in shuffled-user order so the row order of the output files
    # (and therefore any index mapping derived from train.csv) is preserved.
    for uid in seen_users:
        rows = rows_by_user.get(uid)
        if rows is None:
            # e.g. a missing (NaN) user id never forms a group
            continue

        # Sort strictly by time; the stable sort keeps file order for rows
        # that share a timestamp, so the split is deterministic.
        user_df = rows.sort_values('timestamp', kind='stable')
        n = len(user_df)

        # 80% Train, 10% Val, 10% Test (Forecasting).
        # max(1, ...) only matters for a user with a single row: that row goes
        # to train so that a "seen" user is never absent from the train split.
        idx_train = max(1, int(n * 0.8))
        idx_val = max(idx_train, int(n * 0.9))

        train_list.append(user_df.iloc[:idx_train])
        val_list.append(user_df.iloc[idx_train:idx_val])
        test_seen_list.append(user_df.iloc[idx_val:])

    # pd.concat raises on an empty list, so fall back to an empty frame
    # that still carries the original columns.
    empty = df.iloc[0:0]
    df_train = pd.concat(train_list) if train_list else empty
    df_val = pd.concat(val_list) if val_list else empty
    df_test_seen = pd.concat(test_seen_list) if test_seen_list else empty

    # === SAVE TO NEW FOLDER ===
    print("üíæ Saving splits...")
    df_train.to_csv(output_dir / "train.csv", index=False)
    df_val.to_csv(output_dir / "val.csv", index=False)
    df_test_seen.to_csv(output_dir / "test_seen.csv", index=False)
    df_test_unseen.to_csv(output_dir / "test_unseen.csv", index=False)

    print("‚úÖ Done! Files created:")
    print(f"   - {output_dir / 'train.csv'} ({len(df_train)} rows)")
    print(f"   - {output_dir / 'val.csv'} ({len(df_val)} rows)")
    print(f"   - {output_dir / 'test_seen.csv'} ({len(df_test_seen)} rows)")
    print(f"   - {output_dir / 'test_unseen.csv'} ({len(df_test_unseen)} rows)")

# Build the four split CSVs now; this reads RAW_DATA_PATH and writes into SPLIT_OUTPUT_DIR.
create_splits()

üìÇ Output directory set to: G:\.shortcut-targets-by-id\1MUpwBt8F3Rg0Vkg71wKmZ-cZdV-9kRr7\NLP Project\TRAIN_RELEASE_3SEP2025\splits_subtask1
‚è≥ Loading raw data...
Total Users: 137
   - Seen Users: 110 (Will have Train/Val/Test history)
   - Unseen Users: 27 (Completely held out)
üíæ Saving splits...
‚úÖ Done! Files created:
   - G:\.shortcut-targets-by-id\1MUpwBt8F3Rg0Vkg71wKmZ-cZdV-9kRr7\NLP Project\TRAIN_RELEASE_3SEP2025\splits_subtask1\train.csv (1859 rows)
   - G:\.shortcut-targets-by-id\1MUpwBt8F3Rg0Vkg71wKmZ-cZdV-9kRr7\NLP Project\TRAIN_RELEASE_3SEP2025\splits_subtask1\val.csv (235 rows)
   - G:\.shortcut-targets-by-id\1MUpwBt8F3Rg0Vkg71wKmZ-cZdV-9kRr7\NLP Project\TRAIN_RELEASE_3SEP2025\splits_subtask1\test_seen.csv (282 rows)
   - G:\.shortcut-targets-by-id\1MUpwBt8F3Rg0Vkg71wKmZ-cZdV-9kRr7\NLP Project\TRAIN_RELEASE_3SEP2025\splits_subtask1\test_unseen.csv (388 rows)


In [None]:
import torch

# 1. Global user lookup, derived from the TRAIN split only.
train_df = pd.read_csv(SPLIT_OUTPUT_DIR / "train.csv")
KNOWN_USER_IDS = pd.unique(train_df['user_id']).tolist()
# Real ids are numbered 1...N in order of first appearance in train.csv;
# index 0 stays free for "Unknown/Unseen" users.
USER_TO_IDX = dict(zip(KNOWN_USER_IDS, range(1, len(KNOWN_USER_IDS) + 1)))

def get_user_idx(real_user_id):
    """Look up a user's index in USER_TO_IDX; users not in the mapping get 0."""
    try:
        return USER_TO_IDX[real_user_id]
    except KeyError:
        return 0

In [None]:
from torch.utils.data import Dataset

class TestSlidingWindowDataset(Dataset):
    """
    Evaluation dataset that yields one sample per row of ``df``.

    Each sample contains the row's text preceded by up to ``seq_length - 1``
    earlier texts of the same user (ordered by ``timestamp``). Windows at the
    start of a user's history are left-padded with empty strings so every
    window has exactly ``seq_length`` entries. The targets are the valence and
    arousal of the last (current) text in the window.

    Args:
        df: DataFrame with ``user_id``, ``timestamp``, ``text_cleaned``,
            ``text_id``, ``valence`` and ``arousal`` columns.
        tokenizer: Callable invoked per text as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")``; its result must expose
            ``"input_ids"`` and ``"attention_mask"``.
        max_length: Token length passed to the tokenizer.
        seq_length: Number of texts per window (must be >= 1).

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, df, tokenizer, max_length=256, seq_length=5):
        if seq_length < 1:
            # A window must at least hold the current text; fail here instead
            # of later inside torch.stack on an empty list.
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.seq_length = seq_length

        # Only process users who actually have data in this split
        self.user_ids = df["user_id"].unique()
        self.df = df

        self.samples = []
        self._build_samples()

    def _build_samples(self):
        """Populate ``self.samples`` with one padded window per row, user by user."""
        # Group once (first-appearance order, same as `unique()`) instead of
        # filtering the full frame for every user.
        for user_id, user_rows in self.df.groupby("user_id", sort=False):
            # Sort by time strictly; stable so equal timestamps keep file order.
            user_df = user_rows.sort_values("timestamp", kind="stable")

            texts = user_df["text_cleaned"].fillna("").tolist()  # Handle NaNs if any
            text_ids = user_df["text_id"].tolist()
            valences = user_df["valence"].tolist()
            arousals = user_df["arousal"].tolist()

            # We must predict for EVERY text in this user's sequence
            for i in range(len(texts)):
                # Current text plus up to (seq_length - 1) previous texts
                start_idx = max(0, i - self.seq_length + 1)
                window_texts = texts[start_idx : i + 1]

                # Left-pad with empty strings while the history is still short
                pad_len = self.seq_length - len(window_texts)
                if pad_len > 0:
                    window_texts = [""] * pad_len + window_texts

                self.samples.append({
                    "texts": window_texts,
                    "target_valence": valences[i],
                    "target_arousal": arousals[i],
                    "text_id": text_ids[i],
                    "real_user_id": user_id
                })

    def __len__(self):
        """Number of windows, i.e. rows of ``df`` that belong to a user group."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Tokenize window ``idx`` and return it as a dict of tensors.

        Returns a dict with ``input_ids`` and ``attention_mask`` (one stacked
        row per text in the window), ``user_id`` (index from ``get_user_idx``,
        0 for users not in the mapping), scalar ``valence`` / ``arousal``
        targets and the untouched ``text_id``.
        """
        item = self.samples[idx]

        input_ids_list = []
        attention_masks_list = []

        for text in item["texts"]:
            enc = self.tokenizer(
                text,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            # Drop the leading batch dimension of size 1 added by return_tensors="pt"
            input_ids_list.append(enc["input_ids"].squeeze(0))
            attention_masks_list.append(enc["attention_mask"].squeeze(0))

        # HANDLE USER ID MAPPING HERE
        # If the user is unseen, this returns 0
        mapped_user_idx = get_user_idx(item["real_user_id"])

        return {
            "input_ids": torch.stack(input_ids_list),
            "attention_mask": torch.stack(attention_masks_list),
            "user_id": torch.tensor(mapped_user_idx, dtype=torch.long),
            "valence": torch.tensor(item["target_valence"], dtype=torch.float),
            "arousal": torch.tensor(item["target_arousal"], dtype=torch.float),
            "text_id": item["text_id"]
        }

In [None]:
# === UPDATED CONFIG FOR YOUR LAPTOP ===
# NOTE: this cell uses `EmotionPredictionModel`, `device` and `tokenizer`, which
# are only defined in later cells of this notebook. Run those cells first;
# on a fresh kernel this cell stops with a NameError.

# 1. Point to the new splits (same folder the split cell wrote to, so the
#    location is configured in exactly one place)
DATA_DIR = SPLIT_OUTPUT_DIR
TRAIN_PATH = DATA_DIR / "train.csv"
VAL_PATH = DATA_DIR / "val.csv"
TEST_SEEN_PATH = DATA_DIR / "test_seen.csv"
TEST_UNSEEN_PATH = DATA_DIR / "test_unseen.csv"

# 2. Adjust Model Initialization for Unknown Users
# +1 accounts for the 0-index we reserved for Unseen Users
NUM_TOTAL_USERS = len(KNOWN_USER_IDS) + 1

print(f"Model will handle {NUM_TOTAL_USERS} user embeddings (Index 0 = Unknown/Unseen)")

model = EmotionPredictionModel(num_users=NUM_TOTAL_USERS).to(device)

# 3. Important: When creating datasets, use the TestSlidingWindowDataset for Val and Test
#    (This ensures accurate evaluation)
# NOTE(review): these use the class default seq_length=5, while the evaluation
# cell further down uses SEQ_LENGTH = 10 — confirm which window size the model
# was trained with.
val_dataset = TestSlidingWindowDataset(pd.read_csv(VAL_PATH), tokenizer)
test_seen_dataset = TestSlidingWindowDataset(pd.read_csv(TEST_SEEN_PATH), tokenizer)
test_unseen_dataset = TestSlidingWindowDataset(pd.read_csv(TEST_UNSEEN_PATH), tokenizer)

Model will handle 111 user embeddings (Index 0 = Unknown/Unseen)


NameError: name 'EmotionPredictionModel' is not defined

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel

# === MODEL CONFIGURATION (Must match what we used before) ===
MODEL_NAME = "distilbert-base-uncased"
LSTM_HIDDEN = 128
USER_EMBED_DIM = 32

class EmotionPredictionModel(nn.Module):
    """
    BERT encoder + LSTM temporal modeling + user embeddings,
    with separate regression heads for valence & arousal.

    Args:
        num_users: Number of rows in the user-embedding table.
        bert_model: Name passed to ``AutoModel.from_pretrained``.
        lstm_hidden: Hidden size of each LSTM direction.
        user_embed_dim: Size of the per-user embedding vector.
    """
    def __init__(
        self,
        num_users: int,
        bert_model: str = MODEL_NAME,
        lstm_hidden: int = LSTM_HIDDEN,
        user_embed_dim: int = USER_EMBED_DIM,
    ):
        super().__init__()

        # 1. Pretrained BERT encoder
        self.bert = AutoModel.from_pretrained(bert_model)
        bert_dim = self.bert.config.hidden_size

        # 2. User embeddings (The "ID Card" for every user)
        self.user_embedding = nn.Embedding(num_users, user_embed_dim)

        # 3. LSTM over sequences (The "Time Traveler")
        # Input = BERT vector (bert_dim) concatenated with the user vector (user_embed_dim)
        self.lstm = nn.LSTM(
            input_size=bert_dim + user_embed_dim,
            hidden_size=lstm_hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
            # nn.LSTM applies dropout only BETWEEN stacked layers. With a single
            # layer the previous dropout=0.1 had no effect and only triggered a
            # UserWarning, so it is set to 0 here; parameters are unchanged.
            dropout=0.0,
        )

        # 4. Regression heads (The "Decision Makers")
        # Input is lstm_hidden * 2 because it is bidirectional
        self.valence_head = nn.Sequential(
            nn.Linear(lstm_hidden * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
        )

        self.arousal_head = nn.Sequential(
            nn.Linear(lstm_hidden * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
        )

    def forward(self, input_ids, attention_mask, user_id):
        """
        Predict valence and arousal for every position of every window.

        Args:
            input_ids: [batch, seq_len, max_len] token ids.
            attention_mask: [batch, seq_len, max_len] mask matching ``input_ids``.
            user_id: [batch] indices into the user-embedding table.

        Returns:
            Tuple ``(valence, arousal)``, each of shape [batch, seq_len].
        """
        batch_size, seq_len, max_len = input_ids.shape

        # A. Encode text with BERT
        # Flatten to [batch*seq_len, max_len] because BERT takes 2D inputs.
        # reshape (not view) so non-contiguous inputs are accepted as well.
        input_ids_flat = input_ids.reshape(-1, max_len)
        attn_mask_flat = attention_mask.reshape(-1, max_len)

        bert_outputs = self.bert(input_ids_flat, attention_mask=attn_mask_flat)
        cls_embeds = bert_outputs.last_hidden_state[:, 0, :]          # [batch*seq_len, hidden]
        text_embeds = cls_embeds.reshape(batch_size, seq_len, -1)     # [batch, seq_len, hidden]

        # B. Add User Embeddings
        # [batch, user_embed_dim] -> [batch, 1, user_embed_dim] -> [batch, seq_len, user_embed_dim]
        user_embeds = self.user_embedding(user_id).unsqueeze(1).expand(-1, seq_len, -1)

        # Combine Text + User
        combined = torch.cat([text_embeds, user_embeds], dim=-1)      # [batch, seq_len, hidden + user_embed_dim]

        # C. Run LSTM
        lstm_out, _ = self.lstm(combined)                             # [batch, seq_len, 2 * lstm_hidden]

        # D. Predict
        valence = self.valence_head(lstm_out).squeeze(-1)            # [batch, seq_len]
        arousal = self.arousal_head(lstm_out).squeeze(-1)            # [batch, seq_len]

        return valence, arousal

ModuleNotFoundError: No module named 'transformers'

In [None]:
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import logging
from pathlib import Path
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

# === 1. CONFIGURATION ===
BATCH_SIZE = 8           # Safe for Inference on 8GB VRAM
SEQ_LENGTH = 10          # Texts per sliding window handed to EmotionDataset
MAX_SEQ_LEN = 256        # Token length used when tokenizing each text
# Root folder of the released data. The default is the path hard-coded in the
# original notebook; set the NLP_PROJECT_DIR environment variable to run on
# another machine without editing this cell.
BASE_DIR = Path(os.environ.get(
    "NLP_PROJECT_DIR",
    r"G:\.shortcut-targets-by-id\1MUpwBt8F3Rg0Vkg71wKmZ-cZdV-9kRr7\NLP Project\TRAIN_RELEASE_3SEP2025",
))
SPLIT_OUTPUT_DIR = BASE_DIR / "splits_subtask1"
BEST_MODEL_PATH = "best_model.pt"  # relative to the notebook's working directory

# Device Setup: use the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# === 2. RE-DEFINE CLASSES (Must match training exactly) ===
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Reconstruct User Mapping: the train split on disk defines which users are
# "known"; without it there is nothing to map, so stop right away.
if not (SPLIT_OUTPUT_DIR / "train.csv").exists():
    raise FileNotFoundError("Splits not found!")

train_df_temp = pd.read_csv(SPLIT_OUTPUT_DIR / "train.csv")
KNOWN_USER_IDS = pd.unique(train_df_temp['user_id']).tolist()
# Known users are numbered from 1 in order of first appearance; 0 is left for unknown users.
USER_TO_IDX = dict(zip(KNOWN_USER_IDS, range(1, len(KNOWN_USER_IDS) + 1)))
logger.info(f"‚úÖ Loaded {len(KNOWN_USER_IDS)} users from disk.")

def get_user_idx(real_id):
    """Return the index stored in USER_TO_IDX for `real_id`, or 0 when it is not in the mapping."""
    if real_id in USER_TO_IDX:
        return USER_TO_IDX[real_id]
    return 0

class EmotionDataset(Dataset):
    """
    Per-user sequence dataset with a training mode and an evaluation mode.

    * ``is_test=False``: one sample per full window of ``seq_length``
      consecutive texts of a user; ``v`` / ``a`` hold the targets of every
      text in the window. Users with fewer than ``seq_length`` rows yield
      no samples.
    * ``is_test=True``: one sample per row. The window ends at that row and
      is left-padded with empty strings when fewer than ``seq_length`` texts
      are available; ``v`` / ``a`` hold the targets of that last row only.

    Args:
        df: DataFrame with ``user_id``, ``timestamp``, ``text_cleaned``,
            ``valence`` and ``arousal`` columns.
        tokenizer: Callable invoked per text as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")``.
        seq_length: Number of texts per window (must be >= 1).
        is_test: Selects evaluation mode (see above).
        max_length: Token length passed to the tokenizer. ``None`` (default)
            uses the module-level ``MAX_SEQ_LEN``.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, df, tokenizer, seq_length=10, is_test=False, max_length=None):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")

        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.max_length = max_length
        self.samples = []

        # Group once (first-appearance order, same as `unique()`) instead of
        # re-filtering the whole frame for each user.
        for uid, user_rows in df.groupby('user_id', sort=False):
            # Stable sort: rows sharing a timestamp keep their file order.
            u_df = user_rows.sort_values('timestamp', kind='stable')
            texts = u_df["text_cleaned"].fillna("").tolist()
            valences = u_df["valence"].tolist()
            arousals = u_df["arousal"].tolist()

            if is_test:
                # For Test/Eval: Predict for EVERY text using padding
                for i in range(len(texts)):
                    window_texts = texts[max(0, i - seq_length + 1) : i + 1]
                    left_pad = [""] * (seq_length - len(window_texts))
                    self.samples.append({
                        "texts": left_pad + window_texts,
                        "v": valences[i], "a": arousals[i],
                        "uid": uid
                    })
            else:
                # For Train: Only full sequences (the range is empty when the
                # user has fewer than seq_length texts)
                for i in range(len(texts) - seq_length + 1):
                    stop = i + seq_length
                    self.samples.append({
                        "texts": texts[i:stop],
                        "v": valences[i:stop], "a": arousals[i:stop],
                        "uid": uid
                    })

    def __len__(self):
        """Number of windows built in ``__init__``."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Tokenize window ``idx``.

        Returns a dict with stacked ``input_ids`` / ``attention_mask`` (one row
        per text), the ``user_id`` index from ``get_user_idx`` and float
        ``valence`` / ``arousal`` targets (scalars in evaluation mode, one
        value per text in training mode).
        """
        item = self.samples[idx]
        max_length = MAX_SEQ_LEN if self.max_length is None else self.max_length

        input_ids, masks = [], []
        for text in item["texts"]:
            enc = self.tokenizer(text, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
            # Drop the leading batch dimension of size 1 added by return_tensors="pt"
            input_ids.append(enc["input_ids"].squeeze(0))
            masks.append(enc["attention_mask"].squeeze(0))

        return {
            "input_ids": torch.stack(input_ids),
            "attention_mask": torch.stack(masks),
            "user_id": torch.tensor(get_user_idx(item["uid"]), dtype=torch.long),
            "valence": torch.tensor(item["v"], dtype=torch.float),
            "arousal": torch.tensor(item["a"], dtype=torch.float)
        }

class EmotionModel(nn.Module):
    """
    DistilBERT encoder + user embedding + bidirectional LSTM with two
    regression heads (valence and arousal).

    Attribute names and layer sizes are kept exactly as before so the
    state dict of an existing checkpoint still matches.

    Args:
        num_users: Number of rows in the user-embedding table.
    """

    def __init__(self, num_users):
        super().__init__()
        self.bert = AutoModel.from_pretrained("distilbert-base-uncased")
        self.user_emb = nn.Embedding(num_users, 32)
        # Single-layer LSTM: nn.LSTM applies `dropout` only between stacked
        # layers, so the former dropout=0.1 was a no-op that merely raised a
        # UserWarning. It is omitted here; the parameters are identical.
        self.lstm = nn.LSTM(768+32, 128, batch_first=True, bidirectional=True)
        # 256 = 2 directions * 128 hidden units
        self.head_v = nn.Sequential(nn.Linear(256, 64), nn.ReLU(), nn.Linear(64, 1))
        self.head_a = nn.Sequential(nn.Linear(256, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, input_ids, mask, uid):
        """
        Args:
            input_ids: [batch, seq_len, max_len] token ids.
            mask: attention mask with the same shape as ``input_ids``.
            uid: [batch] indices into the user-embedding table.

        Returns:
            Tuple ``(valence, arousal)``, each of shape [batch, seq_len].
        """
        b, s, m = input_ids.shape
        # Flatten windows to [batch*seq_len, max_len] for the encoder; reshape
        # (not view) so non-contiguous inputs work too.
        encoded = self.bert(input_ids.reshape(-1, m), attention_mask=mask.reshape(-1, m))
        # Embedding at the first token position of every text
        bert_out = encoded.last_hidden_state[:, 0, :]
        text_emb = bert_out.reshape(b, s, -1)
        # Repeat the user vector for every position of the window
        user_emb = self.user_emb(uid).unsqueeze(1).expand(-1, s, -1)
        lstm_out, _ = self.lstm(torch.cat([text_emb, user_emb], dim=-1))
        return self.head_v(lstm_out).squeeze(-1), self.head_a(lstm_out).squeeze(-1)

# === 3. LOAD THE TRAINED WEIGHTS ===
logger.info("‚è≥ Loading Best Model...")
# +1 because embedding index 0 is reserved for users that are not in the mapping
best_model = EmotionModel(len(KNOWN_USER_IDS) + 1).to(device)

if not os.path.exists(BEST_MODEL_PATH):
    logger.error("‚ùå best_model.pt not found! Did you delete it?")
    # Continuing would evaluate a model whose LSTM/heads are randomly
    # initialised, so stop here instead of reporting meaningless metrics.
    raise FileNotFoundError(f"Checkpoint not found: {BEST_MODEL_PATH}")

# map_location lets a checkpoint saved on a GPU load on a CPU-only machine;
# weights_only=True restricts unpickling to tensors/state dicts.
state_dict = torch.load(BEST_MODEL_PATH, map_location=device, weights_only=True)
best_model.load_state_dict(state_dict)
logger.info("‚úÖ Model Weights Loaded Successfully!")

INFO:__main__:‚úÖ Loaded 110 users from disk.
INFO:__main__:‚è≥ Loading Best Model...
  best_model.load_state_dict(torch.load(BEST_MODEL_PATH))
INFO:__main__:‚úÖ Model Weights Loaded Successfully!


In [None]:
from scipy.stats import pearsonr

# Folder (relative to the working directory) that receives the prediction CSVs;
# created on first use, left alone if it is already there.
OUTPUT_DIR = Path("predictions")
os.makedirs(OUTPUT_DIR, exist_ok=True)

def run_evaluation(model, dataset_path, split_name):
    """
    Evaluate `model` on one split and save its per-row predictions.

    The CSV at `dataset_path` is wrapped in an ``EmotionDataset`` in
    evaluation mode (one window per row). For every window only the
    prediction at the LAST position is kept and compared with that row's
    valence / arousal via Pearson correlation. Predictions and targets are
    written to ``OUTPUT_DIR / f"pred_{split_name}.csv"``.

    Args:
        model: Module called as ``model(input_ids, mask, uid)`` returning
            ``(valence, arousal)`` with one value per window position.
        dataset_path: Path of the split CSV to evaluate.
        split_name: Label used in log messages and in the output file name.

    Returns:
        dict with ``"valence_corr"``, ``"arousal_corr"`` and ``"n_samples"``.
        A correlation is ``nan`` when fewer than two samples are available.
    """
    def _stack(chunks):
        # np.concatenate rejects an empty list (empty split)
        return np.concatenate(chunks) if chunks else np.empty(0, dtype=np.float32)

    def _pearson_or_nan(y_true, y_pred, label):
        # pearsonr raises ValueError when given fewer than two points
        if len(y_true) < 2:
            logger.warning(f"{split_name}: not enough samples to compute {label} correlation")
            return float("nan")
        corr, _ = pearsonr(y_true, y_pred)
        return float(corr)

    logger.info(f"üîé Evaluating on {split_name}...")

    # Load Data with is_test=True (Ensures we predict for EVERY text)
    df = pd.read_csv(dataset_path)
    ds = EmotionDataset(df, tokenizer, seq_length=SEQ_LENGTH, is_test=True)
    # shuffle=False keeps the output rows in dataset order
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False)

    model.eval()
    pred_v_chunks, pred_a_chunks = [], []
    true_v_chunks, true_a_chunks = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc=f"Predicting {split_name}"):
            input_ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            uid = batch["user_id"].to(device)

            # Forward Pass
            out_v, out_a = model(input_ids, mask, uid)

            # We only care about the LAST item in the sequence for inference
            pred_v_chunks.append(out_v[:, -1].cpu().numpy())
            pred_a_chunks.append(out_a[:, -1].cpu().numpy())
            true_v_chunks.append(batch["valence"].numpy())
            true_a_chunks.append(batch["arousal"].numpy())

    all_preds_v, all_preds_a = _stack(pred_v_chunks), _stack(pred_a_chunks)
    all_true_v, all_true_a = _stack(true_v_chunks), _stack(true_a_chunks)

    # Metrics
    corr_v = _pearson_or_nan(all_true_v, all_preds_v, "valence")
    corr_a = _pearson_or_nan(all_true_a, all_preds_a, "arousal")

    logger.info(f"üìä {split_name} RESULTS:")
    logger.info(f"   Valence Correlation: {corr_v:.4f}")
    logger.info(f"   Arousal Correlation: {corr_a:.4f}")

    # Save to CSV
    result_df = pd.DataFrame({
        "true_valence": all_true_v, "pred_valence": all_preds_v,
        "true_arousal": all_true_a, "pred_arousal": all_preds_a
    })
    result_df.to_csv(OUTPUT_DIR / f"pred_{split_name}.csv", index=False)
    logger.info(f"üíæ Saved to {OUTPUT_DIR / f'pred_{split_name}.csv'}\n")

    return {"valence_corr": corr_v, "arousal_corr": corr_a, "n_samples": int(len(all_true_v))}

# Run Evaluation: same model on both held-out splits, seen users first
for split_file, split_label in (("test_seen.csv", "TEST_SEEN"), ("test_unseen.csv", "TEST_UNSEEN")):
    run_evaluation(best_model, SPLIT_OUTPUT_DIR / split_file, split_label)

logger.info("‚úÖ All done!")

INFO:__main__:üîé Evaluating on TEST_SEEN...
Predicting TEST_SEEN: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [00:10<00:00,  3.24it/s]
INFO:__main__:üìä TEST_SEEN RESULTS:
INFO:__main__:   Valence Correlation: 0.7026
INFO:__main__:   Arousal Correlation: 0.5186
INFO:__main__:üíæ Saved to predictions\pred_TEST_SEEN.csv

INFO:__main__:üîé Evaluating on TEST_UNSEEN...
Predicting TEST_UNSEEN: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 60/60 [00:18<00:00,  3.24it/s]
INFO:__main__:üìä TEST_UNSEEN RESULTS:
INFO:__main__:   Valence Correlation: 0.6386
INFO:__main__:   Arousal Correlation: 0.4241
INFO:__main__:üíæ Saved to predictions\pred_TEST_UNSEEN.csv

INFO:__main__:‚úÖ All done!
