In [2]:
# import necessary libraries
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from sentence_transformers import SentenceTransformer

# Training

In [3]:
# Separate the labelled data into train and validation sets randomly
# (80% train, 20% val) and persist both splits as pickle (read by later cells)
# and as CSV (input for the external feature-extraction scripts).

FILE_PATH = "../TRAIN_RELEASE_3SEP2025/train_subtask1.csv"
OUTPUT_DIR = "data"
SEED = 42
TRAIN_RATIO = 0.8

# read
df = pd.read_csv(FILE_PATH)

# clean + sort: drop rows missing a required field, then put the rows in a fixed
# order (presumably so the seeded split does not depend on the CSV's row order)
df = df.dropna(subset=["user_id", "text_id", "text", "valence", "arousal"]).copy()
df = df.sort_values(["user_id", "timestamp", "text_id"]).reset_index(drop=True)

# random split (80% train, 20% val)
# NOTE(review): this is a row-level split, so rows sharing a user_id can land in
# both splits -- confirm that is intended for validation.
train_df, val_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=SEED)

# train_test_split keeps the (now shuffled) index labels of `df`. The CSVs below
# are written without the index, so anything generated from them can only match
# the splits by row position. Use a fresh 0..n-1 index so position and label agree.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# verify split
assert len(train_df) + len(val_df) == len(df), "split lost or duplicated rows"
print(f"Rows: train={len(train_df)}, val={len(val_df)}")
print(f"Total rows: {len(df)}")

# save (create the output directory first so a fresh checkout does not fail)
os.makedirs(OUTPUT_DIR, exist_ok=True)
train_df.to_pickle(f"{OUTPUT_DIR}/train.pickle")
val_df.to_pickle(f"{OUTPUT_DIR}/val.pickle")


print("Saved")

# CSV copies are written straight from the in-memory frames; re-reading the
# pickles first would produce the same files.
train_df.to_csv(f"{OUTPUT_DIR}/train.csv", index=False)
val_df.to_csv(f"{OUTPUT_DIR}/val.csv", index=False)

# `train` / `val` are kept as names because the original cell defined them.
train = train_df.copy()
val = val_df.copy()

Rows: train=2211, val=553
Total rows: 2764
Saved


## Generate LIWC Features
Run the following commands in a terminal, or uncomment and run the code cell below:


In [None]:
# Generate LIWC Features for train and validation
# NOTE(review): `--data` (subTask1/data/...) and `--save` (../subTask1/data/...) are
# relative to different directories, and the rest of this notebook uses `data/...`.
# Confirm the working directory and adjust the paths before uncommenting.
# !python liwc_script.py --data subTask1/data/train.csv --column text --save ../subTask1/data/task1_LIWC_train
# !python liwc_script.py --data subTask1/data/val.csv --column text --save ../subTask1/data/task1_LIWC_val

## Generate Valence and Arousal Similarity Features
Run the following commands in a terminal, or uncomment and run the code cell below:


In [None]:
# Generate Valence Arousal similarity Features
# NOTE: in a notebook, `!cd <dir>` runs in a throw-away subshell and does not change
# the directory for the following `!` lines; use `%cd <dir>` if a directory change is needed.
# NOTE(review): `--custom-dataset` uses absolute, user-specific paths (/u50/zhanh279/...);
# replace them with paths valid on your machine before uncommenting.
# !cd extremism
# !python item-scoring/item_scoring.py --custom-dataset /u50/zhanh279/SemEval-2026/subTask1/data/train.csv --text-column text --scale Valence --device cuda --output ../subTask1/data/task1_ValenceSim_train.pickle
# !python item-scoring/item_scoring.py --custom-dataset /u50/zhanh279/SemEval-2026/subTask1/data/train.csv --text-column text --scale Arousal --device cuda --output ../subTask1/data/task1_ArousalSim_train.pickle
# !python item-scoring/item_scoring.py --custom-dataset /u50/zhanh279/SemEval-2026/subTask1/data/val.csv --text-column text --scale Valence --device cuda --output ../subTask1/data/task1_ValenceSim_val.pickle
# !python item-scoring/item_scoring.py --custom-dataset /u50/zhanh279/SemEval-2026/subTask1/data/val.csv --text-column text --scale Arousal --device cuda --output ../subTask1/data/task1_ArousalSim_val.pickle

In [10]:
# Attach the Arousal / Valence similarity features to the train and validation
# splits and save the combined frames.
train_A = pd.read_pickle('data/task1_ArousalSim_train.pickle')
train_V = pd.read_pickle('data/task1_ValenceSim_train.pickle')
val_A = pd.read_pickle('data/task1_ArousalSim_val.pickle')
val_V = pd.read_pickle('data/task1_ValenceSim_val.pickle')
train = pd.read_pickle('data/train.pickle')
val = pd.read_pickle('data/val.pickle')
print(len(train), len(train_A), len(train_V))

# The feature files must describe exactly the rows of the split they belong to.
assert len(train) == len(train_A) == len(train_V), "train feature files do not match the train split"
assert len(val) == len(val_A) == len(val_V), "val feature files do not match the val split"

# pd.concat(axis=1) aligns rows on index labels, which need not agree between the
# split pickles and the externally generated feature files (a mismatch silently
# yields extra rows filled with NaN). Align by row position instead.
train = pd.concat(
    [part.reset_index(drop=True) for part in (train, train_A, train_V)], axis=1
)
val = pd.concat(
    [part.reset_index(drop=True) for part in (val, val_A, val_V)], axis=1
)
train.to_pickle('data/train+sim.pickle')
val.to_pickle('data/val+sim.pickle')

2211 2211 2211


In [7]:
# Generate a sentence embedding for each text and save it with the original data
MODEL_NAME = "all-MiniLM-L6-v2"
BATCH_SIZE = 64

# Load data
train_df = pd.read_pickle("data/train.pickle")
val_df = pd.read_pickle("data/val.pickle")

# Setup device and model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

encoder = SentenceTransformer(MODEL_NAME, device=DEVICE)

# With convert_to_numpy=True, encode() returns a single 2-D NumPy array (one row
# per text) instead of a list of tensors, so no per-row `.cpu().numpy()` call is
# needed. `list(array)` then yields one 1-D vector per DataFrame row.

# Generate embeddings for training data
print("Generating embeddings for training data...")
train_embeddings = encoder.encode(
    train_df["text"].astype(str).tolist(),
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
)
train_df["embedding"] = list(train_embeddings)

# Generate embeddings for validation data
print("Generating embeddings for validation data...")
val_embeddings = encoder.encode(
    val_df["text"].astype(str).tolist(),
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
)
val_df["embedding"] = list(val_embeddings)

# Save to pickle
train_df.to_pickle("data/task1_embedding_train.pickle")
val_df.to_pickle("data/task1_embedding_val.pickle")

print(f"Train data shape: {train_df.shape}")
print(f"Val data shape: {val_df.shape}")
print(f"Embedding column shape: {train_df['embedding'].iloc[0].shape}")


Using device: cuda
Generating embeddings for training data...


Batches:   0%|          | 0/35 [00:00<?, ?it/s]

Generating embeddings for validation data...


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Train data shape: (2211, 9)
Val data shape: (553, 9)
Embedding column shape: (384,)


# Testing

In [7]:
# Load the released test set and mirror it into data/ as pickle (read by later
# cells) and CSV (input for the external feature-extraction scripts).
TEST_FILE_PATH = "../TEST_RELEASE_5JAN2026/test_subtask1.csv"

test_df = pd.read_csv(TEST_FILE_PATH)

# Create the output directory first so this section also runs on a fresh checkout.
os.makedirs("data", exist_ok=True)
test_df.to_pickle("data/test.pickle")
test_df.to_csv("data/test.csv", index=False)

## Generate LIWC Features for Test Data
Run the following command in a terminal, or uncomment and run the code cell below:


In [None]:
# Generate LIWC Features for test data
# NOTE(review): `--save task1_LIWC_test` has no directory component, unlike the
# train/val commands; confirm where the output should be written before uncommenting.
# !python ../liwc_script.py --data subTask1/data/test.csv --column text --save task1_LIWC_test


## Generate Valence and Arousal Similarity Features for Test Data
Run the following commands in a terminal, or uncomment and run the code cell below:


In [None]:
# Generate Valence Arousal similarity Features for test data
# NOTE: in a notebook, `!cd <dir>` runs in a throw-away subshell and does not change
# the directory for the following `!` lines; use `%cd <dir>` if a directory change is needed.
# NOTE(review): no `--output` is passed here, whereas the next cell reads
# data/task1_ValenceSim_test.pickle and data/task1_ArousalSim_test.pickle --
# confirm the script's default output location or add `--output` explicitly.
# NOTE(review): `--custom-dataset` uses absolute, user-specific paths (/u50/zhanh279/...).
# !cd ../extremism
# !python ../extremism/item-scoring/item_scoring.py --custom-dataset /u50/zhanh279/SemEval-2026/subTask1/data/test.csv --text-column text --scale Valence --device cuda
# !python ../extremism/item-scoring/item_scoring.py --custom-dataset /u50/zhanh279/SemEval-2026/subTask1/data/test.csv --text-column text --scale Arousal --device cuda


In [8]:
# Attach the Arousal / Valence similarity features to the test set and save the
# combined frame.
test_A = pd.read_pickle('data/task1_ArousalSim_test.pickle')
test_V = pd.read_pickle('data/task1_ValenceSim_test.pickle')
test = pd.read_pickle('data/test.pickle')
print(len(test), len(test_A), len(test_V))

# The feature files must describe exactly the rows of the test set.
assert len(test) == len(test_A) == len(test_V), "test feature files do not match the test set"

# pd.concat(axis=1) aligns rows on index labels; align by row position instead so
# a differing index in a feature file cannot silently introduce NaN rows.
test = pd.concat(
    [part.reset_index(drop=True) for part in (test, test_A, test_V)], axis=1
)
test.to_pickle('data/test+sim.pickle')

1737 1737 1737


In [9]:
# Generate a sentence embedding for each test text and save it with the original data
MODEL_NAME = "all-MiniLM-L6-v2"
BATCH_SIZE = 64

# Load data
test_df = pd.read_pickle("data/test.pickle")

# Setup device and model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

encoder = SentenceTransformer(MODEL_NAME, device=DEVICE)

# Generate embeddings for test data
# NOTE(review): the test set is not filtered for missing text, so a missing value
# would be encoded as the literal string "nan" by astype(str) -- confirm the
# test file has no empty texts.
print("Generating embeddings for test data...")
# With convert_to_numpy=True, encode() returns a single 2-D NumPy array (one row
# per text), so no per-row `.cpu().numpy()` conversion is needed.
test_embeddings = encoder.encode(
    test_df["text"].astype(str).tolist(),
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
)
# One 1-D embedding vector per DataFrame row.
test_df["embedding"] = list(test_embeddings)

# Save to pickle
test_df.to_pickle("data/task1_embedding_test.pickle")
print(f"Test data shape: {test_df.shape}")
print(f"Embedding column shape: {test_df['embedding'].iloc[0].shape}")


Using device: cuda
Generating embeddings for test data...


Batches:   0%|          | 0/28 [00:00<?, ?it/s]

Test data shape: (1737, 8)
Embedding column shape: (384,)
