In [2]:
# import necessary libraries
import os
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit


In [6]:
# Separate the data into train and validation sets based on user ID, so that
# every user's rows land entirely in one set, then persist both sets as pickles.

FILE_PATH = "../TRAIN_RELEASE_3SEP2025/train_subtask1.csv"
OUTPUT_DIR = "data"
SEED = 42
TRAIN_RATIO = 0.8  # passed as train_size to GroupShuffleSplit (proportion of groups, i.e. users)
REQUIRED_COLUMNS = ["user_id", "text_id", "text", "valence", "arousal"]

# read (kept under its own name so the untouched input stays inspectable)
raw_df = pd.read_csv(FILE_PATH)

# clean + sort: drop rows missing any required field, then order each user's
# rows by timestamp (text_id breaks ties) and renumber the index from 0.
df = (
    raw_df
    .dropna(subset=REQUIRED_COLUMNS)
    .sort_values(["user_id", "timestamp", "text_id"])
    .reset_index(drop=True)
)

# group split by user_id: the splitter returns positional indices, hence .iloc
gss = GroupShuffleSplit(n_splits=1, train_size=TRAIN_RATIO, random_state=SEED)
train_idx, val_idx = next(gss.split(df, groups=df["user_id"]))
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df   = df.iloc[val_idx].reset_index(drop=True)

# verify no user leakage and that the split neither dropped nor duplicated rows
train_users = set(train_df["user_id"].unique())
val_users   = set(val_df["user_id"].unique())
overlap = train_users & val_users
print(f"Rows: train={len(train_df)}, val={len(val_df)}")
print(f"Users: train={len(train_users)}, val={len(val_users)}, overlap={len(overlap)}")
assert len(overlap) == 0, "User leakage! train/val share the same user_id."
assert len(train_df) + len(val_df) == len(df), "Split lost or duplicated rows."

# save -- to_pickle does not create missing parent directories, so make sure
# the output directory exists first (a no-op when it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)
train_df.to_pickle(os.path.join(OUTPUT_DIR, "train.pickle"))
val_df.to_pickle(os.path.join(OUTPUT_DIR, "val.pickle"))


print("Saved")

Rows: train=2165, val=599
Users: train=109, val=28, overlap=0
Saved
