In [None]:
import csv
import json
import os

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
# Create the output folders up front.
# `output_file` is only assigned in a later cell, so referencing it here
# raises NameError on a fresh kernel; its parent folder ("Results") is
# therefore named directly.
for _out_dir in ("Features", "Results"):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# pull all json files and combine into a single csv file
json_folder = "final-eriskt2-dataset-with-ground-truth/all_combined"
label_file = "final-eriskt2-dataset-with-ground-truth/shuffled_ground_truth_labels.txt"
output_file = "Results/combined_dataset.csv"


# user_id -> binary label (1 when the label token is exactly "1", otherwise 0)
labels = {}
with open(label_file, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        parts = line.split()
        if not parts:
            # Blank (e.g. trailing) lines used to break the tuple unpacking.
            continue
        if len(parts) != 2:
            raise ValueError(
                f"{label_file}:{line_no}: expected '<user_id> <label>', got {line!r}"
            )
        user_id, label = parts
        labels[user_id] = 1 if label == "1" else 0


# Column order of the combined CSV.
header = [
    "user_id", "target", "type", "title", "body",
    "created_utc", "submission_id", "parent_id",
    "comment_id", "Depression"
]

# One dict per submission/comment, filled by the JSON loop below.
rows = []

def safe_text(s):
    """Clean one text field before it is written to the CSV.

    Non-string values are returned unchanged. Strings are stripped; an empty
    result becomes None. Otherwise the stripped text is re-interpreted as
    UTF-8 bytes that were wrongly decoded as latin-1; if that round trip
    fails, the stripped text is returned as is.
    """
    if not isinstance(s, str):
        return s

    stripped = s.strip()
    if stripped == "":
        return None

    try:
        return stripped.encode('latin1').decode('utf-8')
    except Exception:
        # Not latin-1 encodable, or the bytes are not valid UTF-8: keep the text.
        return stripped

# run all json files
# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of the combined CSV stable between runs (later cells compute per-user
# cumulative features in row order).
for filename in sorted(os.listdir(json_folder)):
    if not filename.endswith(".json"):
        continue

    filepath = os.path.join(json_folder, filename)
    try:
        with open(filepath, "r", encoding="utf-8", errors="replace") as f:
            data = json.load(f)
    except Exception as e:
        # Still best-effort (the file is skipped), but no longer silent.
        print(f"Skipping {filepath}: {e!r}")
        continue

    for thread in data:
        # ---------- submission ----------
        # `or {}` also covers an explicit null value, not just a missing key.
        sub = thread.get("submission") or {}
        sub_user = sub.get("user_id")
        sub_target = sub.get("target", False)
        # Only target users carry a label; everyone else stays unlabeled.
        depression_label = labels.get(sub_user, None) if sub_target else None

        rows.append({
            "user_id": sub_user,
            "target": sub_target,
            "type": "submission",
            "title": safe_text(sub.get("title")),
            "body": safe_text(sub.get("body")),
            "created_utc": sub.get("created_utc"),
            "submission_id": sub.get("submission_id"),
            "parent_id": sub.get("parent_id"),
            "comment_id": None,
            "Depression": depression_label
        })

        # ---------- comments ----------
        for c in thread.get("comments") or []:
            c_user = c.get("user_id")
            c_target = c.get("target", False)
            depression_label = labels.get(c_user, None) if c_target else None

            rows.append({
                "user_id": c_user,
                "target": c_target,
                "type": "comment",
                "title": None,
                "body": safe_text(c.get("body")),
                "created_utc": c.get("created_utc"),
                "submission_id": c.get("submission_id"),
                "parent_id": c.get("parent_id"),
                "comment_id": c.get("comment_id"),
                "Depression": depression_label
            })


# Make sure the target folder exists before opening the file for writing.
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
with open(output_file, "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=header)
    writer.writeheader()
    writer.writerows(rows)


In [None]:
# add parent text column and filter to target=True

df = pd.read_csv("Results/combined_dataset.csv")

# id -> body lookups, one for submissions and one for comments
_is_submission = df['type'] == 'submission'
_is_comment = df['type'] == 'comment'
sub_dict = df.loc[_is_submission].set_index('submission_id')['body'].to_dict()
com_dict = df.loc[_is_comment].set_index('comment_id')['body'].to_dict()

def find_parent(row):
    """Return the body text of the post a comment replies to.

    Gives None for non-comment rows, for a missing `parent_id`, and when the
    id is found in neither lookup. Submissions (`sub_dict`) are checked
    before comments (`com_dict`).
    """
    if row['type'] != 'comment':
        return None

    pid = row['parent_id']
    if pd.isna(pid):
        return None

    for lookup in (sub_dict, com_dict):
        if pid in lookup:
            return lookup[pid]
    return None

# Look up the parent text row by row (None where nothing is found).
df['parent'] = df.apply(find_parent, axis=1)

# Keep only the rows flagged as target.
_target_mask = df["target"] == True
df = df.loc[_target_mask].copy()

df.to_csv("Results/combined_dataset_with_parent.csv", index=False)

print("\nNew file saved as: Results/combined_dataset_with_parent.csv")
print("Rows:", len(df))
print(df.head())


In [None]:
# count the number of samples for each class

df = pd.read_csv("Results/combined_dataset_with_parent.csv")

# Row-level counts per label value; unlabeled rows match neither.
count0 = df["Depression"].eq(0).sum()
count1 = df["Depression"].eq(1).sum()

print("Depression = 0  →", count0)
print("Depression = 1  →", count1)
print("\nTotal labeled targets →", count0 + count1)


In [None]:
# user-level counts

df = pd.read_csv("Results/combined_dataset_with_parent.csv")

# Rows that carry a label.
df_labeled = df.dropna(subset=["Depression"]).copy()

# Distinct users per label value, and overall.
depressed_users = df_labeled.loc[df_labeled["Depression"] == 1, "user_id"].nunique()
control_users   = df_labeled.loc[df_labeled["Depression"] == 0, "user_id"].nunique()
total_users     = df_labeled["user_id"].nunique()

print(" User-level counts:")
print(f"Depression = 1 users : {depressed_users}")
print(f"Depression = 0 users : {control_users}")
print(f"Total labeled users  : {total_users}")


In [None]:
# train/test split at user level

SPLIT_SEED = 42                # seeds both the user split and the down-sampling
N_TRAIN_DEP0_TO_DROP = 200000  # Depression=0 rows removed from the train split
N_TEST_DEP0_TO_DROP = 40000    # Depression=0 rows removed from the test split

df = pd.read_csv("Results/combined_dataset_with_parent.csv")


def _drop_random_rows(frame, candidate_idx, n_drop, rng):
    """Return `frame` without `n_drop` randomly chosen rows of `candidate_idx`.

    `n_drop` is capped at the number of candidates, so a smaller dataset no
    longer raises inside the sampler. The result has a fresh 0..n-1 index.
    """
    if n_drop > len(candidate_idx):
        print(f"Only {len(candidate_idx)} candidate rows; dropping all of them "
              f"instead of {n_drop}")
        n_drop = len(candidate_idx)
    if n_drop == 0:
        return frame.reset_index(drop=True)
    remove_idx = rng.choice(candidate_idx.to_numpy(), size=n_drop, replace=False)
    return frame.drop(remove_idx).reset_index(drop=True)


# all depression target users
users_pos = df[df["Depression"] == 1]["user_id"].unique()
# all control target users
users_neg = df[df["Depression"] == 0]["user_id"].unique()

# assign train/test split per user, separately per class
train_pos, test_pos = train_test_split(users_pos, test_size=0.2, random_state=SPLIT_SEED)
train_neg, test_neg = train_test_split(users_neg, test_size=0.2, random_state=SPLIT_SEED)

# put together
train_users = set(train_pos) | set(train_neg)
test_users  = set(test_pos)  | set(test_neg)

# filter posts by user_id
train_df = df[df["user_id"].isin(train_users)].copy()
test_df  = df[df["user_id"].isin(test_users)].copy()

# delete some Depression=0 items to reduce size and balance the dataset.
# A seeded generator makes the removed rows identical on every run
# (the previous global np.random.choice call was unseeded).
rng = np.random.default_rng(SPLIT_SEED)

# depression=0 index
train_zero_idx = train_df.index[train_df["Depression"] == 0]
print("Train: num of Dep=0", len(train_zero_idx))

test_zero_idx = test_df.index[test_df["Depression"] == 0]
print("Test: num of Dep=0", len(test_zero_idx))

train_df_filtered = _drop_random_rows(train_df, train_zero_idx, N_TRAIN_DEP0_TO_DROP, rng)
print("Train before:", train_df.shape)
print("Train after:", train_df_filtered.shape)
train_df = train_df_filtered

test_df_filtered = _drop_random_rows(test_df, test_zero_idx, N_TEST_DEP0_TO_DROP, rng)
print("Test before:", test_df.shape)
print("Test after:", test_df_filtered.shape)
test_df = test_df_filtered

train_df.to_csv("Results/train_target_with_parent.csv", index=False)
test_df.to_csv("Results/test_target_with_parent.csv", index=False)

print(" Train - Test Split")
print(f"Train users: {train_df['user_id'].nunique()} | Posts: {len(train_df)}")
print(f"Test  users: {test_df['user_id'].nunique()} | Posts: {len(test_df)}")
print(f"Dep1-train={sum(train_df.Depression==1)}, Dep1-test={sum(test_df.Depression==1)}")
print(f"Dep0-train={sum(train_df.Depression==0)}, Dep0-test={sum(test_df.Depression==0)}")


In [None]:
# generate LIWC features
!export NLTK_DATA="/nfs/u50/zhanh279/4Z03/jupyter/nltk_data"
!python liwc_script.py  --data Results/train_target_with_parent.csv --column body --save train_LIWC_target
!python liwc_script.py  --data Results/train_target_with_parent.csv --column parent --save train_LIWC_parent
!python liwc_script.py  --data Results/test_target_with_parent.csv  --column body --save test_LIWC_target
!python liwc_script.py  --data Results/test_target_with_parent.csv  --column parent --save test_LIWC_parent

In [None]:
# generate LSM features based on LIWC features for train and test
#
# The train and test halves of this cell used to be two copies of the same
# code; they now share one helper and one set of column definitions.

# Input pickles (paths kept exactly as before; note they are absolute and
# user-specific, while the outputs below are relative).
TRAIN_BODY_PICKLE = "/u50/zhanh279/4Z03/jupyter/Results/train_LIWC_target.fullframe.pickle"
TRAIN_PARENT_PICKLE = "/u50/zhanh279/4Z03/jupyter/Results/train_LIWC_parent.fullframe.pickle"
TEST_BODY_PICKLE = "/u50/zhanh279/4Z03/jupyter/Results/test_LIWC_target.fullframe.pickle"
TEST_PARENT_PICKLE = "/u50/zhanh279/4Z03/jupyter/Results/test_LIWC_parent.fullframe.pickle"

# define LIWC columns
liwc_cols = ["ARTICLE", "AUXVERB", "CONJ", "ADVERB", "PPRON", "IPRON", "PREP", "NEGATE", "QUANT", "VERB"]
social_cols = ["SOCIAL","FRIEND","FAMILY","WE","YOU","THEY","AFFILIATION","DRIVES","POWER","RELIG","HEALTH","WORK","MONEY"]
positive_emo_cols = ["POSEMO","REWARD","ACHIEV","JOY","AFFILIATION","CERTAIN","POWER"]
negative_emo_cols = ["NEGEMO","SAD","ANX","ANGER","RISK","DEATH","FEAR","DISCREP","TENTAT"]
first_person_singular_cols = ["I"]
second_person_cols = ["YOU"]
third_person_singular_cols = ["SHEHE"]
third_person_plural_cols = ["THEY"]
cognitive_process_cols = ["CAUSE","DISCREP","INSIGHT","CERTAIN","COGPROC"]
perceptual_process_cols = ["SEE","HEAR","FEEL","PERCEPT"]

# Output column -> LIWC categories averaged into it.
# Dict order is the column order of the saved frames.
LSM_GROUPS = {
    "LSM_mean": liwc_cols,
    "social_mean": social_cols,
    "positive_mean": positive_emo_cols,
    "negative_mean": negative_emo_cols,
    "first_person_singular": first_person_singular_cols,
    "second_person": second_person_cols,
    "third_person_singular": third_person_singular_cols,
    "third_person_plural": third_person_plural_cols,
    "cognitive_process": cognitive_process_cols,
    "perceptual_process": perceptual_process_cols,
}

# all needed LIWC numerical columns (deduplicated, sorted)
all_needed_cols = sorted({c for cols in LSM_GROUPS.values() for c in cols})

# columns kept from each LIWC frame
needed_cols = ["comment_id"] + all_needed_cols + ["user_id","body","type","Depression","created_utc","submission_id"]

# identifying columns copied into the output
META_COLS = ["user_id","body","type","Depression","created_utc","comment_id","submission_id"]

# keeps the denominator non-zero when both scores are 0
EPS = 1e-6


def build_lsm_features(body_pickle, parent_pickle):
    """Build the per-comment LSM feature frame for one split.

    Parameters
    ----------
    body_pickle, parent_pickle : str
        Paths of the LIWC frames computed on the `body` and `parent` columns.

    Returns
    -------
    pandas.DataFrame
        META_COLS, a `parent_body` column and one column per LSM_GROUPS key.
        For every LIWC category the score is 1 - |b - p| / (b + p + EPS),
        and each output column is the row-wise mean over its categories.
    """
    body_df = pd.read_pickle(body_pickle)
    parent_df = pd.read_pickle(parent_pickle)

    # Keep comment rows only. Resetting the index on both frames makes the
    # arithmetic and the concat below pair rows by position. The test half
    # previously skipped this reset, so its saved index is now 0..n-1 as well.
    body_df = body_df[body_df['type'] == 'comment'].reset_index(drop=True)[needed_cols]
    parent_df = parent_df[parent_df['type'] == 'comment'].reset_index(drop=True)[needed_cols]

    # per-category LSM score
    LSM_df = pd.DataFrame({
        f"LSM_{c}": 1 - (body_df[c] - parent_df[c]).abs() / (body_df[c] + parent_df[c] + EPS)
        for c in all_needed_cols
    })

    # group means
    for out_col, categories in LSM_GROUPS.items():
        LSM_df[out_col] = LSM_df[[f"LSM_{c}" for c in categories]].mean(axis=1)

    # NOTE(review): `parent_body` is taken from the `body` column of the
    # parent-LIWC frame, as before. If that frame carries the same `body`
    # column as the target frame, this is the comment's own text rather than
    # the parent's — confirm against liwc_script.py's output.
    return pd.concat([
        body_df[META_COLS],
        parent_df["body"].rename("parent_body"),
        LSM_df[list(LSM_GROUPS)],
    ], axis=1)


############################## Training set ##############################
result_df = build_lsm_features(TRAIN_BODY_PICKLE, TRAIN_PARENT_PICKLE)
count_submission = (result_df["type"] == "submission").sum()
print("Number of submissions in training LSM features:", count_submission)
result_df.to_pickle("Results/train_LSM_features.pickle")
print("Train Saved:", len(result_df))


############################## Testing set ##############################
result_df = build_lsm_features(TEST_BODY_PICKLE, TEST_PARENT_PICKLE)
result_df.to_pickle("Results/test_LSM_features.pickle")
print("Test Saved:", len(result_df))

In [None]:
# Adding features for LSM

lsm_train = pd.read_pickle("Results/train_LSM_features.pickle")
lsm_test = pd.read_pickle("Results/test_LSM_features.pickle")

# LSM columns that receive per-user time-series features
lsm_features = [
    'LSM_mean',
    'social_mean',
    'positive_mean',
    'negative_mean',
    'first_person_singular',
    'second_person',
    'third_person_singular',
    'third_person_plural',
    'cognitive_process',
    'perceptual_process',
]
def add_timeseries_features(df, feature_cols, window=5):
    """Add per-user running statistics for every column in `feature_cols`.

    For each column `col` these columns are written into `df` (in place):
    `{col}_max_so_far`, `{col}_min_so_far`, `{col}_max_gap_so_far`
    (running max minus running min), `{col}_delta` (difference to the user's
    previous row, 0 for the first) and `{col}_rolling_std` (std over the last
    `window` rows of the user, 0 where it is undefined).

    "So far" and "previous" follow the row order of `df` within each
    `user_id`. Returns the same `df` object.
    """
    for col in feature_cols:
        per_user = df.groupby("user_id")[col]

        running_max = per_user.cummax()
        running_min = per_user.cummin()
        df[f"{col}_max_so_far"] = running_max
        df[f"{col}_min_so_far"] = running_min
        df[f"{col}_max_gap_so_far"] = running_max - running_min

        # change relative to the user's previous row
        df[f"{col}_delta"] = per_user.diff().fillna(0)

        # The rolling result is indexed by (user_id, original index); dropping
        # the user level lets the assignment align with df's own index.
        windowed_std = per_user.rolling(window=window, min_periods=1).std()
        df[f"{col}_rolling_std"] = windowed_std.reset_index(level=0, drop=True).fillna(0)

    return df


# Add the per-user time-series columns to both splits, then save them.
lsm_train = add_timeseries_features(df=lsm_train, feature_cols=lsm_features, window=5)
lsm_test = add_timeseries_features(df=lsm_test, feature_cols=lsm_features, window=5)

lsm_train.to_pickle("Features/LSM_features_train.pickle")
lsm_test.to_pickle("Features/LSM_features_test.pickle")

In [None]:
# generate relative entropy features

# model definition and finetuning code
MODEL_NAME = "distilgpt2"  # base checkpoint that gets fine-tuned

# Use the GPU when torch reports one, otherwise fall back to the CPU.
_cuda_available = torch.cuda.is_available()
DEVICE = "cuda" if _cuda_available else "cpu"

class TextDataset(Dataset):
    """Tokenised texts for causal-LM fine-tuning.

    Every item is a dict of 1-D tensors as produced by the tokenizer
    (padded/truncated to `max_length`) plus a `labels` tensor.

    Parameters
    ----------
    texts : sequence of str
        Indexed with integer positions.
    tokenizer : callable
        Called with `truncation`, `padding`, `max_length` and
        `return_tensors="pt"`; must return a mapping of tensors with a
        leading batch dimension of 1.
    max_length : int
        Fixed sequence length of every item.
    """

    def __init__(self, texts, tokenizer, max_length=256):
        self.texts = texts
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tok(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        # drop the batch dimension added by return_tensors="pt"
        enc = {k: v.squeeze(0) for k, v in enc.items()}

        labels = enc["input_ids"].clone()
        # Padding positions must not contribute to the loss. Without this,
        # short texts are dominated by "predict the pad token" targets.
        # -100 is the index the loss ignores. The attention mask is used
        # rather than the pad id because the pad token is the EOS token here.
        attention_mask = enc.get("attention_mask")
        if attention_mask is not None:
            labels[attention_mask == 0] = -100
        enc["labels"] = labels
        return enc

def finetune_lm(texts, save_path, epochs=3, batch_size=8, lr=5e-5):
    """Fine-tune MODEL_NAME as a causal LM on `texts` and save it.

    Parameters
    ----------
    texts : list of str
        Training texts, wrapped in a TextDataset.
    save_path : str
        Folder that receives both the model and the tokenizer.
    epochs, batch_size, lr
        Number of passes, DataLoader batch size and AdamW learning rate.

    Prints the mean batch loss after every epoch. Returns nothing.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # the EOS token doubles as the padding token
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
    model.resize_token_embeddings(len(tokenizer))

    dl = DataLoader(TextDataset(texts, tokenizer), batch_size=batch_size, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=lr)

    model.train()
    for ep in range(epochs):
        total_loss = 0
        for cpu_batch in dl:
            batch = {name: tensor.to(DEVICE) for name, tensor in cpu_batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {ep+1} loss = {total_loss/len(dl):.4f}")

    for artefact in (model, tokenizer):
        artefact.save_pretrained(save_path)
    
#get RE
def load_lm(path):
    """Load a saved tokenizer and causal LM from `path`.

    The model is moved to DEVICE and switched to eval mode.
    Returns `(tokenizer, model)`.
    """
    tokenizer = AutoTokenizer.from_pretrained(path)
    lm = AutoModelForCausalLM.from_pretrained(path)
    lm = lm.to(DEVICE)
    lm.eval()
    return tokenizer, lm

def calc_loss(model, tok, text):
    """Return the mean per-token language-model loss of `text`.

    Parameters
    ----------
    model : callable
        Called as `model(**enc, labels=input_ids)`; the result must expose
        a scalar tensor `.loss`.
    tok : callable
        Tokenizer; called with `return_tensors="pt"`, `truncation=True`
        and `max_length=256`.
    text : str
        Text to score.

    Returns
    -------
    float
        `out.loss` as a Python float, or NaN when the text has fewer than
        two tokens (there is no next-token target to score).
    """
    enc = tok(text, return_tensors="pt", truncation=True, max_length=256)
    enc = {k: v.to(DEVICE) for k, v in enc.items()}

    if enc["input_ids"].size(1) < 2:
        return float("nan")

    with torch.no_grad():
        out = model(**enc, labels=enc["input_ids"])

    # The causal-LM loss is already averaged over the predicted tokens.
    # Dividing it by the sequence length again (as before) normalised twice
    # and made the score shrink with text length.
    return out.loss.item()


# Finetune language models on training data
def _load_target_posts(csv_path):
    """Read a split CSV and keep rows with a body, as an independent frame.

    `.copy()` makes the later column assignments write to this frame itself
    rather than to a possible view of the frame that was read.
    """
    posts = pd.read_csv(csv_path).dropna(subset=['body']).copy()
    posts["body"] = posts["body"].astype(str)
    return posts


def add_relative_entropy(df):
    """Add `loss_dep0`, `loss_dep1` and `re` (= loss_dep1 - loss_dep0) to `df`.

    Each body is scored by the control LM (lm0/tok0) and the depression LM
    (lm1/tok1). Modifies `df` in place and returns it.
    """
    loss_dep0 = []
    loss_dep1 = []
    for text in df["body"]:
        loss_dep0.append(calc_loss(lm0, tok0, text))
        loss_dep1.append(calc_loss(lm1, tok1, text))
    df["loss_dep0"] = loss_dep0
    df["loss_dep1"] = loss_dep1
    df["re"] = df["loss_dep1"] - df["loss_dep0"]
    return df


train_all_target = _load_target_posts("Results/train_target_with_parent.csv")
train_dep0 = train_all_target[train_all_target["Depression"] == 0]
train_dep1 = train_all_target[train_all_target["Depression"] == 1]

train_dep0_texts = train_dep0["body"].astype(str).tolist()
train_dep1_texts = train_dep1["body"].astype(str).tolist()

# one language model per class
finetune_lm(train_dep0_texts, "models/train_Dep0_LM", epochs=3)
finetune_lm(train_dep1_texts, "models/train_Dep1_LM", epochs=3)

tok0, lm0 = load_lm("models/train_Dep0_LM")
tok1, lm1 = load_lm("models/train_Dep1_LM")

# calculate RE features for train sets
# NOTE(review): the train rows are scored by models fine-tuned on those same
# rows, so train-side `re` values may look more separable than test-side ones.
train_all_target = add_relative_entropy(train_all_target)

print(train_all_target.columns)

# calculate RE features for test sets
test_all_target = _load_target_posts("Results/test_target_with_parent.csv")
test_all_target = add_relative_entropy(test_all_target)

print(test_all_target.columns)

In [None]:

# adding features to RE
def add_re_timeseries_features(df):
    """Add per-user running statistics of the `re` column to `df`.

    New columns: `max_re_so_far`, `min_re_so_far`, `max_gap_re_so_far`
    (running max minus running min), `delta_re` (difference to the user's
    previous row, 0 for the first) and `rolling_std_re` (std over the user's
    last 5 rows, 0 where undefined). "So far" follows the row order of `df`.
    Modifies `df` in place and returns it.
    """
    re_by_user = df.groupby("user_id")["re"]

    # maximum / minimum value so far
    df["max_re_so_far"] = re_by_user.cummax()
    df["min_re_so_far"] = re_by_user.cummin()

    # maximum gap so far
    df["max_gap_re_so_far"] = df["max_re_so_far"] - df["min_re_so_far"]

    # current - last
    df["delta_re"] = re_by_user.diff().fillna(0)

    # rolling std; dropping the user level restores df's own index
    df["rolling_std_re"] = (
        re_by_user
            .rolling(window=5, min_periods=1)
            .std()
            .reset_index(level=0, drop=True)
            .fillna(0)
    )
    return df


# Same transformation for both splits (previously two copies of the code).
train_all_target = add_re_timeseries_features(train_all_target)
test_all_target = add_re_timeseries_features(test_all_target)


print(train_all_target.columns)
train_all_target.to_pickle("Features/RE_features_train.pickle")
test_all_target.to_pickle("Features/RE_features_test.pickle")

print(len(train_all_target))
print(len(test_all_target))

In [None]:
# generate Cosine features

# Load data and keep comment rows only (fresh 0..n-1 index)
cos_train = pd.read_csv("Results/train_target_with_parent.csv")
cos_train = cos_train[cos_train['type']=='comment'].reset_index(drop=True)
cos_test = pd.read_csv("Results/test_target_with_parent.csv")
cos_test = cos_test[cos_test['type']=='comment'].reset_index(drop=True)

# clean NaN: missing texts become empty strings
for _frame in (cos_train, cos_test):
    for _text_col in ("body", "parent"):
        _frame[_text_col] = _frame[_text_col].fillna("").astype(str)

print(cos_train.columns)


# Load SentenceTransformer
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)
# Encode & Compute Cosine Similarity
def compute_cos_sim_batch(df, batch_size=256):
    """Cosine similarity between each row's `body` and `parent` text.

    Rows are processed in chunks of `batch_size`; both columns of a chunk are
    embedded with the module-level `encoder`, and the diagonal of the chunk's
    similarity matrix gives the row-wise values.

    Returns a 1-D numpy array with one value per row of `df`.
    """
    bodies = df["body"]
    parents = df["parent"]

    sims = []
    for start in range(0, len(df), batch_size):
        stop = start + batch_size
        emb_t = encoder.encode(bodies.iloc[start:stop].tolist(),
                               batch_size=32, convert_to_tensor=True)
        emb_p = encoder.encode(parents.iloc[start:stop].tolist(),
                               batch_size=32, convert_to_tensor=True)

        # the full matrix is computed, but only matching pairs are kept
        pairwise = util.cos_sim(emb_t, emb_p)
        sims.extend(pairwise.diagonal().cpu().numpy())

    return np.array(sims)

# Row-wise body/parent similarity for both splits.
print("Encoding train...")
_train_sims = compute_cos_sim_batch(cos_train)
cos_train["sim"] = _train_sims

print("Encoding test...")
_test_sims = compute_cos_sim_batch(cos_test)
cos_test["sim"] = _test_sims



In [None]:
# Adding features for Cosine Similarity
def add_sim_features(df):
    """Add per-user running statistics of the `sim` column to `df`.

    New columns: `max_sim_so_far`, `min_sim_so_far`, `max_gap_sim_so_far`
    (running max minus running min), `delta_sim` (difference to the user's
    previous row, 0 for the first) and `rolling_std_sim` (std over the user's
    last 5 rows, 0 where undefined). "So far" follows the row order of `df`.
    Modifies `df` in place and returns it.
    """
    sim_by_user = df.groupby("user_id")["sim"]

    running_max = sim_by_user.cummax()
    running_min = sim_by_user.cummin()
    df["max_sim_so_far"] = running_max
    df["min_sim_so_far"] = running_min
    df["max_gap_sim_so_far"] = running_max - running_min

    df["delta_sim"] = sim_by_user.diff().fillna(0)

    # dropping the user level restores df's own index for the assignment
    windowed_std = sim_by_user.rolling(window=5, min_periods=1).std()
    df["rolling_std_sim"] = windowed_std.reset_index(level=0, drop=True).fillna(0)
    return df

# Add the per-user similarity statistics to both splits, then save them.
cos_train = add_sim_features(df=cos_train)
cos_test = add_sim_features(df=cos_test)

cos_train.to_pickle("Features/cos_sim_features_train.pickle")
cos_test.to_pickle("Features/cos_sim_features_test.pickle")

In [None]:
# Generaget PHQ9 features
!cd extremism
!python item-scoring/item_scoring.py   --custom-dataset ../Results/train_target_with_parent.csv --text-column body --scale PHQ-9_archetype_scale --device cuda
!python item-scoring/item_scoring.py   --custom-dataset ../Results/test_target_with_parent.csv --text-column body --scale PHQ-9_archetype_scale --device cuda

In [4]:
# put features and user info together
# The frames are joined column-wise on their index.
# NOTE(review): presumably the score pickles have one row per CSV row in the
# same order — confirm, since mismatched indexes would yield NaN-padded rows.
user_info_train = pd.read_csv("Results/train_target_with_parent.csv")
phq9_train = pd.concat(
    [user_info_train,
     pd.read_pickle("Results/train_target_with_parent_PHQ-9_archetype_scale_sim.pickle")],
    axis=1,
)

user_info_test = pd.read_csv("Results/test_target_with_parent.csv")
phq9_test = pd.concat(
    [user_info_test,
     pd.read_pickle("Results/test_target_with_parent_PHQ-9_archetype_scale_sim.pickle")],
    axis=1,
)

print(len(phq9_train))

133430


In [None]:
# Add more features for PHQ9

# similarity columns, one per PHQ-9 item (0..8)
phq_cols = [
    'PHQ-9_archetype_scale.0.sim', 'PHQ-9_archetype_scale.1.sim',
    'PHQ-9_archetype_scale.2.sim', 'PHQ-9_archetype_scale.3.sim',
    'PHQ-9_archetype_scale.4.sim', 'PHQ-9_archetype_scale.5.sim',
    'PHQ-9_archetype_scale.6.sim', 'PHQ-9_archetype_scale.7.sim',
    'PHQ-9_archetype_scale.8.sim'
]


def add_phq_features(df):
    """Add the PHQ-9 summary and per-user running features to `df`.

    New columns, in this order: `phq_score` (row mean of `phq_cols`),
    `max_phq_so_far`, `min_phq_so_far`, `max_gap`, `delta_phq` (difference to
    the user's previous row, 0 for the first), `rolling_std_phq` (std over the
    user's last 5 rows, 0 where undefined), `post_index` (0-based row counter
    per user) and `max_<i>_so_far` for every item column `i`.
    "So far" follows the row order of `df`. Modifies `df` in place and
    returns it.
    """
    df["phq_score"] = df[phq_cols].mean(axis=1)

    score_by_user = df.groupby("user_id")["phq_score"]
    df["max_phq_so_far"] = score_by_user.cummax()
    df["min_phq_so_far"] = score_by_user.cummin()
    df["max_gap"] = df["max_phq_so_far"] - df["min_phq_so_far"]
    df["delta_phq"] = score_by_user.diff().fillna(0)
    # dropping the user level restores df's own index for the assignment
    df["rolling_std_phq"] = (
        score_by_user.rolling(window=5, min_periods=1).std()
        .reset_index(level=0, drop=True)
        .fillna(0)
    )
    df["post_index"] = df.groupby("user_id").cumcount()

    for col in phq_cols:
        # 'PHQ-9_archetype_scale.<i>.sim' -> '<i>'
        # (the former .replace("sim", "") on this part never changed anything)
        item_idx = col.split(".")[1]
        df[f"max_{item_idx}_so_far"] = df.groupby("user_id")[col].cummax()
    return df


# Same transformation for both splits (previously two copies of the code).
phq9_train = add_phq_features(phq9_train)
phq9_test = add_phq_features(phq9_test)

print(phq9_train.columns)
print(phq9_test.columns)
print(len(phq9_train))
print(len(phq9_test))
phq9_train.to_pickle("/u50/zhanh279/4Z03/jupyter/Features/PHQ9_features_train.pickle")
phq9_test.to_pickle("/u50/zhanh279/4Z03/jupyter/Features/PHQ9_features_test.pickle")

print(len(phq9_train))



Index(['user_id', 'target', 'type', 'title', 'body', 'created_utc',
       'submission_id', 'parent_id', 'comment_id', 'Depression', 'parent',
       'PHQ-9_archetype_scale.0.sim', 'PHQ-9_archetype_scale.1.sim',
       'PHQ-9_archetype_scale.2.sim', 'PHQ-9_archetype_scale.3.sim',
       'PHQ-9_archetype_scale.4.sim', 'PHQ-9_archetype_scale.5.sim',
       'PHQ-9_archetype_scale.6.sim', 'PHQ-9_archetype_scale.7.sim',
       'PHQ-9_archetype_scale.8.sim', 'phq_score', 'max_phq_so_far',
       'min_phq_so_far', 'max_gap', 'delta_phq', 'rolling_std_phq',
       'post_index', 'max_0_so_far', 'max_1_so_far', 'max_2_so_far',
       'max_3_so_far', 'max_4_so_far', 'max_5_so_far', 'max_6_so_far',
       'max_7_so_far', 'max_8_so_far'],
      dtype='object')
Index(['user_id', 'target', 'type', 'title', 'body', 'created_utc',
       'submission_id', 'parent_id', 'comment_id', 'Depression', 'parent',
       'PHQ-9_archetype_scale.0.sim', 'PHQ-9_archetype_scale.1.sim',
       'PHQ-9_archetype_scale.