In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import numpy as np
import gc, os
import spacy
from spacy.matcher import PhraseMatcher
from tqdm.auto import tqdm
from pandarallel import pandarallel
from sklearn.model_selection import train_test_split

In [6]:
# Each task name maps to the CSV that holds its labelled posts. All files live
# in the same directory and share the ".csv" suffix, so only the stem varies.
_DATA_DIR = 'data'
_DATASET_STEMS = (
    ('birth', 'birth_year'),
    ('extrovert', 'extrovert_introvert'),
    ('feeling', 'feeling_thinking'),
    ('gender', 'gender'),
    ('judging', 'judging_perceiving'),
    ('nationality', 'nationality'),
    ('political', 'political_leaning'),
    ('sensing', 'sensing_intuitive'),
)

# Insertion order is the order in which the datasets are processed later on.
file_map = {task: f'{_DATA_DIR}/{stem}.csv' for task, stem in _DATASET_STEMS}

In [7]:
# Load the medium English pipeline and insert a rule-based entity ruler in
# front of its "ner" component.
nlp = spacy.load("en_core_web_md")
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Single-token, case-insensitive matches for pronouns and gendered nouns.
_gender_terms = [
    "he", "him", "his",
    "she", "her", "hers",
    "they", "them",
    "boy", "girl", "man", "woman",
]
patterns = [{"label": "GENDER", "pattern": [{"LOWER": {"IN": _gender_terms}}]}]

# NOTE(review): these two rules expect "he/him" / "she/her" to arrive as ONE
# token. spaCy's default English tokenizer appears to split on "/" between
# letters, in which case they never fire and the pronouns are caught by the
# rule above instead - confirm against the tokenizer in use.
patterns.extend(
    {"label": "GENDER", "pattern": [{"LOWER": pronoun_pair}]}
    for pronoun_pair in ("he/him", "she/her")
)

# The sixteen MBTI type codes, each matched case-insensitively as one token.
mbti_labels = (
    "INFJ ENTP INTJ ENFP INFP ENFJ ISTJ ISFJ "
    "ESTJ ESFJ ISTP ISFP ESTP ESFP INTP ENTJ"
).split()
patterns.extend(
    {"label": "MBTI_TYPE", "pattern": [{"LOWER": code.lower()}]}
    for code in mbti_labels
)

ruler.add_patterns(patterns)

# Placeholder text written in place of a token judged to express the trait.
trait_labels = dict(
    extrovert='[EXTROVERT_TRAIT]',
    feeling='[EMPATHY_TRAIT]',
    judging='[JUDGE_TRAIT]',
    sensing='[SENSE_TRAIT]',
)

# One seed word per personality task; its document vector is the reference
# that token vectors are compared against when masking.
_trait_seed_words = {
    'extrovert': "extrovert",
    'feeling': "empathy",
    'judging': "judgemental",
    'sensing': "sensitive",
}
trait_vectors = {
    task: nlp(seed_word).vector
    for task, seed_word in _trait_seed_words.items()
}

def mask_doc_fast(doc, mode, sim_threshold=0.5):
    """Return the text of ``doc`` with label-revealing tokens replaced by placeholders.

    Two kinds of token are replaced:

    * Tokens whose entity type is relevant to ``mode`` (``NORP``/``GPE`` for
      ``'nationality'``, ``DATE`` for ``'birth'``, ``NORP`` for ``'political'``,
      ``GENDER`` for ``'gender'``) and, in every mode, ``MBTI_TYPE`` tokens.
      Each such token becomes ``[<ENTITY_TYPE>]``.
    * When ``mode`` has an entry in ``trait_vectors``, tokens whose vector has
      cosine similarity >= ``sim_threshold`` with that entry. Each such token
      becomes ``trait_labels[mode]``.

    Replacement is per token, so an entity spanning several tokens yields one
    placeholder per token. The whitespace that followed a replaced token is
    kept, so placeholders do not fuse with the next word.

    Args:
        doc: Iterable of spaCy-like tokens exposing ``ent_type_``,
            ``has_vector``, ``vector``, ``vector_norm``, ``text_with_ws`` and
            ``whitespace_``.
        mode: Task name selecting which entity types / trait vector apply.
        sim_threshold: Minimum cosine similarity for trait masking.

    Returns:
        The masked text as a single string.
    """
    target_vec = trait_vectors.get(mode)
    # Loop-invariant: the reference vector's norm does not depend on the token.
    target_norm = np.linalg.norm(target_vec) if target_vec is not None else 0.0

    pieces = []
    for token in doc:
        ent = token.ent_type_
        if ent and (
            ent == "MBTI_TYPE"
            or (mode == 'nationality' and ent in ("NORP", "GPE"))
            or (mode == 'birth' and ent == "DATE")
            or (mode == 'political' and ent == "NORP")
            or (mode == 'gender' and ent == "GENDER")
        ):
            # Re-attach the trailing whitespace the original token carried.
            pieces.append(f"[{ent}]{token.whitespace_}")
            continue

        if target_norm > 0.0 and token.has_vector:
            token_norm = token.vector_norm
            # A zero-norm vector has no direction; skip it rather than divide by 0.
            if token_norm > 0.0:
                sim = np.dot(token.vector, target_vec) / (token_norm * target_norm)
                if sim >= sim_threshold:
                    pieces.append(trait_labels[mode] + token.whitespace_)
                    continue

        pieces.append(token.text_with_ws)
    return "".join(pieces)

def text_generator(df, mode, target_vec):
    """Lazily yield ``(text, (mode, target_vec))`` for every row of ``df['post']``.

    Missing posts are replaced by the empty string and every value is passed
    through ``str`` before being yielded.
    """
    context = (mode, target_vec)
    posts = df['post'].fillna("")
    yield from ((str(post), context) for post in posts)

In [None]:
# Create the output tree once, up front. `exist_ok=True` makes this safe to
# re-run and also creates the parent 'output' directory.
for folder in ['train', 'val', 'test']:
    os.makedirs(f'output/{folder}', exist_ok=True)

for mode, file_path in file_map.items():
    print(f"\n--- Processing {mode.upper()} ---")
    df = pd.read_csv(file_path)
    # Missing posts become empty strings so every row yields a Doc.
    texts = df['post'].fillna("").astype(str).tolist()

    # The parser and lemmatizer are switched off while the texts are piped;
    # masking only reads entity types and token vectors.
    with nlp.select_pipes(disable=["parser", "lemmatizer"]):
        pipe = nlp.pipe(texts, n_process=8, batch_size=400)
        processed_texts = [
            mask_doc_fast(doc, mode)
            for doc in tqdm(pipe, total=len(texts), desc="Scrubbing")
        ]

    df['post_masked'] = processed_texts

    # NOTE(review): 'auhtor_ID' is spelled as in the original code; presumably
    # it mirrors the dataset's own header - verify before "correcting" it.
    if 'auhtor_ID' in df.columns:
        df = df.drop(columns=['auhtor_ID'])

    # 70 % train, then the remaining 30 % is halved into validation and test.
    train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42)
    print(f"Split {mode}: Train({len(train_df)}), Val({len(val_df)}), Test({len(test_df)})")

    train_df.to_csv(f'output/train/{mode}_train.csv', index=False)
    val_df.to_csv(f'output/val/{mode}_val.csv', index=False)
    test_df.to_csv(f'output/test/{mode}_test.csv', index=False)

    # Release this dataset's frames and text lists before loading the next one.
    del df, train_df, val_df, test_df, temp_df, texts, processed_texts
    gc.collect()


--- Processing FEELING ---


Scrubbing:   0%|          | 0/39600 [00:00<?, ?it/s]

Split feeling: Train(27720), Val(5940), Test(5940)

--- Processing GENDER ---


Scrubbing:   0%|          | 0/44635 [00:00<?, ?it/s]

Split gender: Train(31244), Val(6695), Test(6696)

--- Processing JUDGING ---


Scrubbing:   0%|          | 0/41365 [00:00<?, ?it/s]