In [1]:
import pandas as pd
from preproc import clean_posts, stop_word, filter_by_word_count, add_mbti_binary_columns

In [3]:
df = pd.read_csv('../raw_data/merge_and_clean.csv')

In [4]:
df.head()

Unnamed: 0,type,clean_text,EI,SN,TF,JP,text_length_words
0,INFJ,'. and moments sportscenter not top ten plays ...,0,0,0,1,575
1,ENTP,'i'm finding the lack of me in these posts ver...,1,0,1,0,1191
2,INTP,"'good one. of course, to which i say i know th...",0,0,1,0,861
3,INTJ,"'dear , i enjoyed our conversation the other d...",0,0,1,1,1105
4,ENTJ,'you're fired.. that's another silly misconcep...,1,0,1,1,1000


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [7]:
# TF-IDF
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 1),
    stop_words=list(ENGLISH_STOP_WORDS)
)
X = vectorizer.fit_transform(df["clean_text"])
y_ie = df["EI"]

In [8]:
# Split
X_train_ie, X_test_ie, y_train_ie, y_test_ie = train_test_split(X, y_ie, test_size=0.2, random_state=42)

In [9]:
# SMOTETomek
smote_tomek = SMOTETomek()
X_res, y_res = smote_tomek.fit_resample(X_train_ie, y_train_ie)

In [None]:
# Train model
model_ie = XGBClassifier(eval_metric='logloss')
model_ie.fit(X_res, y_res)

In [12]:
# Evaluation
y_pred = model_ie.predict(X_test_ie)
print(confusion_matrix(y_test_ie, y_pred))
print(classification_report(y_test_ie, y_pred, digits=3))

[[2117  280]
 [ 694  207]]
              precision    recall  f1-score   support

           0      0.753     0.883     0.813      2397
           1      0.425     0.230     0.298       901

    accuracy                          0.705      3298
   macro avg      0.589     0.556     0.556      3298
weighted avg      0.663     0.705     0.672      3298



In [14]:
y_proba = model_ie.predict_proba(X_test_ie)[:, 1]

for t in [0.2, 0.3, 0.4, 0.5]:
    y_pred_thresh = (y_proba > t).astype(int)
    f1_0 = f1_score(y_test_ie, y_pred_thresh, pos_label=0)
    f1_1 = f1_score(y_test_ie, y_pred_thresh, pos_label=1)
    macro = f1_score(y_test_ie, y_pred_thresh, average='macro')
    print(f"Threshold {t:.1f} | F1-0: {f1_0:.3f}, F1-1: {f1_1:.3f}, Macro-F1: {macro:.3f}")

Threshold 0.2 | F1-0: 0.596, F1-1: 0.463, Macro-F1: 0.529
Threshold 0.3 | F1-0: 0.730, F1-1: 0.420, Macro-F1: 0.575
Threshold 0.4 | F1-0: 0.787, F1-1: 0.359, Macro-F1: 0.573
Threshold 0.5 | F1-0: 0.813, F1-1: 0.298, Macro-F1: 0.556


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Vectorize
vectorizer = TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1, 2))
X_tfidf = vectorizer.fit_transform(df["clean_text"])
y = df["EI"]

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce dimensionality to 300 components (you can tune this)
svd = TruncatedSVD(n_components=300, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

print(f"Original shape: {X_tfidf.shape} → Reduced shape: {X_svd.shape}")

Original shape: (16486, 20000) → Reduced shape: (16486, 300)


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_svd, y, test_size=0.2, random_state=42)

# Optionally apply SMOTETomek
smote_tomek = SMOTETomek()
X_res, y_res = smote_tomek.fit_resample(X_train, y_train)

In [20]:
model = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
model.fit(X_res, y_res)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [21]:
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.3).astype(int)  # try best threshold

print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.793     0.647     0.713      2397
           1      0.370     0.552     0.443       901

    accuracy                          0.621      3298
   macro avg      0.582     0.599     0.578      3298
weighted avg      0.678     0.621     0.639      3298



In [None]:
# Reduce dimensionality to 300 components (you can tune this)
svd = TruncatedSVD(n_components=10000, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

print(f"Original shape: {X_tfidf.shape} → Reduced shape: {X_svd.shape}")

X_train, X_test, y_train, y_test = train_test_split(X_svd, y, test_size=0.2, random_state=42)

# Optionally apply SMOTETomek
smote_tomek = SMOTETomek()
X_res, y_res = smote_tomek.fit_resample(X_train, y_train)

model = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
model.fit(X_res, y_res)

y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.3).astype(int)  # try best threshold

print(classification_report(y_test, y_pred, digits=3))