In [14]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset
import joblib
import os

In [15]:
from google.colab import files
uploaded = files.upload()

Saving mbti_chunked_clean.csv to mbti_chunked_clean (1).csv


In [16]:
df = pd.read_csv('mbti_chunked_clean.csv', index_col=0)
df = df[['text', 'label']]
df.head()

Unnamed: 0_level_0,text,label
chunk_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,the pope is infallible this is a catholic dogm...,intj
2,martin said that george floyd was in heaven it...,intj
3,while supporting abortion lol abortion again i...,intj
4,views on predestination exist in the catholic ...,intj
5,a little kitty cat i mean the real dangerousne...,intj


In [17]:
df.shape

(33180, 2)

In [18]:
df = df.groupby('label', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 2000 // df['label'].nunique()), random_state=42)
    ).reset_index(drop=True)
df.shape

  df = df.groupby('label', group_keys=False).apply(


(2000, 2)

In [19]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
enfj,125
enfp,125
entj,125
entp,125
esfj,125
esfp,125
estj,125
estp,125
infj,125
infp,125


In [20]:
X = df.text
y = df.label

In [21]:
from tqdm import tqdm

In [22]:
# 1. Data
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
le = LabelEncoder()
y_train = le.fit_transform(y_train_raw)
y_test = le.transform(y_test_raw)

# 2. Tokenization
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_enc = tokenizer(list(X_train), padding=True, truncation=True, return_tensors="pt", max_length=256)
test_enc = tokenizer(list(X_test), padding=True, truncation=True, return_tensors="pt", max_length=256)

train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)

train_dataset = TensorDataset(train_enc["input_ids"], train_enc["attention_mask"], train_labels)
test_dataset = TensorDataset(test_enc["input_ids"], test_enc["attention_mask"], test_labels)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4)

# 3. Model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=16)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# 4. Training
model.train()
for epoch in range(3):
    print(f"Epoch {epoch+1}/{3}")
    for input_ids, attention_mask, labels in tqdm(train_loader, desc="Training"):
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1} done.")

# 5. Save
model.save_pretrained("mbti-bert")
tokenizer.save_pretrained("mbti-bert")
joblib.dump(le, "mbti-bert/label_encoder.pkl")

# 6. Evaluate
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())

print(classification_report(all_labels, all_preds, target_names=le.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training: 100%|██████████| 400/400 [08:47<00:00,  1.32s/it]


Epoch 1 done.
Epoch 2/3


Training: 100%|██████████| 400/400 [08:42<00:00,  1.31s/it]


Epoch 2 done.
Epoch 3/3


Training: 100%|██████████| 400/400 [08:43<00:00,  1.31s/it]


Epoch 3 done.
              precision    recall  f1-score   support

        enfj       0.00      0.00      0.00        25
        enfp       0.00      0.00      0.00        25
        entj       0.00      0.00      0.00        25
        entp       0.00      0.00      0.00        25
        esfj       0.00      0.00      0.00        25
        esfp       0.00      0.00      0.00        25
        estj       0.00      0.00      0.00        25
        estp       0.00      0.00      0.00        25
        infj       0.00      0.00      0.00        25
        infp       0.00      0.00      0.00        25
        intj       0.00      0.00      0.00        25
        intp       0.06      1.00      0.12        25
        isfj       0.00      0.00      0.00        25
        isfp       0.00      0.00      0.00        25
        istj       0.00      0.00      0.00        25
        istp       0.00      0.00      0.00        25

    accuracy                           0.06       400
   macro avg

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
