# Replicate previous paper

[This GitHub repository](https://github.com/yashsmehta/personality-prediction#predicting-personality-on-unseen-text) contains the code for the paper [Bottom-Up and Top-Down: Predicting Personality with Psycholinguistic and Language Model Features](https://ieeexplore.ieee.org/document/9338428), in which the authors propose a novel deep-learning-based model that integrates traditional psycholinguistic features with language model embeddings to predict personality on the Essays dataset (Big Five) and the Kaggle dataset (MBTI).

We use this paper's results as the baseline for personality-detection model performance.

In [1]:
"""
Try this code only at first time
"""
# Clone the GitHub repo into the Google Drive working folder
#!git clone 'https://github.com/Yuta555/personality-prediction'

Cloning into 'personality-prediction'...
remote: Enumerating objects: 962, done.[K
remote: Counting objects: 100% (962/962), done.[K
remote: Compressing objects: 100% (364/364), done.[K
remote: Total 962 (delta 582), reused 945 (delta 580), pack-reused 0[K
Receiving objects: 100% (962/962), 53.49 MiB | 6.10 MiB/s, done.
Resolving deltas: 100% (582/582), done.
Updating files: 100% (58/58), done.


In [2]:
!pip install -q -U transformers tweet-preprocessor sentencepiece python-dotenv

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import warnings

In [None]:
# Switch the working directory to the cloned repository: the cells below use paths
# relative to it (data/kaggle/..., pkl_data, explogs/...) and run its scripts.
# Keep this comment on its own lines; text after the path would be read as part of it.
%cd personality-prediction

## Split data into train/test dataset

In [4]:
SEED = 42  # fixed random_state so the train/test split is reproducible

df = pd.read_csv('data/kaggle/kaggle.csv')

# Hold out 10% of the rows as the test set, then give each part a fresh 0..n-1 index.
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.1, random_state=SEED)
)

# Persist both splits next to the source CSV (without the index column).
train_df.to_csv('data/kaggle/kaggle_train.csv', index=False)
test_df.to_csv('data/kaggle/kaggle_test.csv', index=False)

## Extract features from text data using BERT model

In [6]:
# Run the repository's LM_extractor.py on the Kaggle data with the 'bert-base' embedder,
# token length 512 and batch size 32; -op_dir names the output directory ('pkl_data').
# NOTE(review): -kaggle_train True presumably selects the kaggle_train.csv split written
# above rather than the full dataset; confirm in LM_extractor.py.
!python LM_extractor.py -dataset_type 'kaggle' -token_length 512 -batch_size 32 -embed 'bert-base' -op_dir 'pkl_data' -kaggle_train True

GPU found ( Tesla T4 )
num device avail:  1

kaggle | bert-base | 512 | 512_head | cls

Downloading (…)lve/main/config.json: 100% 570/570 [00:00<00:00, 3.04MB/s]
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/1dbc166cf8765166998eff31ade2eb64c8a40076/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Downloading mo

## Fine-tune detection model (MLP)

In [7]:
# Run the repository's MLP fine-tuning script on the "kaggle" dataset.
# NOTE(review): -save_model "yes" presumably persists the trained model so the
# prediction step can load it (it uses finetune_model = "mlp_lm"); confirm in MLP_LM.py.
!python finetune_models/MLP_LM.py -dataset "kaggle" -save_model "yes"

2023-11-12 23:09:06.877705: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-12 23:09:06.877753: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-12 23:09:06.877795: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
kaggle : bert-base : 11 : 512_head : cls
  saving_api.save_model(
{'acc': [76.82458162307739, 77.59283185005188, 77.08066701889038, 77.46478915214539, 76.69654488563538, 77.20870971679688, 77.20870971679688, 76.92307829856873, 77.43589878082275, 76.92307829856873, 86.29961609840393, 86.17157340049744, 86.17157340049744, 86.17157340049744, 86.17157340049744, 86

## Predict personality on test dataset

In [None]:
#%cd personality-prediction

In [13]:
# Import `predict` as a top-level module. A relative import (`from .unseen_predictor ...`)
# raises "attempted relative import with no known parent package" in a notebook, because
# the kernel's __main__ module does not belong to a package.
# Assumes unseen_predictor.py is in the current working directory (the repo root after
# `%cd personality-prediction`); verify if the notebook is run from elsewhere.
from unseen_predictor import predict

# Settings passed to predict(); they mirror the feature-extraction and fine-tuning runs.
embed = "bert-base"
op_dir = "pkl_data/"
token_length = 512
finetune_model = "mlp_lm"
dataset = "kaggle"

# Register `progress_apply` on pandas objects so long-running applies show a progress bar.
tqdm.pandas()

def labeling(pred_dict):
    """Turn per-letter scores into a four-letter type string.

    Args:
        pred_dict: Mapping that must contain the keys 'E', 'N', 'F' and 'J',
            each comparable against 0.5.

    Returns:
        A 4-character string. For each position, the first letter of the pair
        (E/I, N/S, F/T, J/P) is used when its score is >= 0.5, otherwise the second.

    Raises:
        KeyError: If any of the four keys is missing from ``pred_dict``.
    """
    # (letter used when the score is >= 0.5, letter used otherwise), in output order.
    letter_pairs = (("E", "I"), ("N", "S"), ("F", "T"), ("J", "P"))
    return "".join(
        above if pred_dict[above] >= 0.5 else below
        for above, below in letter_pairs
    )

df_test = pd.read_csv('data/kaggle/kaggle_test.csv')

# One predict() call per row of the 'text' column; progress_apply shows a progress bar.
preds = df_test['text'].progress_apply(
    lambda text: predict(text, embed, op_dir, token_length, finetune_model, dataset)
)

# Collapse each prediction into its four-letter label and name the resulting column.
pred_labels = preds.apply(labeling).rename("pred_label")

pred_labels.to_csv('explogs/test_prediction.csv', index=False)

100%|██████████| 868/868 [36:02<00:00,  2.49s/it]


## Evaluate the result

In [17]:
from collections import defaultdict

# Reference labels come from the held-out test split; predictions from the file written
# by the prediction step. Both are read from disk, so this cell can run on a fresh kernel.
ref_labels = pd.read_csv('data/kaggle/kaggle_test.csv')['type']
pred_labels = pd.read_csv('explogs/test_prediction.csv')['pred_label']

# zip() below stops at the shorter series, which would silently skew the accuracies,
# so fail loudly if the two files are out of sync.
assert len(ref_labels) == len(pred_labels), (
    f"label count mismatch: {len(ref_labels)} references vs {len(pred_labels)} predictions"
)

# Correct-prediction count per letter position (0..3). Every position is seeded with 0 so
# that one with no correct predictions is still reported instead of being dropped.
cor_dict = defaultdict(int, {i: 0 for i in range(4)})
multi_cor_list = []  # 1 if all four positions are correct for a sample, else 0

for ref, pred in zip(ref_labels, pred_labels):
    matches = [ref[i] == pred[i] for i in range(4)]
    for i, hit in enumerate(matches):
        cor_dict[i] += int(hit)
    multi_cor_list.append(int(all(matches)))

total = len(ref_labels)
acc_dict = {idx: round(cor / total, 4) for idx, cor in sorted(cor_dict.items())}
print(f"Accuracy for each dimension: {acc_dict}")

# A sample counts as correct here only if the whole four-letter label matches.
multi_acc = round(sum(multi_cor_list) / total, 4)
print(f"Accuracy for 16 class classification: {multi_acc}")

Accuracy for each dimension: {0: 0.7247, 1: 0.8606, 2: 0.7166, 3: 0.621}
Accuracy for 16 class classification: 0.2938
