In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-cleaned, link-free posts (one row per user).
# NOTE(review): the saved preview shows an 'Unnamed: 0' column, presumably an
# index that was written out with the CSV -- confirm whether it is needed.
df = pd.read_csv(filepath_or_buffer='../data/linkfree_combined.csv')

In [3]:
# Load the mbti_1 CSV as well.
# NOTE(review): df1 is not referenced by any other cell visible here.
df1 = pd.read_csv(filepath_or_buffer='../data/mbti_1.csv')

In [4]:
def replace_(post):
    """Return ``post`` with every '|||' separator swapped for a single space.

    Parameters
    ----------
    post : str
        Text that may contain '|||' delimiters.

    Returns
    -------
    str
        The same text with each non-overlapping '|||' replaced by ' '.
    """
    # Splitting on the delimiter and re-joining with a space is equivalent to
    # a left-to-right, non-overlapping str.replace.
    pieces = post.split('|||')
    return ' '.join(pieces)

In [5]:
# Replace the '|||' separators in every row of the text column with spaces.
df['linkfree_combined'] = df['linkfree_combined'].apply(replace_)

In [6]:
# Preview the first five rows after the separator cleanup.
df.head(n=5)

Unnamed: 0,type,linkfree_combined
0,INFJ,enfp intj moment sportscenter top ten play pra...
1,ENTP,im finding lack post alarming sex boring posit...
2,INTP,good one course say know thats blessing curse ...
3,INTJ,dear intp enjoyed conversation day esoteric ga...
4,ENTJ,youre fired thats another silly misconception ...


In [7]:
# Class distribution of the target column; bracket access avoids any
# confusion between the 'type' column and the Python builtin of the same name.
df['type'].value_counts()

INFP    1831
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

## Embed with DistilBERT

In [None]:
# For DistilBERT: choose the model class, its matching tokenizer class, and
# the name of the pretrained checkpoint to load.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

In [10]:
# Load the pretrained tokenizer first, then the pretrained model, both from
# the same checkpoint name so their vocabularies match.
tokenizer, model = (
    tokenizer_class.from_pretrained(pretrained_weights),
    model_class.from_pretrained(pretrained_weights),
)

In [11]:
# Encode every post with the tokenizer, adding the model's special tokens and
# truncating anything longer than 512 tokens.
tokenized = df.linkfree_combined.apply(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
    )
)

In [12]:
# Length of the longest encoded post (0 if there are no posts at all).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad each sequence with 0 up to max_len so that all rows have equal
# length and stack into a single rectangular array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [13]:
# Sanity check: (number of posts, padded sequence length).
padded.shape

(8674, 512)

In [14]:
# Build the mask from the padded ids: 0 wherever the id is the padding value
# 0, and 1 everywhere else.
attention_mask = np.where(padded == 0, 0, 1)
attention_mask.shape

(8674, 512)

In [15]:
# Copy the NumPy arrays into torch tensors for the model.
# Note: attention_mask is rebound from the array to its tensor form here.
input_ids, attention_mask = torch.tensor(padded), torch.tensor(attention_mask)

In [None]:
# Run the model in mini-batches instead of one forward pass over the whole
# dataset. A single call has to hold the activations for every token of every
# post at once, which can exhaust memory on the full corpus.
BATCH_SIZE = 32  # posts per forward pass; lower this if memory is still tight

cls_chunks = []
with torch.no_grad():  # inference only, so no autograd graph is needed
    for start in range(0, input_ids.shape[0], BATCH_SIZE):
        stop = start + BATCH_SIZE
        batch_out = model(
            input_ids[start:stop],
            attention_mask=attention_mask[start:stop],
        )
        # Only position 0 of each sequence is used downstream. Keep it as a
        # length-1 slice and clone it so the full per-batch hidden-state
        # tensor can be freed instead of being kept alive by a view.
        cls_chunks.append(batch_out[0][:, :1, :].clone())

# Wrap in a 1-tuple so `last_hidden_states[0][:, 0, :]` still works:
# element 0 has shape (num_posts, 1, hidden_dim) and holds only position 0.
last_hidden_states = (torch.cat(cls_chunks, dim=0),)

In [None]:
# Take the vector at sequence position 0 of the first model output for every
# post and convert it to a NumPy array for scikit-learn.
features = last_hidden_states[0][:, 0].numpy()

In [None]:
# Target variable: the personality-type label of each post.
labels = df['type']

## Pass DistilBERT embeddings into a logistic regression model for final classification

In [None]:
# Hold out 30% of the data for testing. The classes are heavily imbalanced
# (the rarest type has only 39 posts versus 1831 for the most common), so the
# split is stratified on the labels to keep every class in both partitions at
# roughly its overall proportion.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=822,
    stratify=labels,
)

In [None]:
# Logistic regression on the embeddings, with class weights inversely
# proportional to class frequency to counter the label imbalance.
# max_iter is raised above the library default because the solver may
# otherwise stop before converging, and the notebook-wide
# warnings.filterwarnings('ignore') would hide the resulting warning.
log_model = LogisticRegression(class_weight='balanced', max_iter=1000)
log_model.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the held-out split and report it.
results = log_model.score(X=X_test, y=y_test)
print(results)