In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [8]:
MBTI = pd.read_csv('MBTI_balanced.csv')
MBTI.head()

Unnamed: 0.1,Unnamed: 0,posts,type
0,0,laptop barrier freeze counsel sibling yay istp...,ENFJ
1,1,deem rant grateful whats yay butt sarcasm uh b...,ENFJ
2,2,lover gut sibling aww cash usa submit istp ove...,ENFJ
3,3,yo radio gut sx cue harmony reminder doms rout...,ENFJ
4,4,partly sacrifice gf sarcasm attachment escape ...,ENFJ


In [9]:
X = MBTI['posts'] # features
y = MBTI['type']  # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
vectorizer = TfidfVectorizer(analyzer='word', max_features=3000, lowercase=False)
X_train_tfidf = vectorizer.fit_transform(X_train.apply(lambda x: np.str_(x)))

In [11]:
from sklearn.metrics import classification_report

text_clf_log = Pipeline([('tfidf',TfidfVectorizer()),('clf',LogisticRegression(max_iter = 1000))])
text_clf_log.fit(X_train.apply(lambda x: np.str_(x)), y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('clf', LogisticRegression(max_iter=1000))])

In [12]:
predictions = text_clf_log.predict(X_test.apply(lambda x: np.str_(x)))
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        ENFJ       0.98      0.98      0.98      5091
        ENFP       0.91      0.90      0.90      4978
        ENTJ       0.97      0.96      0.97      4909
        ENTP       0.89      0.87      0.88      4971
        ESFJ       1.00      1.00      1.00      4953
        ESFP       1.00      1.00      1.00      4976
        ESTJ       1.00      1.00      1.00      4934
        ESTP       0.99      0.99      0.99      4984
        INFJ       0.86      0.84      0.85      5111
        INFP       0.85      0.84      0.85      5018
        INTJ       0.81      0.82      0.82      4902
        INTP       0.80      0.84      0.82      5088
        ISFJ       0.99      0.99      0.99      5048
        ISFP       0.98      0.99      0.98      5039
        ISTJ       0.99      0.99      0.99      4939
        ISTP       0.98      0.96      0.97      4935

    accuracy                           0.94     79876
   macro avg       0.94   