In [10]:
import snscrape.modules.twitter as sntwitter

def get_tweets(username, num_tweets=200):
    tweets = []
    for i, tweet in enumerate(sntwitter.TwitterUserScraper(username).get_items()):
        if i >= num_tweets:
            break
        tweets.append(tweet.content)
    return " ".join(tweets)


In [11]:
import pandas as pd

df = pd.read_csv('mbti_1.csv')
df['clean_text'] = df['posts'].str.lower()


In [12]:
import re

df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r"http\S+|www\S+|https\S+", '', x))
df['clean_text'] = df['clean_text'].apply(lambda x: re.sub(r"[^a-z\s]", '', x))


In [13]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
df['clean_text'] = df['clean_text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

mbti_types = ['infj','intj','intp','infp','entp','enfp','entj','enfj',
              'istj','isfj','estj','esfj','istp','isfp','estp','esfp']
df['clean_text'] = df['clean_text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in mbti_types])
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/agamsingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
labels = df['type']
texts = df['clean_text']

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Evaluate
y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

        ENFJ       0.60      0.08      0.14        38
        ENFP       0.57      0.32      0.41       135
        ENTJ       0.25      0.02      0.04        46
        ENTP       0.51      0.32      0.39       137
        ESFJ       0.00      0.00      0.00         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       0.00      0.00      0.00         8
        ESTP       0.00      0.00      0.00        18
        INFJ       0.49      0.57      0.53       294
        INFP       0.45      0.75      0.56       366
        INTJ       0.48      0.50      0.49       218
        INTP       0.51      0.70      0.59       261
        ISFJ       1.00      0.03      0.06        33
        ISFP       0.83      0.09      0.17        54
        ISTJ       0.50      0.02      0.05        41
        ISTP       0.74      0.21      0.33        67

    accuracy                           0.49      1735
   macro avg       0.43   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
def predict_personality(text):
    cleaned = clean_text(text)  # use the same cleaning steps
    vec = vectorizer.transform([cleaned])
    return model.predict(vec)[0]
