In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report



In [13]:
# Step 1: Load Dataset
mbti_df = pd.read_csv("D:/MBTI minor project/TEST_TEST/filename_aug_dataset.csv")
mbti_df.head()
mbti_df.Keyword[0]
mbti_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Type     32000 non-null  object
 1   Keyword  32000 non-null  object
dtypes: object(2)
memory usage: 500.1+ KB


In [14]:
# Step 2: Encode labels (MBTI types)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["Type"])



In [15]:
# Step 3: Vectorize Keywords using TF-IDF with n-grams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # unigrams + bigrams
X = vectorizer.fit_transform(df["Keyword"])



In [16]:
# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



In [17]:
# Step 5: Define models
models = {
    "RandomForest": RandomForestClassifier(),
    "NaiveBayes": MultinomialNB(),
    "SVM": LinearSVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")
}



In [18]:
# Step 6: Train and Evaluate Each Model
for name, model in models.items():
    print(f"\nTraining: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    
    print(f"{name} Classification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Training: RandomForest
RandomForest Accuracy: 0.6150
RandomForest Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.68      0.66      0.67       400
        ENFP       0.75      0.49      0.59       400
        ENTJ       0.64      0.61      0.62       400
        ENTP       0.69      0.69      0.69       400
        ESFJ       0.51      0.65      0.57       400
        ESFP       0.56      0.69      0.62       400
        ESTJ       0.59      0.52      0.55       400
        ESTP       0.64      0.68      0.66       400
        INFJ       0.83      0.64      0.72       400
        INFP       0.77      0.59      0.67       400
        INTJ       0.53      0.57      0.55       400
        INTP       0.60      0.67      0.63       400
        ISFJ       0.64      0.59      0.61       400
        ISFP       0.57      0.60      0.59       400
        ISTJ       0.51      0.66      0.57       400
        ISTP       0.57      0.53      0.55  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.6150
XGBoost Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.70      0.64      0.66       400
        ENFP       0.70      0.54      0.61       400
        ENTJ       0.64      0.63      0.64       400
        ENTP       0.72      0.69      0.70       400
        ESFJ       0.50      0.61      0.55       400
        ESFP       0.57      0.66      0.61       400
        ESTJ       0.59      0.52      0.55       400
        ESTP       0.65      0.68      0.67       400
        INFJ       0.86      0.62      0.72       400
        INFP       0.75      0.60      0.67       400
        INTJ       0.53      0.57      0.55       400
        INTP       0.60      0.67      0.63       400
        ISFJ       0.63      0.60      0.61       400
        ISFP       0.53      0.61      0.57       400
        ISTJ       0.51      0.67      0.58       400
        ISTP       0.57      0.54      0.55       400

    accuracy            

In [19]:
import joblib

# Save your best model and vectorizer
joblib.dump(models["XGBoost"], "mbti_model_xgboost.pkl")
joblib.dump(vectorizer, "mbti_vectorizer.pkl")
joblib.dump(label_encoder, "mbti_label_encoder.pkl")

['mbti_label_encoder.pkl']