In [1]:
pip uninstall nltk -y


Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import shutil

shutil.rmtree('/root/nltk_data', ignore_errors=True)
shutil.rmtree('/usr/share/nltk_data', ignore_errors=True)
shutil.rmtree('/usr/local/share/nltk_data', ignore_errors=True)
shutil.rmtree('/usr/lib/nltk_data', ignore_errors=True)


In [3]:
%pip install nltk --upgrade
%pip install xgboost


Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
import nltk

try:
    from nltk.tokenize import word_tokenize
    # Try using word_tokenize to see if the resource is available
    word_tokenize("Hello, this is a test sentence.")
except LookupError:
    # If the resource is not found, download it
    nltk.download('punkt_tab')
    from nltk.tokenize import word_tokenize  # Import it again after downloading
    print(word_tokenize("Hello, this is a test sentence.")) #testing



In [6]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# === Step 1: Download required NLTK resources ===
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# === Step 2: Load dataset ===
ds = pd.read_csv("mbti_1.csv", engine="python")

# One-hot encode the 4 MBTI traits
ds["I_E"] = ds["type"].apply(lambda x: 1 if x[0] == 'I' else 0)
ds["N_S"] = ds["type"].apply(lambda x: 1 if x[1] == 'S' else 0)
ds["T_F"] = ds["type"].apply(lambda x: 1 if x[2] == 'T' else 0)
ds["J_P"] = ds["type"].apply(lambda x: 1 if x[3] == 'J' else 0)

# === Step 3: Clean and preprocess posts ===
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r'\|\|\|', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(stemmer.stem(w)) for w in tokens if w not in stop_words and len(w) > 2]
    return ' '.join(tokens)

ds["clean_posts"] = ds["posts"].apply(clean_text)
ds = ds[ds["clean_posts"].notna()]

# === Step 4: Feature extraction with CountVectorizer ===
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(ds["clean_posts"]).toarray()
y = ds[["I_E", "N_S", "T_F", "J_P"]].values

# === Step 5: Train-test split (80:20) ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Step 6: Train and evaluate XGBoost classifiers ===
models = []
trait_names = ["Introvert vs Extrovert", "Sensing vs Intuition", "Thinking vs Feeling", "Perceiving vs Judging"]

for i in range(4):
    model = XGBClassifier()
    model.fit(X_train, y_train[:, i])
    preds = model.predict(X_test)
    acc = accuracy_score(y_test[:, i], preds) * 100
    print(f"{trait_names[i]} Accuracy: {acc:.2f}%")
    models.append(model)

# === Step 7: Save models and vectorizer ===
pickle.dump(models[0], open("xgb_introv.pkl", "wb"))
pickle.dump(models[1], open("xgb_sens.pkl", "wb"))
pickle.dump(models[2], open("xgb_think.pkl", "wb"))
pickle.dump(models[3], open("xgb_perc.pkl", "wb"))
pickle.dump(cv, open("cv.pkl", "wb"))
print("✅ All models and CountVectorizer saved.")

# === Step 8: Final prediction function ===
def final_type(predictions):
    mbti_map = [["E", "I"], ["N", "S"], ["T", "F"], ["J", "P"]]
    return "".join([mbti_map[i][int(p[0])] for i, p in enumerate(predictions)])

def predict_personality(text_input):
    # Load vectorizer and models
    cv_loaded = pickle.load(open("cv.pkl", "rb"))
    model_i = pickle.load(open("xgb_introv.pkl", "rb"))
    model_s = pickle.load(open("xgb_sens.pkl", "rb"))
    model_t = pickle.load(open("xgb_think.pkl", "rb"))
    model_p = pickle.load(open("xgb_perc.pkl", "rb"))

    # Clean and vectorize input
    cleaned = clean_text(text_input)
    vectorized = cv_loaded.transform([cleaned])

    # Predict each trait
    predictions = [
        model_i.predict(vectorized),
        model_s.predict(vectorized),
        model_t.predict(vectorized),
        model_p.predict(vectorized),
    ]
    return final_type(predictions)

# === Step 9: Example usage ===
if __name__ == "__main__":
    test_input = """I love reading and writing about philosophical ideas, and I enjoy spending time alone reflecting."""
    result = predict_personality(test_input)
    print(f"🧠 Predicted MBTI Type: {result}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Introvert vs Extrovert Accuracy: 86.97%
Sensing vs Intuition Accuracy: 91.24%
Thinking vs Feeling Accuracy: 84.50%
Perceiving vs Judging Accuracy: 80.92%
✅ All models and CountVectorizer saved.
🧠 Predicted MBTI Type: ESTJ
