# 4. Model Building

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the raw essay dataset and take a first look at the train/test split sizes.
df = pd.read_csv('../assets/raw/essaytrain.csv', encoding='latin1')

# Trait label columns in O-C-E-A-N order, so that column positions of `y`
# agree with the model-building cell that follows.
traits = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']

# Work on a copy so `df` stays exactly as loaded.
# NOTE: despite the name, the labels are not encoded in this cell; the
# y/n -> 1/0 conversion happens in the model-building cell.
df_numeric = df.copy()
X = df_numeric['TEXT']          # input text
y = df_numeric[traits]          # OCEAN traits

# 80/20 hold-out split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f'Training set size: {X_train.shape[0]} samples')
print(f'Test set size: {X_test.shape[0]} samples')


Training set size: 1184 samples
Test set size: 296 samples


In [2]:
# =========================
# STEP 4: MODEL BUILDING
# Predicting OCEAN personality traits from essay text
# =========================

from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

# =========================
# 1️⃣ Load Dataset & Traits
# =========================
df = pd.read_csv('../assets/raw/essaytrain.csv', encoding='latin1')
traits = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']

# Convert y/n to 1/0
# Each label column is mapped explicitly and then cast to int.
# DataFrame.replace() on these object columns relies on pandas silently
# downcasting the result, which is deprecated and emits a warning; without the
# downcast the labels would stay `object` dtype instead of becoming integers.
df_numeric = df.copy()
label_codes = {'y': 1, 'n': 0}
encoded_labels = df_numeric[traits].apply(lambda col: col.map(label_codes))

# Series.map() turns anything outside {'y', 'n'} (including missing values)
# into NaN. Stop here with a clear message rather than failing later in the
# integer cast or inside the classifier.
unmapped_columns = encoded_labels.columns[encoded_labels.isna().any()].tolist()
if unmapped_columns:
    raise ValueError(
        f"Unexpected or missing trait labels (expected 'y'/'n') in columns: {unmapped_columns}"
    )
df_numeric[traits] = encoded_labels.astype(int)

# =========================
# 2️⃣ Prepare Features and Target
# =========================
X = df_numeric['TEXT']
y = df_numeric[traits]

# Train-test split (80/20; fixed random_state for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# =========================
# 3️⃣ TF-IDF Vectorization
# =========================
# The vocabulary and IDF weights are fitted on the training essays only; the
# test essays are transformed with that fitted vectorizer so no test-set
# statistics leak into the features.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# =========================
# 4️⃣ Multi-label Logistic Regression
# =========================
# MultiOutputClassifier fits one independent LogisticRegression per trait column.
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_tfidf, y_train)

# =========================
# 5️⃣ Make Predictions
# =========================
y_pred = model.predict(X_test_tfidf)

# =========================
# 6️⃣ Evaluate Model (Trait by Trait)
# =========================
# Column i of y_pred corresponds to traits[i] because y was built as
# df_numeric[traits].
print("=== MODEL EVALUATION ===\n")
for i, trait in enumerate(traits):
    print(f"=== {trait} ===")
    print(classification_report(y_test[trait], y_pred[:, i]))
    print("\n")

# =========================
# ✅ Optional: Save predictions
# =========================
# Rows follow the order of X_test. The CSV is written without an index, so it
# carries no essay identifier.
y_pred_df = pd.DataFrame(y_pred, columns=traits)

# NOTE(review): the file name says "step5" although this is the Step 4
# notebook; kept unchanged in case later steps read this exact path.
predictions_path = Path('../assets/processed/predictions_step5.csv')
# Create the output directory if needed so to_csv does not fail on a fresh checkout.
predictions_path.parent.mkdir(parents=True, exist_ok=True)
y_pred_df.to_csv(predictions_path, index=False)

print("Predictions saved successfully!")



  df_numeric[traits] = df_numeric[traits].replace({'y':1, 'n':0})


=== MODEL EVALUATION ===

=== cOPN ===
              precision    recall  f1-score   support

           0       0.65      0.56      0.60       155
           1       0.58      0.67      0.62       141

    accuracy                           0.61       296
   macro avg       0.62      0.62      0.61       296
weighted avg       0.62      0.61      0.61       296



=== cCON ===
              precision    recall  f1-score   support

           0       0.54      0.49      0.51       140
           1       0.57      0.62      0.60       156

    accuracy                           0.56       296
   macro avg       0.55      0.55      0.55       296
weighted avg       0.56      0.56      0.56       296



=== cEXT ===
              precision    recall  f1-score   support

           0       0.53      0.52      0.53       143
           1       0.56      0.57      0.56       153

    accuracy                           0.55       296
   macro avg       0.55      0.55      0.55       296
weigh

# Step 4 Summary: Personality Trait Prediction
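Using TF-IDF vectorization on essay text, we trained a multi-label Logistic Regression model (MultiOutputClassifier) to predict all five OCEAN traits simultaneously. Each trait’s performance was evaluated individually with precision, recall, and F1-score, and predictions were saved for further analysis. On the 296-essay test set, accuracy was modest (for example, 0.61 for cOPN, 0.56 for cCON, and 0.55 for cEXT), only slightly above chance for these roughly balanced binary labels. This step therefore provides a baseline for trait-level classification from textual essays rather than a highly accurate classifier.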