# 6. Multiple Models Analysis

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

df = pd.read_csv("../assets/raw/essaytrain.csv", encoding="latin1")

traits = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']

df[traits] = df[traits].replace({'y': 1, 'n': 0}).infer_objects(copy=False)

X = df['TEXT']
y = df[traits]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english'
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



  df[traits] = df[traits].replace({'y': 1, 'n': 0}).infer_objects(copy=False)


In [5]:
logreg_model = OneVsRestClassifier(
    LogisticRegression(max_iter=1000)
)

logreg_model.fit(X_train_tfidf, y_train)
y_pred_logreg = logreg_model.predict(X_test_tfidf)


In [6]:
svm_model = OneVsRestClassifier(
    LinearSVC()
)

svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)



In [7]:
nb_model = OneVsRestClassifier(
    MultinomialNB()
)

nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)


In [9]:
import os
import pandas as pd
from sklearn.metrics import f1_score

# Traits
traits = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']

# Model predictions dictionary
model_predictions = {
    'Logistic Regression': y_pred_logreg,
    'Linear SVM': y_pred_svm,
    'Naive Bayes': y_pred_nb
}

# Collect results
results = []

for model_name, preds in model_predictions.items():
    for idx, trait in enumerate(traits):
        f1 = f1_score(y_test.iloc[:, idx], preds[:, idx])
        results.append({
            'Model': model_name,
            'Trait': trait,
            'F1_Score': round(f1, 4)
        })

# Create DataFrame
results_df = pd.DataFrame(results)

# Display results
print(results_df)

# =========================
# Save results to assets
# =========================
output_dir = r'C:\Users\Ramphani\OneDrive\Desktop\Personality traits\assets\results'
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, 'model_comparison_f1_scores.csv')
results_df.to_csv(output_path, index=False)

print(f"Model comparison results saved at:\n{output_path}")


                  Model Trait  F1_Score
0   Logistic Regression  cOPN    0.6250
1   Logistic Regression  cCON    0.5969
2   Logistic Regression  cEXT    0.5649
3   Logistic Regression  cAGR    0.6615
4   Logistic Regression  cNEU    0.5528
5            Linear SVM  cOPN    0.5666
6            Linear SVM  cCON    0.5649
7            Linear SVM  cEXT    0.5570
8            Linear SVM  cAGR    0.5917
9            Linear SVM  cNEU    0.5147
10          Naive Bayes  cOPN    0.5594
11          Naive Bayes  cCON    0.6404
12          Naive Bayes  cEXT    0.6275
13          Naive Bayes  cAGR    0.6621
14          Naive Bayes  cNEU    0.6011
Model comparison results saved at:
C:\Users\Ramphani\OneDrive\Desktop\Personality traits\assets\results\model_comparison_f1_scores.csv
