# 5. Results analysis and visualization

In [6]:
# =========================
# STEP 6: MODEL RESULTS ANALYSIS & VISUALIZATION (Saved Plots)
# =========================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# =========================
# Load Dataset
# =========================
# Read the raw essays CSV and list the five trait label columns.
df = pd.read_csv('../data/raw/essaytrain.csv', encoding='latin1')
traits = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']

# Encode the 'y'/'n' labels as 1/0 with an explicit per-column map instead of
# DataFrame.replace: replace() on object columns relies on silent downcasting,
# which pandas has deprecated (it emits a FutureWarning). Already-numeric 0/1
# values are passed through unchanged, as replace() would have done.
label_map = {'y': 1, 'n': 0, 1: 1, 0: 0}
encoded = df[traits].apply(lambda col: col.map(label_map))

# Any value outside the mapping becomes NaN; fail here with a clear message
# rather than letting mixed labels reach the classifier.
if encoded.isna().any().any():
    bad_columns = encoded.columns[encoded.isna().any()].tolist()
    raise ValueError(
        f"Unexpected or missing label values in trait columns: {bad_columns}"
    )
df[traits] = encoded.astype(int)

# =========================
# Prepare Features & Target
# =========================
# The essay text is the only input; the trait columns are the targets.
X, y = df['TEXT'], df[traits]

# Hold out 20% of the rows for evaluation, with a fixed seed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# TF-IDF vectorization: the vocabulary is fitted on the training split only
# and then applied as-is to the test split.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Multi-label Logistic Regression: MultiOutputClassifier fits one
# LogisticRegression per trait column (fit() returns the fitted wrapper).
model = MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(
    X_train_tfidf, y_train
)
y_pred = model.predict(X_test_tfidf)

# =========================
# Create folder for saving plots
# =========================
# Every figure produced below is written into this directory.
plots_dir = '../data/results_plots'

# Create the directory (and any missing parents); an already existing
# directory is not an error.
os.makedirs(name=plots_dir, exist_ok=True)

# =========================
# 1️⃣ Per-trait F1-score
# =========================
# Collect the F1-score of the positive class (label 1) for every trait.
f1_scores = []
for i, trait in enumerate(traits):
    # zero_division=0 yields the same score as the default (0) when a trait
    # has no predicted positives, but without emitting a warning.
    report = classification_report(
        y_test[trait], y_pred[:, i], output_dict=True, zero_division=0
    )
    f1 = report['1']['f1-score']
    f1_scores.append(f1)

# Bar chart of the scores, saved to disk instead of being shown inline.
plt.figure(figsize=(8,5))
# Passing `palette` without `hue` is deprecated in seaborn; mapping the
# x variable to `hue` with the legend disabled gives the same per-bar colours.
sns.barplot(x=traits, y=f1_scores, hue=traits, palette="viridis", legend=False)
plt.title("Per-Trait F1-Score")
plt.ylabel("F1-Score")
plt.ylim(0,1)  # F1 is bounded to [0, 1]; fix the axis so bars are comparable
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'f1_scores_per_trait.png'))
plt.close()  # release the figure so it does not accumulate in memory

# =========================
# 2️⃣ Confusion Matrices
# =========================
# One small annotated heatmap per trait, each written to its own PNG file.
for i, trait in enumerate(traits):
    cm = confusion_matrix(y_test[trait], y_pred[:, i])

    # Work on an explicit Figure/Axes pair rather than the implicit
    # pyplot "current figure".
    fig, ax = plt.subplots(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix - {trait}")
    fig.tight_layout()

    fig.savefig(os.path.join(plots_dir, f'confusion_matrix_{trait}.png'))
    plt.close(fig)  # free the figure before drawing the next trait

# =========================
# 3️⃣ Top Predictive Words per Trait
# =========================
# The TF-IDF vocabulary is the same for every trait, so fetch it once
# instead of on every loop iteration.
feature_names = tfidf.get_feature_names_out()

for i, trait in enumerate(traits):
    # Coefficients of the logistic regression fitted for this trait.
    coef = model.estimators_[i].coef_[0]

    # Sort once (ascending) and take both ends: the last 10 indices are the
    # largest coefficients, the first 10 are the smallest (most negative).
    order = np.argsort(coef)
    top_positive_idx = order[-10:]
    top_negative_idx = order[:10]

    print(f"=== {trait} ===")
    print("Top Positive Words:", feature_names[top_positive_idx])
    print("Top Negative Words:", feature_names[top_negative_idx])
    print("\n")

print(f"All plots saved in '{plots_dir}'")


  df[traits] = df[traits].replace({'y':1, 'n':0})

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=traits, y=f1_scores, palette="viridis")


=== cOPN ===
Top Positive Words: ['hmmm' 'cat' 'lack' 'real' 'crazy' 'ones' 'strange' 'place' 'like'
 'maybe']
Top Negative Words: ['going' 'college' 'assignment' 'class' 'game' 'boyfriend' 'feel'
 'tomorrow' 'girls' 'sleep']


=== cCON ===
Top Positive Words: ['hope' 'help' 'great' 'came' 'boyfriend' 'party' 'good' 'type' 'tonight'
 'able']
Top Negative Words: ['want' 'don' 'wake' 'play' 'im' 'point' 'damn' 'work' 'god' 'phone']


=== cEXT ===
Top Positive Words: ['best' 'morning' 'guy' 'party' 'wonder' 'feel' 'mean' 'love' 'realize'
 'boyfriend']
Top Negative Words: ['don' 'cold' 'hair' 'true' 'guess' 'days' 'rest' 'let' 'lonely' 'talk']


=== cAGR ===
Top Positive Words: ['believe' 'great' 'birthday' 'nervous' 'able' 'sister' 'crazy' 'guitar'
 'wonder' 'help']
Top Negative Words: ['don' 'girlfriend' 'read' 'hate' 'truth' 'thinking' 'point' 'trouble'
 'suppose' 'thought']


=== cNEU ===
Top Positive Words: ['moment' 'real' 'money' 'afraid' 'life' 'scared' 'believe' 'wish' 'don'
 'did