In [None]:
import json
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import pandas as pd

# Cell 2: Config & Constants
# Locations of the interim journal entries (JSON Lines) and of the saved
# classifier, relative to the notebook's working directory.
DATA_PATH = Path("data") / "interim" / "journals.jsonl"
MODEL_PATH = Path("models") / "emotion_classifier"

# Emotion vocabulary used to binarise the gold labels and to name prediction
# columns. Column j of a prediction matrix is reported as LABELS[j] further
# down, so this order presumably has to match the model's output order —
# TODO confirm against model.config.id2label.
LABELS = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

In [None]:
# NOTE(review): these constants are also assigned at the end of the previous
# cell with the same values; one of the two copies can be dropped.
# Locations of the interim journal entries (JSON Lines) and of the saved
# classifier, relative to the notebook's working directory.
DATA_PATH = Path("data") / "interim" / "journals.jsonl"
MODEL_PATH = Path("models") / "emotion_classifier"

# Emotion vocabulary used to binarise the gold labels and to name prediction
# columns. Column j of a prediction matrix is reported as LABELS[j] further
# down, so this order presumably has to match the model's output order —
# TODO confirm against model.config.id2label.
LABELS = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

# Cell 3: Load and clean dataset
# NOTE(review): load_entries is defined in two cells of this notebook; the
# later definition replaces the earlier one, so keep both in sync or drop one.
def load_entries(path):
    """Read journal records from a JSON Lines file, skipping unusable lines.

    A line is kept only when it parses as a JSON object that has both an
    ``"entry"`` and an ``"emotions"`` key. Blank lines, malformed JSON and
    values of the wrong shape are skipped instead of aborting the load; a
    single warning reports how many non-blank lines were dropped.

    Args:
        path: Location of the ``.jsonl`` file (``str`` or ``pathlib.Path``).

    Returns:
        list[dict]: The accepted records, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    import warnings  # local import: only needed to report skipped lines

    required_keys = ("entry", "emotions")
    entries = []
    skipped = 0
    # JSON Lines is UTF-8 by convention; do not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # blank separator lines carry no data
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # Only parse failures are tolerated; anything else (including
                # KeyboardInterrupt) propagates instead of being swallowed.
                skipped += 1
                continue
            # Explicit check rather than `assert`: assertions disappear under
            # `python -O`, and a bare JSON string or list that merely contains
            # the key names would otherwise pass the `in` test.
            if isinstance(row, dict) and all(key in row for key in required_keys):
                entries.append(row)
            else:
                skipped += 1
    if skipped:
        warnings.warn(f"Skipped {skipped} malformed or incomplete line(s) in {path}")
    return entries

# Load the journal records once at cell level; later cells read the `entries` list.
entries = load_entries(DATA_PATH)
print(f"Loaded {len(entries)} entries")

In [None]:
# NOTE(review): load_entries is defined in two cells of this notebook; the
# later definition replaces the earlier one, so keep both in sync or drop one.
def load_entries(path):
    """Read journal records from a JSON Lines file, skipping unusable lines.

    A line is kept only when it parses as a JSON object that has both an
    ``"entry"`` and an ``"emotions"`` key. Blank lines, malformed JSON and
    values of the wrong shape are skipped instead of aborting the load; a
    single warning reports how many non-blank lines were dropped.

    Args:
        path: Location of the ``.jsonl`` file (``str`` or ``pathlib.Path``).

    Returns:
        list[dict]: The accepted records, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    import warnings  # local import: only needed to report skipped lines

    required_keys = ("entry", "emotions")
    entries = []
    skipped = 0
    # JSON Lines is UTF-8 by convention; do not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # blank separator lines carry no data
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # Only parse failures are tolerated; anything else (including
                # KeyboardInterrupt) propagates instead of being swallowed.
                skipped += 1
                continue
            # Explicit check rather than `assert`: assertions disappear under
            # `python -O`, and a bare JSON string or list that merely contains
            # the key names would otherwise pass the `in` test.
            if isinstance(row, dict) and all(key in row for key in required_keys):
                entries.append(row)
            else:
                skipped += 1
    if skipped:
        warnings.warn(f"Skipped {skipped} malformed or incomplete line(s) in {path}")
    return entries

# Load the journal records once at cell level; later cells read the `entries` list.
entries = load_entries(DATA_PATH)
print(f"Loaded {len(entries)} entries")

In [None]:
# Flatten the per-entry label lists into one sequence and tally each emotion.
all_emotions = [
    label
    for record in entries
    for label in record['emotions']
]
cnt = Counter(all_emotions)

# Horizontal bar chart of the tallies, sorted ascending by count.
label_counts = pd.Series(cnt).sort_values()
label_counts.plot.barh(figsize=(10, 8), title='Emotion Label Distribution')
plt.show()

In [None]:
# Restore the tokenizer and the classifier from the same saved directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
# Put the module in evaluation mode; eval() returns the model, so the trailing
# semicolon keeps its repr out of the cell output.
model.eval();


In [None]:

def predict(texts, model, tokenizer, threshold=0.5, batch_size=32):
    """Multi-label prediction: one 0/1 indicator per (text, model output) pair.

    Each logit is passed through a sigmoid independently and the corresponding
    label is switched on when its probability is at least ``threshold``.

    Args:
        texts: A single string or an iterable of strings to classify.
        model: Sequence-classification model. It is called as
            ``model(**inputs)`` and must return an object with a ``.logits``
            tensor; ``model.config.num_labels`` is read only for empty input.
        tokenizer: Callable that turns a list of strings into a mapping of
            PyTorch tensors (it is invoked with ``return_tensors='pt'``).
        threshold: Minimum sigmoid probability for a label to be predicted.
        batch_size: Number of texts per forward pass, so memory use does not
            grow with the size of ``texts``. ``None`` sends everything through
            in a single batch. Padding is applied per batch.

    Returns:
        numpy.ndarray: Integer array with one row per text and one column per
        model output; shape ``(0, model.config.num_labels)`` for empty input.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A bare string must be treated as one text, not sliced character-wise.
    texts = [texts] if isinstance(texts, str) else list(texts)

    if not texts:
        # Nothing to tokenise; return an empty matrix with the right width.
        return np.zeros((0, model.config.num_labels), dtype=int)

    step = len(texts) if batch_size is None else batch_size
    if step < 1:
        raise ValueError(f"batch_size must be a positive integer or None, got {batch_size!r}")

    # Run on whatever device the model lives on (CPU when it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    prob_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), step):
            inputs = tokenizer(
                texts[start:start + step],
                padding=True,
                truncation=True,
                return_tensors='pt',
            )
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            logits = model(**inputs).logits
            # .cpu() is required before .numpy() when the model is on an accelerator.
            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())

    probs = np.concatenate(prob_chunks, axis=0)
    return (probs >= threshold).astype(int)

# Evaluate on a fixed-size head of the dataset; both knobs are named once here
# so the slice sizes and the report header cannot drift apart.
EVAL_SAMPLE_SIZE = 100
EVAL_THRESHOLD = 0.5

sample_texts = [e['entry'] for e in entries[:EVAL_SAMPLE_SIZE]]
sample_true = [e['emotions'] for e in entries[:EVAL_SAMPLE_SIZE]]

# Passing `classes` pins the indicator columns to the LABELS order.
mlb = MultiLabelBinarizer(classes=LABELS)
true_bin = mlb.fit_transform(sample_true)
pred_bin = predict(sample_texts, model, tokenizer, threshold=EVAL_THRESHOLD)

# Cell 7: Evaluation Metrics
# zero_division=0 yields the same scores as the default (0.0 where a metric is
# undefined) without emitting a warning for every label that is absent from
# this small sample.
print(f"\nClassification Report (Threshold = {EVAL_THRESHOLD}):\n")
print(classification_report(true_bin, pred_bin, target_names=LABELS, zero_division=0))

# Optional: micro-averaged F1, Precision, Recall
avg_f1 = f1_score(true_bin, pred_bin, average="micro", zero_division=0)
avg_precision = precision_score(true_bin, pred_bin, average="micro", zero_division=0)
avg_recall = recall_score(true_bin, pred_bin, average="micro", zero_division=0)

print(f"\nAvg F1: {avg_f1:.4f} | Precision: {avg_precision:.4f} | Recall: {avg_recall:.4f}")


In [None]:
# Print a few predictions next to the gold labels as a quick sanity check.
# The count is clamped so the cell does not raise IndexError when fewer than
# five samples were loaded.
n_preview = min(5, len(sample_texts))
for i in range(n_preview):
    text = sample_texts[i]
    # Column j of pred_bin is reported under LABELS[j].
    pred_labels = [label for j, label in enumerate(LABELS) if pred_bin[i][j] == 1]
    true_labels = sample_true[i]
    print(f"\nEntry {i+1}:")
    print(f"Text: {text[:150]}...")
    print(f"True: {true_labels}\nPred: {pred_labels}")