### Imports

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
!pip list > requirements.txt

### Checking GPU

In [7]:
!nvidia-smi

Thu Oct 16 06:36:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 577.03                 Driver Version: 577.03         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   53C    P8              3W /   70W |       0MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [8]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")


CUDA available: True
GPU name: NVIDIA GeForce RTX 3050 Laptop GPU


### Loading dataset

In [9]:
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")

texts = dataset["train"]["text"]
labels = dataset["train"]["labels"]

In [10]:
import numpy as np
glove_path='./dolma_300_2024_1.2M.100_combined.txt'
# Load embeddings into a dictionary
glove_model = {}
with open(glove_path, 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_model[word] = vector

print(f"Loaded {len(glove_model)} word vectors.")

Loaded 1200001 word vectors.


### Label preprocessing

In [11]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)
num_classes = len(mlb.classes_)
print("Number of classes:", num_classes)

Number of classes: 28


### Tokenization + Vocabulary

In [12]:
tokenized = [word_tokenize(t.lower()) for t in texts]

# Build vocabulary
all_words = [word for sent in tokenized for word in sent]
word_freq = Counter(all_words)
vocab = {w: i + 2 for i, (w, _) in enumerate(word_freq.most_common(20000))}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

def encode_text(tokens, max_len=40):
    ids = [vocab.get(t, 1) for t in tokens[:max_len]]
    ids += [0] * (max_len - len(ids))
    return ids

encoded_texts = [encode_text(t) for t in tokenized]
X = torch.tensor(encoded_texts)
Y = torch.tensor(y, dtype=torch.float32)

### Dataset + Dataloader

In [15]:
class EmotionDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(EmotionDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader = DataLoader(EmotionDataset(X_val, y_val), batch_size=64)


embedding_dim = 300
vocab_size = len(vocab)

# Initialize embedding matrix with zeros
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Fill matrix with GloVe vectors
for word, idx in vocab.items():
    if word in glove_model:
        embedding_matrix[idx] = glove_model[word]
    else:
        # If word not in GloVe, leave it as zeros (or random)
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.1, random_state=42)

### Model Building

In [16]:
class LSTMEmotionClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, embedding_matrix):
        super().__init__()
        # Load pretrained embeddings
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32), 
            freeze=False  # set True to freeze embeddings
        )
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        emb = self.embedding(x)
        _, (h_n, _) = self.lstm(emb)
        # Concatenate final states from both directions
        out = self.fc(torch.cat((h_n[-2], h_n[-1]), dim=1))
        return self.sigmoid(out)

### Training Config

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

embedding_dim = 300  # must match glove embeddings
hidden_dim = 256

model = LSTMEmotionClassifier(
    vocab_size=len(vocab),
    embed_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
    embedding_matrix=embedding_matrix
).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def multi_label_accuracy(y_pred, y_true):
    preds = (y_pred > 0.5).float()
    correct = (preds == y_true).float().sum()
    total = y_true.numel()
    return (correct / total).item()


Using device: cuda


### Training Loop with epochs

In [18]:
num_epochs = 15

for epoch in range(num_epochs):
    model.train()
    total_loss, total_acc = 0, 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += multi_label_accuracy(outputs, y_batch)

    avg_train_loss = total_loss / len(train_loader)
    avg_train_acc = total_acc / len(train_loader)

    # ===========================
    # Validation
    # ===========================
    model.eval()
    val_loss, val_acc = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()
            val_acc += multi_label_accuracy(outputs, y_batch)

    avg_val_loss = val_loss / len(val_loader)
    avg_val_acc = val_acc / len(val_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc*100:.2f}%")
    print(f"Val Loss:   {avg_val_loss:.4f}, Val Acc:   {avg_val_acc*100:.2f}%")
    print("-" * 60)

Epoch [1/15]
Train Loss: 0.1384, Train Acc: 95.94%
Val Loss:   0.1155, Val Acc:   96.49%
------------------------------------------------------------
Epoch [2/15]
Train Loss: 0.1054, Train Acc: 96.61%
Val Loss:   0.0999, Val Acc:   96.68%
------------------------------------------------------------
Epoch [2/15]
Train Loss: 0.1054, Train Acc: 96.61%
Val Loss:   0.0999, Val Acc:   96.68%
------------------------------------------------------------
Epoch [3/15]
Train Loss: 0.0919, Train Acc: 96.87%
Val Loss:   0.0972, Val Acc:   96.67%
------------------------------------------------------------
Epoch [3/15]
Train Loss: 0.0919, Train Acc: 96.87%
Val Loss:   0.0972, Val Acc:   96.67%
------------------------------------------------------------
Epoch [4/15]
Train Loss: 0.0823, Train Acc: 97.13%
Val Loss:   0.0962, Val Acc:   96.70%
------------------------------------------------------------
Epoch [4/15]
Train Loss: 0.0823, Train Acc: 97.13%
Val Loss:   0.0962, Val Acc:   96.70%
-----------

In [19]:
# model.eval()

# def predict_emotions(text, tokenizer, model, device, label_names):
#     # Tokenize single text
#     encoding = tokenizer(
#         text,
#         return_tensors='pt',
#         truncation=True,
#         padding='max_length',
#         max_length=128
#     )

#     # Move to GPU/CPU
#     input_ids = encoding['input_ids'].to(device)
#     attention_mask = encoding['attention_mask'].to(device)

#     # Run through model
#     with torch.no_grad():
#         outputs = model(input_ids, attention_mask)
#         probs = sigmoid(outputs).cpu().numpy()[0]

#     # Convert probabilities to label names (threshold = 0.5)
#     threshold = 0.5
#     predicted_labels = [label_names[i] for i, p in enumerate(probs) if p >= threshold]
#     return predicted_labels, probs


import torch
import numpy as np

# Ensure model is in eval mode
model.eval()

# 🔹 Function to encode and predict emotions for a single text
def predict_emotion_lstm(text, model, vocab, device, label_names):
    # Tokenize + numericalize
    tokens = [vocab.get(word.lower(), vocab['<UNK>']) for word in text.split()]
    # Pad/truncate
    max_len = 50
    if len(tokens) < max_len:
        tokens += [vocab['<PAD>']] * (max_len - len(tokens))
    else:
        tokens = tokens[:max_len]

    # Convert to tensor
    x = torch.tensor(tokens).unsqueeze(0).to(device)

    # Forward pass
    with torch.no_grad():
        probs = model(x).cpu().numpy()[0]

    # Apply threshold
    threshold = 0.5
    predicted_labels = [label_names[i] for i, p in enumerate(probs) if p >= threshold]

    return predicted_labels, probs


In [20]:
label_names = [
    "admiration", "amusement", "anger", "annoyance", "approval",
    "caring", "confusion", "curiosity", "desire", "disappointment",
    "disapproval", "disgust", "embarrassment", "excitement", "fear",
    "gratitude", "grief", "joy", "love", "nervousness", "optimism",
    "pride", "realization", "relief", "remorse", "sadness",
    "surprise", "neutral"
]


In [21]:
test_texts = [
    "I am so happy and grateful for your help!",
    "This is the worst movie I’ve ever seen.",
    "I can’t believe I actually won the competition!",
    "I feel nervous about my job interview tomorrow.",
    "Wow, that performance was absolutely amazing!",
    "I’m really disappointed with your behavior.",
    "I miss you so much it hurts.",
    "What a confusing tutorial... I don’t get it.",
    "I’m proud of what I’ve achieved today.",
    "Nothing special happened today, just a normal day."
]

for text in test_texts:
    labels, probs = predict_emotion_lstm(text, model, vocab, device, label_names)
    print(f"\nText: {text}")
    print(f"Predicted Emotions: {labels}")



Text: I am so happy and grateful for your help!
Predicted Emotions: ['gratitude', 'joy']

Text: This is the worst movie I’ve ever seen.
Predicted Emotions: ['disgust']

Text: I can’t believe I actually won the competition!
Predicted Emotions: ['approval']

Text: I feel nervous about my job interview tomorrow.
Predicted Emotions: ['nervousness']

Text: Wow, that performance was absolutely amazing!
Predicted Emotions: ['neutral']

Text: I’m really disappointed with your behavior.
Predicted Emotions: ['disappointment']

Text: I miss you so much it hurts.
Predicted Emotions: ['sadness']

Text: What a confusing tutorial... I don’t get it.
Predicted Emotions: ['confusion']

Text: I’m proud of what I’ve achieved today.
Predicted Emotions: ['pride']

Text: Nothing special happened today, just a normal day.
Predicted Emotions: ['neutral']


In [1]:
# Save the model, vocabulary and other necessary components
import os
import torch
import json

# Create a directory for model files if it doesn't exist
os.makedirs('model_files', exist_ok=True)

# Save the model state dictionary
torch.save(model.state_dict(), 'emotion_model.pth')
print("Model saved to emotion_model.pth")

# Save the vocabulary
with open('vocab.txt', 'w', encoding='utf-8') as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)
print("Vocabulary saved to vocab.txt")

# Create a helper module in emotion_model.py for easy import
with open('emotion_model.py', 'w', encoding='utf-8') as f:
    f.write("""import torch
import torch.nn as nn
import json
import numpy as np

# Define the model architecture
class LSTMEmotionClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, embedding_matrix=None):
        super().__init__()
        # Initialize embedding layer
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(
                torch.tensor(embedding_matrix, dtype=torch.float32), 
                freeze=False
            )
        else:
            # If no pretrained embeddings, initialize randomly
            self.embedding = nn.Embedding(vocab_size, embed_dim)
            
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        emb = self.embedding(x)
        _, (h_n, _) = self.lstm(emb)
        # Concatenate final states from both directions
        out = self.fc(torch.cat((h_n[-2], h_n[-1]), dim=1))
        return self.sigmoid(out)

# List of emotion labels
emotion_labels = [
    "admiration", "amusement", "anger", "annoyance", "approval",
    "caring", "confusion", "curiosity", "desire", "disappointment",
    "disapproval", "disgust", "embarrassment", "excitement", "fear",
    "gratitude", "grief", "joy", "love", "nervousness", "optimism",
    "pride", "realization", "relief", "remorse", "sadness",
    "surprise", "neutral"
]

# Function to load the vocabulary
def load_vocab(vocab_file='vocab.txt'):
    try:
        with open(vocab_file, 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        return vocab
    except Exception as e:
        print(f"Error loading vocabulary: {e}")
        return None

# Function to load the model
def load_model(model_path='emotion_model.pth', vocab_file='vocab.txt'):
    try:
        # Load vocabulary
        vocab = load_vocab(vocab_file)
        if vocab is None:
            return None, None
            
        # Determine device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        # Initialize model
        embedding_dim = 300
        hidden_dim = 256
        num_classes = len(emotion_labels)
        
        model = LSTMEmotionClassifier(
            vocab_size=len(vocab),
            embed_dim=embedding_dim,
            hidden_dim=hidden_dim,
            num_classes=num_classes
        ).to(device)
        
        # Load model weights
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()
        
        return model, vocab, device
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None, torch.device("cpu")

# Function to predict emotions from text
def predict_emotion(text, model=None, vocab=None, device=None):
    if model is None or vocab is None:
        model, vocab, device = load_model()
        
    if model is None or vocab is None:
        return [{"label": "neutral", "score": 1.0}]
    
    # Tokenize and encode text
    tokens = [vocab.get(word.lower(), vocab['<UNK>']) for word in text.split()]
    
    # Pad/truncate
    max_len = 50
    if len(tokens) < max_len:
        tokens += [vocab['<PAD>']] * (max_len - len(tokens))
    else:
        tokens = tokens[:max_len]

    # Convert to tensor and predict
    try:
        x = torch.tensor(tokens).unsqueeze(0).to(device)
        
        # Forward pass
        with torch.no_grad():
            probs = model(x).cpu().numpy()[0]
        
        # Format results in the style expected by integrated_cbt
        results = []
        threshold = 0.5
        for i, label in enumerate(emotion_labels):
            if probs[i] >= threshold:
                results.append({"label": label, "score": float(probs[i])})
        
        # If no emotions above threshold, return neutral
        if not results:
            results.append({"label": "neutral", "score": 1.0})
            
        # Sort by score (descending)
        results = sorted(results, key=lambda x: x["score"], reverse=True)
        
        return results
    except Exception as e:
        print(f"Error in prediction: {e}")
        return [{"label": "neutral", "score": 1.0}]
""")

print("Created emotion_model.py with all necessary functions for integration")

# Test the emotion_model.py module we just created
print("\nTesting the emotion_model module...")
import importlib
import emotion_model
importlib.reload(emotion_model)  # Force reload in case it was imported before

# Test loading the model
try:
    model_reload, vocab_reload, device_reload = emotion_model.load_model('emotion_model.pth', 'vocab.txt')
    print("✅ Successfully loaded model from file")
    
    # Test a prediction
    test_text = "I am feeling really happy today!"
    results = emotion_model.predict_emotion(test_text, model_reload, vocab_reload, device_reload)
    print(f"\nTest text: {test_text}")
    print("Detected emotions:")
    for emotion in results:
        print(f"  - {emotion['label']}: {emotion['score']:.4f}")
except Exception as e:
    print(f"❌ Error testing the module: {e}")

NameError: name 'model' is not defined

In [25]:
# Save the model in a format compatible with integrated_cbt_streamlit.py
import torch

# Save model in the format expected by integrated_cbt_streamlit.py
model_data = {
    "model_state_dict": model.state_dict(),
    "embedding_dim": embedding_dim,
    "hidden_dim": hidden_dim,
    "num_classes": num_classes,
    "vocab_size": len(vocab)
}
torch.save(model_data, "emotion_model.pth")
print("Model saved successfully with model_state_dict format")

# Save vocabulary as JSON if not already done
import json
with open("vocab.txt", "w", encoding='utf-8') as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)
print("Vocabulary saved to vocab.txt")

# You can now use the save_model.py script if needed to re-save the model
print("\nYou can also run save_model.py to format the model correctly for integration")
print("This script will ensure the model is saved in the right format for integrated_cbt_streamlit.py")

Model saved successfully with model_state_dict format
Vocabulary saved to vocab.txt

You can also run save_model.py to format the model correctly for integration
This script will ensure the model is saved in the right format for integrated_cbt_streamlit.py
