In [22]:
import pandas as pd
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import re
from collections import Counter

# Load the labelled dream reports; the path is relative to the notebook's working directory.
# NOTE(review): the rendered frame shows an `Unnamed: 0` column, which is presumably a
# row index written into the CSV when it was saved — confirm; it is not used below.
df = pd.read_csv("dreams_labeled_balanced.csv")
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,content,label
0,(1960-05-04)I had another neat dream about Blake.,neutral
1,(1963-??-??)I had a Salvador Dali dream where ...,neutral
2,(1964-??-??)A red faced devil comes to me and ...,nightmare
3,(1969-??-??)I woke from a nap unable to rememb...,neutral
4,(1974-01-17)Dreamt of IVs. Lots of waterfalls.,neutral
...,...,...
3556,"(12/7/92)I am some sort of clerk, perhaps a me...",nightmare
3557,(12/2/92)I am watching TV early in the morning...,sad
3558,"(8/3/94)I'm in the library, just arriving (lat...",sad
3559,"[lost - do some stuff, notice the clock, somet...",sad


In [23]:
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from lightgbm import LGBMClassifier

# -------------------- 🧹 Preprocessing --------------------
# Drop rows that lack either the dream text or its label; both columns are required below.
# NOTE: this rebinds `df`, so the frame rendered by the loading cell may no longer match
# the one in memory if any rows were dropped.
df = df.dropna(subset=["content", "label"])

def clean_text(text):
    """Normalise one dream report to lowercase letters and whitespace.

    Steps, in order:
      1. remove every parenthesised span, e.g. a leading "(1960-05-04)" date tag;
      2. delete every character that is not an ASCII letter or whitespace
         (characters are deleted, not replaced, so "well-known" -> "wellknown");
      3. lowercase and strip leading/trailing whitespace.

    Args:
        text: the raw report as a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    without_parens = re.sub(r"\([^)]*\)", "", text)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", without_parens)
    return letters_only.lower().strip()

# Normalised text column consumed by every model below.
df["clean_text"] = df["content"].apply(clean_text)

# 🎯 Encode labels: string emotions -> integer class ids.
label_encoder = LabelEncoder().fit(df["label"])
df["label_encoded"] = label_encoder.transform(df["label"])

# 🔀 Split: hold out 20% as the test set, then carve 10% of the remainder off as
# validation. Both splits are stratified on the label and use a fixed seed.
X, y = df["clean_text"], df["label_encoded"]
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_train_text,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

# 🔤 Tokenize (for CNN)
def build_vocab(texts, min_freq=2):
    """Build a token -> integer-id mapping from whitespace-split texts.

    Ids 0 and 1 are reserved for "<PAD>" and "<UNK>". Every other token that
    occurs at least ``min_freq`` times across ``texts`` receives the next free
    id, in order of first appearance.

    Args:
        texts: iterable of strings; each is tokenised with ``str.split()``.
        min_freq: minimum total occurrence count for a token to be kept.

    Returns:
        dict mapping token string to integer id.
    """
    token_counts = Counter(token for text in texts for token in text.split())

    vocab = {"<PAD>": 0, "<UNK>": 1}
    for token, occurrences in token_counts.items():
        if occurrences < min_freq:
            continue  # too rare: will be encoded as <UNK>
        vocab[token] = len(vocab)
    return vocab

# The vocabulary is built from the training split only, so words that appear solely in
# validation/test text are encoded as <UNK>.
word2idx = build_vocab(X_train_text)
# Label string -> index over the sorted unique labels. In the cells shown here it is
# only used for its length (num_classes); label ids themselves come from label_encoder.
label2idx = {label: idx for idx, label in enumerate(sorted(df["label"].unique()))}

def encode_text(texts, word2idx, max_len=100):
    """Convert texts into a fixed-width tensor of token ids.

    Each text is split on whitespace, every token is looked up in ``word2idx``
    (unknown tokens become ``word2idx["<UNK>"]``), and the id sequence is then
    truncated to ``max_len`` or right-padded with ``word2idx["<PAD>"]``.

    Args:
        texts: iterable of strings.
        word2idx: token -> id mapping containing "<PAD>" and "<UNK>".
        max_len: number of ids per row in the output.

    Returns:
        ``torch.long`` tensor of shape ``(len(texts), max_len)``. An empty
        ``texts`` yields shape ``(0, max_len)`` rather than a float tensor of
        shape ``(0,)``, so the result can always be passed to ``nn.Embedding``.
    """
    pad_id = word2idx["<PAD>"]
    unk_id = word2idx["<UNK>"]

    encoded = []
    for text in texts:
        ids = [word2idx.get(token, unk_id) for token in text.split()][:max_len]
        # Right-pad short sequences; the multiplier is 0 when already full.
        ids.extend([pad_id] * (max_len - len(ids)))
        encoded.append(ids)

    if not encoded:
        # torch.tensor([]) would be float32 with shape (0,); keep the 2-D id layout.
        return torch.empty((0, max_len), dtype=torch.long)
    return torch.tensor(encoded, dtype=torch.long)

# Encode each split with the training vocabulary (default max_len of encode_text).
X_train_tensor = encode_text(X_train_text, word2idx)
X_val_tensor = encode_text(X_val_text, word2idx)
X_test_tensor = encode_text(X_test_text, word2idx)
# Integer class ids from the label encoder, as tensors aligned row-for-row with the inputs.
y_train_tensor = torch.tensor(y_train.values)
y_val_tensor = torch.tensor(y_val.values)
y_test_tensor = torch.tensor(y_test.values)

# 📦 Dataloaders
# Only the training loader shuffles. Validation and test keep their original row order,
# so predictions collected from them line up with y_val / y_test.
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=32, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=32)
test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=32)

# ⚙️ Config
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = len(word2idx)  # includes the <PAD> and <UNK> entries
embed_dim = 128  # width of the learned word embeddings
num_classes = len(label2idx)
pad_idx = word2idx["<PAD>"]  # embedding row reserved for padding

# 🧠 CNN Model
class TextCNN(nn.Module):
    """Text classifier with three parallel 1-D convolutions over word embeddings.

    Token ids are embedded, passed through three convolution branches with
    kernel widths 3, 4 and 5 (100 filters each), ReLU-activated and max-pooled
    over the sequence axis. The three pooled vectors are concatenated (300
    features), passed through dropout (p=0.5) and projected to class logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding width, also the convolutions' input channels.
        num_classes: number of output logits.
        pad_idx: token id given to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, pad_idx):
        super().__init__()
        filters_per_branch = 100
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.conv1 = nn.Conv1d(embed_dim, filters_per_branch, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, filters_per_branch, kernel_size=4, padding=2)
        self.conv3 = nn.Conv1d(embed_dim, filters_per_branch, kernel_size=5, padding=2)
        self.dropout = nn.Dropout(0.5)
        # One pooled vector per branch is concatenated before the classifier.
        self.fc = nn.Linear(3 * filters_per_branch, num_classes)

    def forward(self, x):
        """Return unnormalised class logits for a batch of token-id sequences.

        ``x`` is an integer tensor indexed as (batch, position).
        """
        # Conv1d convolves over the last axis, so move the embedding axis to
        # the channel position: (batch, position, embed) -> (batch, embed, position).
        channels_first = self.embedding(x).transpose(1, 2)
        pooled = [
            torch.relu(branch(channels_first)).max(dim=2).values
            for branch in (self.conv1, self.conv2, self.conv3)
        ]
        features = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(features))

# 🧠 Instantiate model
# Seed torch's global RNG before anything stochastic happens, so that weight
# initialisation, dropout masks and the shuffled training batches repeat on a fresh
# "Restart & Run All". (Some CUDA kernels may still be non-deterministic.)
torch.manual_seed(42)
model = TextCNN(vocab_size, embed_dim, num_classes, pad_idx).to(device)

# 📉 Loss & Optimizer
# Inverse-frequency class weights computed from the training labels. `minlength` ensures
# one entry per class even if the highest class id is absent from the training split,
# because CrossEntropyLoss requires a weight vector of length num_classes.
class_counts = np.bincount(y_train, minlength=num_classes)
weights = torch.tensor(1.0 / class_counts, dtype=torch.float32).to(device)
loss_fn = nn.CrossEntropyLoss(weight=weights)
# The optimizer is bound to this specific model's parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 🏋️ Training Loop
def train_model(model, epochs=5):
    """Train ``model`` on the notebook-level ``train_loader`` and print per-epoch stats.

    Reads ``train_loader``, ``device``, ``loss_fn`` and ``optimizer`` from the
    notebook namespace. Only parameters already registered with ``optimizer``
    are updated.

    Args:
        model: the network to train (switched to training mode each epoch).
        epochs: number of full passes over ``train_loader``.

    Prints, per epoch, the sum of the batch losses (not their mean) and the
    training accuracy measured on the batches as they were trained on.
    """
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        n_correct = 0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            # Clear stale gradients, then do one forward/backward/update step.
            optimizer.zero_grad()
            logits = model(batch_inputs)
            batch_loss = loss_fn(logits, batch_labels)
            batch_loss.backward()
            optimizer.step()

            total_loss += batch_loss.item()
            n_correct += (logits.argmax(1) == batch_labels).sum().item()

        acc = n_correct / len(train_loader.dataset)
        print(f"Epoch {epoch+1} | 🧠 Loss: {total_loss:.4f} | ✅ Train Accuracy: {acc:.4f}")

# 🚀 Train CNN
# Trains `model` in place for the default number of epochs (5) and prints one line per epoch.
train_model(model)

# 🔍 CNN prediction probs
def get_probs(loader):
    """Return softmax class probabilities of the notebook-level ``model`` for ``loader``.

    Puts ``model`` in eval mode (it stays in eval mode afterwards), runs every
    batch without gradient tracking, and concatenates the results in loader
    order. Labels yielded by the loader are ignored.

    Args:
        loader: iterable of ``(inputs, labels)`` batches.

    Returns:
        NumPy array with one row per example and one column per class.
    """
    model.eval()
    with torch.no_grad():
        batch_probs = [
            torch.softmax(model(inputs.to(device)), dim=1).cpu()
            for inputs, _ in loader
        ]
    return torch.cat(batch_probs).numpy()

# CNN class probabilities on the held-out splits (rows follow the loader order).
cnn_val_probs = get_probs(val_loader)
cnn_test_probs = get_probs(test_loader)

# 📚 TF-IDF + Naive Bayes (vectoriser and classifier wrapped in one pipeline)
nb_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    MultinomialNB(),
)
nb_pipeline.fit(X_train_text, y_train)
nb_val_probs = nb_pipeline.predict_proba(X_val_text)
nb_test_probs = nb_pipeline.predict_proba(X_test_text)

# 📚 TF-IDF + LightGBM (its own vectoriser, fitted on the training split only)
tfidf = TfidfVectorizer(max_features=5000)
X_train_vec = tfidf.fit_transform(X_train_text)
X_val_vec, X_test_vec = tfidf.transform(X_val_text), tfidf.transform(X_test_text)

lgb_model = LGBMClassifier()
lgb_model.fit(X_train_vec, y_train)
lgb_val_probs = lgb_model.predict_proba(X_val_vec)
lgb_test_probs = lgb_model.predict_proba(X_test_vec)

# 🔀 Stack the three models' probability columns side by side: CNN | NB | LightGBM
X_blend_val = np.hstack((cnn_val_probs, nb_val_probs, lgb_val_probs))
X_blend_test = np.hstack((cnn_test_probs, nb_test_probs, lgb_test_probs))

# ⚙️ Logistic Regression Blender
# The blender is fitted on validation-split predictions, which none of the base models
# were trained on, and is scored on the untouched test split.
log_blender = LogisticRegression(max_iter=1000).fit(X_blend_val, y_val)
log_preds = log_blender.predict(X_blend_test)

print("\n📊 Logistic Regression Blending (CNN + NB + LGBM) - Test Results:")
print("Accuracy:", accuracy_score(y_test, log_preds))
print(classification_report(y_test, log_preds, target_names=label_encoder.classes_))


Epoch 1 | 🧠 Loss: 116.4273 | ✅ Train Accuracy: 0.3414
Epoch 2 | 🧠 Loss: 89.0520 | ✅ Train Accuracy: 0.5478
Epoch 3 | 🧠 Loss: 72.0490 | ✅ Train Accuracy: 0.6387
Epoch 4 | 🧠 Loss: 62.3478 | ✅ Train Accuracy: 0.6992
Epoch 5 | 🧠 Loss: 51.6077 | ✅ Train Accuracy: 0.7530
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 58419
[LightGBM] [Info] Number of data points in the train set: 2563, number of used features: 1619
[LightGBM] [Info] Start training from score -1.269683
[LightGBM] [Info] Start training from score -1.269683
[LightGBM] [Info] Start training from score -1.589352
[LightGBM] [Info] Start training from score -1.452004

📊 Logistic Regression Blending (CNN + NB + LGBM) - Test Results:
Accuracy: 0.9130434782608695
              precision    recall  f1-score   support

       happy       0.88      0.87      0.87       200
     neutral       0.88



In [24]:
import joblib

# 🎯 Save encoders
# Everything is written to the current working directory.
# NOTE: joblib files are pickle-based; only load them back from a trusted source.
joblib.dump(label_encoder, "label_encoder.pkl")
joblib.dump(word2idx, "word2idx.pkl")

# 🧠 Save CNN model weights
# Only the state_dict is stored. To reload it, TextCNN must be re-created with the same
# vocab_size (len(word2idx)), embed_dim, num_classes and pad_idx before load_state_dict.
torch.save(model.state_dict(), "cnn_model.pt")

# 📘 Save other models
# nb_pipeline carries its own TF-IDF vectoriser; `tfidf` is the separate vectoriser
# that lgb_model was trained on, so those two must be loaded together.
joblib.dump(nb_pipeline, "nb_pipeline.pkl")
joblib.dump(tfidf, "tfidf.pkl")
joblib.dump(lgb_model, "lgb_model.pkl")
# As the last expression, this call's return value (the written path list) is the cell output.
joblib.dump(log_blender, "log_blender.pkl")


['log_blender.pkl']

In [25]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import json

# 🎯 Predicted emotions (decoded)
# Decode the blender's integer test predictions back to label strings.
predicted_labels = label_encoder.inverse_transform(log_preds)

# 🔢 Get probabilities from final ensemble (blender)
final_probs = log_blender.predict_proba(X_blend_test)

# 🌌 2-D t-SNE embedding (UMAP is not used in this cell)
# The embedding is computed from X_blend_test, i.e. the stacked CNN/NB/LightGBM
# probabilities that feed the blender, not from final_probs.
tsne = TSNE(n_components=2, random_state=42)
tsne_coords = tsne.fit_transform(X_blend_test)

# 🧭 Optional Clustering
# K-means runs on the 2-D t-SNE coordinates, with one cluster per emotion class.
# Cluster ids are arbitrary and are not matched to class labels.
kmeans = KMeans(n_clusters=len(label_encoder.classes_), random_state=42)
cluster_labels = kmeans.fit_predict(tsne_coords)

# 📝 Combine all into a DataFrame
# All columns follow the test-split row order (the test loader is not shuffled).
# "text" holds the cleaned text, not the raw report.
result_df = pd.DataFrame({
    "text": X_test_text.values,
    "true_label": label_encoder.inverse_transform(y_test),
    "predicted_label": predicted_labels,
    "cluster": cluster_labels,
    "tsne_x": tsne_coords[:, 0],
    "tsne_y": tsne_coords[:, 1]
})

# Add probabilities as separate columns
# One prob_<class> column per class, in label_encoder.classes_ order.
for i, class_name in enumerate(label_encoder.classes_):
    result_df[f"prob_{class_name}"] = final_probs[:, i]

# 💾 Save results
result_df.to_csv("final_predictions_with_tsne.csv", index=False)
# lines=True writes JSON Lines (one record object per line), not a single JSON array,
# even though the file extension is .json.
result_df.to_json("final_predictions_with_tsne.json", orient="records", lines=True)

print("\n✅ Results saved as 'final_predictions_with_tsne.csv' and '.json'")



✅ Results saved as 'final_predictions_with_tsne.csv' and '.json'
