In [None]:
import os
import json
import gc
import pickle
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, f1_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import torch
from transformers import RobertaTokenizer, RobertaModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Bidirectional, LSTM, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

# ---- Load Dataset ----
# combined_dataset.json is read as JSON Lines: one JSON object per non-blank
# line. The encoding is passed explicitly (as for the keyword file below) so
# the result does not depend on the platform's default locale encoding.
with open("combined_dataset.json", "r", encoding="utf-8") as f:
    parsed_data = [json.loads(line) for line in f if line.strip()]
# Every record must carry a "Context" key; a missing key raises KeyError.
contexts = [sample["Context"] for sample in parsed_data]

# ---- Load Mood Keywords ----
# detect_mood() iterates this object with .items(), i.e. it is used as a
# mapping of mood name -> collection of keywords.
with open("mood_keywords.json", "r", encoding="utf-8") as f:
    mood_keywords = json.load(f)

def detect_mood(text, keyword_map=None):
    """Return the first mood whose keywords occur in ``text``, else ``None``.

    The text is lower-cased before matching; keywords are compared exactly as
    stored, so they are expected to be lower-case already. Matching is a plain
    substring test (``"sad"`` also matches inside ``"ambassador"``). Moods are
    tried in the mapping's iteration order and the first hit wins.

    Args:
        text: Text to classify. Anything that is not a ``str`` (for example a
            JSON ``null`` decoded to ``None``) yields ``None`` instead of
            raising ``AttributeError``.
        keyword_map: Optional mapping of mood -> iterable of keywords. When
            omitted, the module-level ``mood_keywords`` mapping is used.

    Returns:
        The matching mood key, or ``None`` when nothing matches.
    """
    if not isinstance(text, str):
        return None
    if keyword_map is None:
        keyword_map = mood_keywords
    lowered = text.lower()
    for mood, keywords in keyword_map.items():
        if any(keyword in lowered for keyword in keywords):
            return mood
    return None

# Keep only the contexts for which a mood keyword was detected.
X_texts, Y_labels = [], []
for context in contexts:
    mood = detect_mood(context)
    if mood:
        X_texts.append(context)
        Y_labels.append(mood)

# ---- Drop singleton classes, THEN encode labels ----
# Moods that occur only once are removed (same `count > 1` rule as before).
# The encoder is fitted AFTER this filtering so that the pickled encoder's
# integer codes are contiguous (0..k-1) and cover exactly the classes the
# model is trained on. Fitting it before filtering left gaps in the codes;
# the later re-encoding step closed those gaps with a second, unsaved encoder,
# so the saved "mood_encoder.pkl" could map model outputs to the wrong mood.
label_counts = Counter(Y_labels)
valid_labels = {label for label, count in label_counts.items() if count > 1}
kept_pairs = [(x, y) for x, y in zip(X_texts, Y_labels) if y in valid_labels]
X_texts_filtered = [x for x, _ in kept_pairs]
Y_labels_filtered = [y for _, y in kept_pairs]

label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y_labels_filtered)
with open("mood_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

# Integer labels aligned one-to-one with X_texts_filtered.
Y_filtered = list(Y_encoded)
filtered = list(zip(X_texts_filtered, Y_filtered))

# ---- RoBERTa Token Embeddings ----
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Pretrained tokenizer and encoder; the encoder is moved to the chosen device
# and put in evaluation mode because it is only used for feature extraction.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaModel.from_pretrained("roberta-base")
roberta_model = roberta_model.to(device)
roberta_model.eval()

# Token length every text is padded/truncated to when embedding.
max_length = 128

def roberta_embed_token_embeddings(texts, batch_size=16, max_length=128):
    """Embed ``texts`` with RoBERTa and return per-token hidden states.

    Texts are processed in slices of ``batch_size``. Each slice is tokenized
    with padding to exactly ``max_length`` tokens (longer inputs are
    truncated), so every batch has the same token dimension and the batches
    can be stacked along the first axis.

    Args:
        texts: Sliceable sequence of strings (sliced as ``texts[a:b]``).
        batch_size: Number of texts sent through the model per forward pass.
        max_length: Token length each text is padded/truncated to.

    Returns:
        NumPy array built by stacking each batch's ``last_hidden_state``.
        Presumably shaped (len(texts), max_length, hidden_size) — confirm
        against the model configuration.

    Raises:
        ValueError: from ``np.vstack`` when ``texts`` is empty.
    """
    chunks = []
    batch_starts = range(0, len(texts), batch_size)
    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        for start in batch_starts:
            tokens = tokenizer(
                texts[start:start + batch_size],
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            )
            model_inputs = {name: tensor.to(device) for name, tensor in tokens.items()}
            hidden_states = roberta_model(**model_inputs).last_hidden_state
            # Move to host memory so batches do not accumulate on the device.
            chunks.append(hidden_states.cpu().numpy())
    return np.vstack(chunks)

embedding_file = "roberta_token_embeddings.npy"
texts_file = "roberta_embedding_texts.pkl"
labels_file = "roberta_embedding_labels.pkl"

# X_emb stays None until a valid cache has been loaded; None means "regenerate".
X_emb = None

if os.path.exists(embedding_file) and os.path.exists(texts_file) and os.path.exists(labels_file):
    print("📂 Loading existing RoBERTa token embeddings...")
    # NOTE: pickle.load executes whatever the file contains; these files are
    # expected to be the ones written by this script.
    with open(texts_file, "rb") as f:
        saved_texts = pickle.load(f)
    with open(labels_file, "rb") as f:
        saved_labels = pickle.load(f)
    cached_emb = np.load(embedding_file)

    # The cache is reusable only if it was built from exactly the same texts
    # (content, not just count) with the same token length. Comparing lengths
    # alone silently reuses stale embeddings whenever the dataset changes but
    # happens to keep the same number of rows.
    cache_is_valid = (
        list(saved_texts) == list(X_texts_filtered)
        and cached_emb.ndim == 3
        and cached_emb.shape[0] == len(X_texts_filtered)
        and cached_emb.shape[1] == max_length
    )
    if cache_is_valid:
        X_emb = cached_emb
        print("✅ Loaded existing embeddings successfully!")
        # Embeddings depend only on the texts; if just the label encoding
        # changed, refresh the label file instead of recomputing embeddings.
        if [int(y) for y in saved_labels] != [int(y) for y in Y_filtered]:
            with open(labels_file, "wb") as f:
                pickle.dump(Y_filtered, f)
    else:
        del cached_emb  # release the stale array before regenerating
        print("⚠️ Saved data mismatch, regenerating embeddings...")
else:
    print("🔄 Generating RoBERTa token embeddings...")

if X_emb is None:
    X_emb = roberta_embed_token_embeddings(X_texts_filtered, max_length=max_length)
    print("💾 Saving RoBERTa token embeddings...")
    np.save(embedding_file, X_emb)
    with open(texts_file, "wb") as f:
        pickle.dump(X_texts_filtered, f)
    with open(labels_file, "wb") as f:
        pickle.dump(Y_filtered, f)

print(f"Token embedding shape: {X_emb.shape}")

# ---- Train/Test Split, then PCA + SMOTE fitted on the training data only ----
# PCA and SMOTE used to be fitted on the full dataset before splitting. SMOTE
# interpolates between neighbouring samples, so synthetic points derived from
# future test samples ended up in the training set (and synthetic points in
# the test set), inflating the reported scores. The split now happens first;
# PCA/SMOTE are fitted on the training portion and the test portion is only
# transformed.
PCA_MAX_COMPONENTS = 500  # upper bound on PCA components (previous fixed value)
PCA_TIMESTEPS = 10        # PCA output is folded into this many pseudo-timesteps

# ---- Re-encode labels to zero-based continuous indices ----
print("Labels before re-encoding:", np.unique(Y_filtered))
le = LabelEncoder()
Y_all_zero_based = le.fit_transform(np.asarray(Y_filtered))
num_classes = len(le.classes_)
print("Labels after re-encoding:", np.unique(Y_all_zero_based))
print("Number of classes:", num_classes)

# ---- Train/Test Split (on real samples only) ----
X_train_raw, X_test_raw, Y_train_int, Y_test_int = train_test_split(
    X_emb, Y_all_zero_based,
    test_size=0.25,
    stratify=Y_all_zero_based,
    random_state=42
)

# SMOTE needs at least two training samples in every class.
min_samples = min(Counter(Y_train_int.tolist()).values())

if min_samples < 2:
    print("⚠️ SMOTE skipped due to insufficient samples.")
    # Without resampling the raw token embeddings are fed to the model as-is.
    X_train, X_test = X_train_raw, X_test_raw
else:
    # Flatten each sample to one row so PCA/SMOTE can operate on 2-D data.
    X_train_flat = X_train_raw.reshape(X_train_raw.shape[0], -1)
    X_test_flat = X_test_raw.reshape(X_test_raw.shape[0], -1)

    # PCA cannot produce more components than min(n_samples, n_features);
    # a fixed 500 raises ValueError on smaller training sets. The count is
    # also rounded down to a multiple of PCA_TIMESTEPS so the reshape below
    # is always valid.
    max_components = min(PCA_MAX_COMPONENTS, *X_train_flat.shape)
    if max_components >= PCA_TIMESTEPS:
        timesteps = PCA_TIMESTEPS
        n_components = max_components - max_components % timesteps
    else:
        timesteps, n_components = 1, max_components

    print("🔽 Applying PCA to reduce dimensions before SMOTE...")
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train_flat)
    X_test_pca = pca.transform(X_test_flat)

    smote_k = max(1, min(5, min_samples - 1))
    smote = SMOTE(k_neighbors=smote_k, random_state=42)
    X_train_pca, Y_train_int = smote.fit_resample(X_train_pca, Y_train_int)
    Y_train_int = np.asarray(Y_train_int)

    # Synthetic samples may be grouped together in the resampled arrays, and
    # Keras' validation_split takes the LAST rows, so shuffle reproducibly.
    order = np.random.RandomState(42).permutation(X_train_pca.shape[0])
    X_train_pca, Y_train_int = X_train_pca[order], Y_train_int[order]

    # Fold the PCA vector back into 3-D for the LSTM input
    # (10 timesteps of 50 dims when all 500 components are available).
    X_train = X_train_pca.reshape(X_train_pca.shape[0], timesteps, -1)
    X_test = X_test_pca.reshape(X_test_pca.shape[0], timesteps, -1)

# ---- One-hot encode labels ----
Y_train = to_categorical(Y_train_int, num_classes=num_classes)
Y_test = to_categorical(Y_test_int, num_classes=num_classes)

# Names kept for compatibility; they now describe the (resampled) TRAINING set.
X_resampled = X_train
Y_resampled_encoded = Y_train_int
Y_resampled_encoded_zero_based = Y_train_int
Y_resampled = Y_train

# ---- Compute class weights ----
# Keys are taken from the classes actually present so the index -> weight
# mapping stays correct even if a class were missing from the training split.
present_classes = np.unique(Y_train_int)
class_weights = compute_class_weight(
    "balanced",
    classes=present_classes,
    y=Y_train_int
)
class_weight_dict = {int(c): float(w) for c, w in zip(present_classes, class_weights)}
print("Class weights:", class_weight_dict)

# ---- Model Builder ----
def build_model(input_shape, num_classes):
    """Build and compile the BiLSTM mood classifier.

    Architecture: bidirectional LSTM (128 units per direction, full sequence
    output) -> global max pooling over time -> Dense(256, relu) ->
    Dropout(0.3) -> BatchNormalization -> Dense(128, relu) -> Dropout(0.2) ->
    softmax over ``num_classes``.

    Args:
        input_shape: Shape of one sample without the batch axis.
        num_classes: Width of the softmax output layer.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 0.001),
        categorical cross-entropy loss and an accuracy metric.
    """
    sequence_in = Input(shape=input_shape)

    # Sequence encoder followed by a max over the time axis.
    encoded = Bidirectional(LSTM(128, return_sequences=True))(sequence_in)
    pooled = GlobalMaxPooling1D()(encoded)

    # Classification head.
    hidden = Dense(256, activation='relu')(pooled)
    hidden = Dropout(0.3)(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    class_probs = Dense(num_classes, activation='softmax')(hidden)

    classifier = Model(sequence_in, class_probs)
    classifier.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# ---- Train Model ----
gc.collect()
model = build_model(input_shape=X_train.shape[1:], num_classes=num_classes)

# Stop when validation loss has not improved for 9 epochs (restoring the best
# weights) and shrink the learning rate after 7 stagnant epochs.
early_stop = EarlyStopping(monitor="val_loss", patience=9, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.8, patience=7, min_lr=1e-6, verbose=1)

# Keras holds out the LAST `validation_split` fraction of the arrays passed to
# fit(), so only the leading part of X_train is actually trained on. Report
# the real sizes instead of printing len(X_train) as the training size.
validation_fraction = 0.1
n_fit_samples = int(np.floor(len(X_train) * (1.0 - validation_fraction)))
print("Training data size:", n_fit_samples)
print("Validation data size:", len(X_train) - n_fit_samples)
print("Test data size:", len(X_test))

history = model.fit(
    X_train, Y_train,
    epochs=20,
    batch_size=32,
    validation_split=validation_fraction,
    callbacks=[early_stop, reduce_lr],
    class_weight=class_weight_dict,
    verbose=1
)

# ---- Evaluate ----
loss, acc = model.evaluate(X_test, Y_test, verbose=0)
print(f"✅ Test Loss: {loss:.4f} - Test Accuracy: {acc*100:.2f}%")

# Predicted class index per test sample (argmax over the softmax output).
Y_pred_prob = model.predict(X_test)
Y_pred = np.argmax(Y_pred_prob, axis=1)


In [None]:
pip install stanza

In [None]:
from tensorflow.keras.models import Model,save_model
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

# ---- Plot Results ----
# Left panel: accuracy per epoch; right panel: loss per epoch.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.xlabel('Epochs'); plt.ylabel('Accuracy'); plt.title('Accuracy'); plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel('Epochs'); plt.ylabel('Loss'); plt.title('Loss'); plt.legend()
plt.tight_layout()
plt.show()

# ---- F1 Score ----
f1 = f1_score(Y_test_int, Y_pred, average="weighted")
print(f"🎯 Weighted F1 Score: {f1 * 100:.2f}%")

# ---- Save Model ----
import os
import shutil
from tensorflow.keras.models import save_model
model_path = "moods_classifier_bilstm_model.keras"
# Temporary name keeps the ".keras" suffix so the native format is still used.
tmp_model_path = "moods_classifier_bilstm_model.tmp.keras"

# Write the new model to a temporary file FIRST. Deleting the old model before
# attempting the save (and then swallowing a save error) could leave no model
# on disk at all; now the previous model survives a failed save and the
# failure is re-raised instead of being hidden.
try:
    save_model(model, tmp_model_path)
except Exception as e:
    print(f"❌ Error saving model: {e}")
    if os.path.isfile(tmp_model_path):
        os.remove(tmp_model_path)
    raise

# os.replace overwrites an existing file, but an old directory at model_path
# has to be removed explicitly.
if os.path.isdir(model_path):
    try:
        shutil.rmtree(model_path)
        print(f"🧹 Removed old model: {model_path}")
    except OSError as e:
        print(f"❌ Could not delete old model: {e}")
        raise

os.replace(tmp_model_path, model_path)
print(f"✅ Model saved in native Keras format at: {model_path}")


In [None]:
import tensorflow as tf
# Ask TensorFlow once for the GPUs it can see, then report count and details.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpu_devices))
print("GPU Devices:", gpu_devices)

In [None]:
# === Imports and Setup ===
import pandas as pd
import numpy as np
import pickle
import os
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras.utils import to_categorical
import torch
from transformers import RobertaTokenizer, RobertaModel
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, LSTM, Bidirectional, Layer
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.regularizers import l2
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import json

# === Load and Clean Dataset ===
df = pd.read_csv("final_cleaned_dataset.csv")

# Question texts: missing values become empty strings, everything is a str.
question_column = df["question_text"].fillna("").astype(str)
texts = question_column.tolist()

# Topic labels: same null handling, then trimmed and lower-cased.
topic_column = df["topics"].fillna("").astype(str)
normalized_topics = topic_column.str.strip().str.lower()

# Final pass guarantees plain, whitespace-trimmed strings.
labels = [str(topic).strip() for topic in normalized_topics.tolist()]

# === Load categories from cleaned_questions.json to ensure consistency ===
def get_categories_from_json(file_path="cleaned_questions.json"):
    """Return the sorted, de-duplicated category names found in a JSON file.

    The file is expected to contain a JSON array of objects. For every object
    with a truthy ``"category"`` value, that value is converted to ``str``,
    stripped and lower-cased. Entries that are not objects, and categories
    that are empty after normalization, are ignored.

    Args:
        file_path: Path of the JSON file to read (UTF-8).

    Returns:
        Sorted list of normalized category names. An empty list is returned,
        after printing an error, when the file is missing, is not valid JSON,
        or does not hold a JSON array.
    """
    # Keep the try body limited to the operations that can raise.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: cleaned_questions.json not found at '{file_path}'")
        return []
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from '{file_path}'. Check file format.")
        return []

    if not isinstance(data, list):
        print(f"Error: Expected a JSON array of entries in '{file_path}'.")
        return []

    categories = set()
    for entry in data:
        # Non-object entries have no .get(); skip them instead of crashing.
        if not isinstance(entry, dict):
            continue
        category_name = entry.get("category")
        if not category_name:
            continue
        normalized = str(category_name).strip().lower()
        if normalized:
            categories.add(normalized)
    return sorted(categories)


expected_categories = get_categories_from_json("cleaned_questions.json")

# === Mapping function ===
def map_to_general_category(label_string):
    """Collapse a comma-separated topic string into one general category.

    The input is converted to ``str``, lower-cased and split on commas; each
    piece is stripped. Rules are checked in priority order and the first rule
    with an exact topic match wins:

    1. "depression" or "anxiety" -> "depression & anxiety"
    2. "stress" or "coping"      -> "stress & coping"
    3. "trauma" or "ptsd"        -> "trauma & ptsd"

    Anything else (including empty input) maps to "personality & behaviour".
    """
    found_topics = {piece.strip() for piece in str(label_string).lower().split(",")}

    # Ordered (trigger topics, resulting category) pairs; order encodes priority.
    priority_rules = (
        ({"depression", "anxiety"}, "depression & anxiety"),
        ({"stress", "coping"}, "stress & coping"),
        ({"trauma", "ptsd"}, "trauma & ptsd"),
    )
    for trigger_topics, general_category in priority_rules:
        if trigger_topics & found_topics:
            return general_category
    return "personality & behaviour"

# === Apply mapping ===
# Every row's raw topic string is collapsed into one general category.
topic_strings = df["topics"].fillna("")
df["general_category"] = topic_strings.apply(map_to_general_category)
texts = df["question_text"].fillna("").astype(str).tolist()
labels = df["general_category"].astype(str).str.strip().str.lower().tolist()

# === Filter dataset ===
# Keep only rows whose mapped label is one of the categories from the JSON file.
texts_filtered, labels_filtered = [], []
for text, label in zip(texts, labels):
    if label in expected_categories:
        texts_filtered.append(text)
        labels_filtered.append(label)
filtered = list(zip(texts_filtered, labels_filtered))

# === Debug ===
print("\n--- Category Debugging ---")
print(f"Categories loaded from cleaned_questions.json: {expected_categories}")
print(f"Unique mapped labels found in dataset: {sorted(set(labels_filtered))}")
print(f"Number of data points before filtering: {len(texts)}")
print(f"Number of data points after filtering (matching JSON categories): {len(texts_filtered)}")

if len(labels_filtered) == 0:
    raise ValueError("No valid labels found in final_cleaned_dataset.csv that match categories in cleaned_questions.json.")
print("--- End Category Debugging ---\n")

# === Encode Labels (single encoder shared by training and inference) ===
# Maps the string categories to integer class ids.
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels_filtered)

# The learned classes should be the string category names.
print(f"Label Encoder classes (fitted on string topics): {label_encoder.classes_}")

# Persist the fitted encoder for inference.
# NOTE(review): the mood-classifier cell earlier in this notebook writes its
# own encoder to the same "mood_encoder.pkl" path, so whichever cell runs last
# overwrites the other's encoder — confirm this is intended.
with open("mood_encoder.pkl", "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

# === Load RoBERTa and Generate Embeddings ===
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Pretrained tokenizer and encoder, moved to the chosen device and switched to
# evaluation mode since the encoder is only used to extract features.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaModel.from_pretrained("roberta-base")
roberta_model = roberta_model.to(device)
roberta_model.eval()

def roberta_embed_pooled(texts, batch_size=16, max_length=128, pooling="pooler"):
    """Embed ``texts`` with RoBERTa and return one vector per text.

    Texts are processed in slices of ``batch_size``; each slice is padded to
    its longest member and truncated to ``max_length`` tokens.

    Args:
        texts: Sliceable sequence of strings (sliced as ``texts[a:b]``).
        batch_size: Number of texts per forward pass.
        max_length: Maximum token length before truncation.
        pooling: How token states are reduced to one vector:
            * ``"pooler"`` (default, previous behaviour): the model's
              ``pooler_output``.
            * ``"mean"``: attention-mask-weighted mean of
              ``last_hidden_state`` (padding tokens are excluded).
            * ``"cls"``: ``last_hidden_state`` of the first token.

    Returns:
        NumPy array with one row per input text.

    Raises:
        ValueError: if ``pooling`` is not one of the supported values, or
            (from ``np.vstack``) when ``texts`` is empty.

    NOTE(review): the Transformers documentation cautions that the pooler
    output is often not a good summary of the input and suggests averaging
    the hidden states instead; ``pooling="mean"`` provides that. Whatever is
    chosen must also be used at inference time, and any cached embedding file
    produced with a different setting must be deleted.
    """
    if pooling not in ("pooler", "mean", "cls"):
        raise ValueError(f"Unsupported pooling mode: {pooling!r}")

    embeddings = []
    # Gradients are not needed for feature extraction.
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            encoded = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
            encoded = {k: v.to(device) for k, v in encoded.items()}
            output = roberta_model(**encoded)
            if pooling == "pooler":
                pooled = output.pooler_output
            elif pooling == "cls":
                pooled = output.last_hidden_state[:, 0]
            else:
                hidden = output.last_hidden_state
                # Zero out padding positions, then divide by the number of
                # real tokens (clamped to avoid division by zero).
                mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
            embeddings.append(pooled.cpu().numpy())
    return np.vstack(embeddings)

embedding_file = "roberta_embeddings.npy"

# X_emb stays None until a usable cache has been loaded; None means "regenerate".
# NOTE(review): the cache is validated by row count only. If the filtering or
# pooling logic changes but the number of rows stays the same, delete
# roberta_embeddings.npy manually to force regeneration.
X_emb = None

if os.path.exists(embedding_file):
    print("📂 Loading cached embeddings...")
    try:
        cached_emb = np.load(embedding_file)
    except (OSError, ValueError, EOFError) as e:
        # Unreadable or corrupt cache file: fall through to regeneration.
        print(f"Error loading cached embeddings: {e}. Re-generating.")
    else:
        if cached_emb.shape[0] == len(texts_filtered):
            X_emb = cached_emb
        else:
            # Previously this case raised FileNotFoundError into a broad
            # `except Exception`, which printed a misleading "Error loading"
            # message with an empty reason.
            print("Warning: Cached embeddings size mismatch. Re-generating embeddings.")

if X_emb is None:
    print("🔄 Generating embeddings...")
    X_emb = roberta_embed_pooled(texts_filtered)
    # np.save overwrites any stale file at this path.
    np.save(embedding_file, X_emb)

# === Split Dataset, then balance ONLY the training data with SMOTE ===
# SMOTE used to run on the whole dataset before the split. Because SMOTE
# interpolates between neighbouring samples, synthetic points built from
# (future) test samples leaked into training and synthetic points landed in
# the test set, which inflates every reported metric. The split now comes
# first and the test set contains real samples only.
labels_encoded = np.asarray(labels_encoded)
num_classes = len(label_encoder.classes_)  # class count from the original encoder

X_train_raw, X_test_raw, Y_train_labels, Y_test_labels = train_test_split(
    X_emb,
    labels_encoded,
    test_size=0.2,
    random_state=42,
    stratify=labels_encoded  # stratify on integer labels, not one-hot
)

label_counts_for_smote = Counter(Y_train_labels.tolist())
print(f"Label counts before SMOTE (training split): {label_counts_for_smote}")

# k_neighbors must be <= (samples in the smallest training class - 1).
min_samples_in_any_class = min(label_counts_for_smote.values())
k_neighbors_val = min(min_samples_in_any_class - 1, 5)  # default 5, capped by the data
if k_neighbors_val < 1:  # a class with a single sample has no neighbours
    print("Warning: Some classes have only one sample (or fewer than k_neighbors). SMOTE might not be effective or will fail for these classes.")
    print("Skipping SMOTE application.")
    X_resampled, Y_resampled = X_train_raw, Y_train_labels
else:
    print(f"Applying SMOTE with k_neighbors={k_neighbors_val}")
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors_val)
    X_resampled, Y_resampled = smote.fit_resample(X_train_raw, Y_train_labels)
    print(f"Label counts after SMOTE: {Counter(Y_resampled)}")


# === One-hot encode labels ===
# The integer labels come from the single fitted label_encoder; do NOT fit a
# new LabelEncoder here, only convert to one-hot.
Y_onehot = to_categorical(Y_resampled, num_classes=num_classes)

# === Final arrays ===
# Embeddings are (num_samples, embedding_dim); a length-1 axis is inserted at
# position 1 so the arrays are 3-D, as the LSTM input requires.
X_train = np.expand_dims(X_resampled, axis=1).astype("float32")
Y_train = Y_onehot.astype("float32")
X_test = np.expand_dims(X_test_raw, axis=1).astype("float32")
Y_test = to_categorical(Y_test_labels, num_classes=num_classes).astype("float32")

# === Attention Layer ===
class AttentionLayer(Layer):
    """Reduce a sequence to one vector using learned per-timestep weights.

    For an input indexed as (batch, time, features), each timestep gets a
    score ``tanh(inputs · W + b)``; the scores are softmax-normalized along
    axis 1 and used to form a weighted sum of the inputs over that axis.

    The bias has one entry per timestep (``input_shape[1]``), so the layer is
    tied to the sequence length it was built with.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weight and the per-timestep bias."""
        feature_dim = input_shape[-1]
        num_steps = input_shape[1]
        self.W = self.add_weight(
            name="att_weight",
            shape=(feature_dim, 1),
            initializer="glorot_uniform",
            trainable=True,
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(num_steps, 1),
            initializer="zeros",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over axis 1."""
        projected = tf.tensordot(inputs, self.W, axes=1)
        scores = tf.nn.tanh(projected + self.b)
        # Normalize across timesteps so the weights sum to 1 per sample.
        step_weights = tf.nn.softmax(scores, axis=1)
        weighted_inputs = step_weights * inputs
        return tf.reduce_sum(weighted_inputs, axis=1)

# === Build Model ===
def build_model(seq_len, embedding_dim, num_classes):
    """Build the (uncompiled) BiLSTM + attention classifier.

    Architecture: bidirectional LSTM (128 units per direction, L2 1e-4 on the
    kernel, full sequence output) -> Dropout(0.4) -> AttentionLayer ->
    Dense(64, relu, L2 1e-4) -> Dropout(0.3) -> softmax over ``num_classes``.

    Args:
        seq_len: Number of timesteps per sample.
        embedding_dim: Feature size of each timestep.
        num_classes: Width of the softmax output layer.

    Returns:
        A Keras ``Model``; the caller is responsible for compiling it.
    """
    sequence_input = Input(shape=(seq_len, embedding_dim))

    recurrent_layer = Bidirectional(
        LSTM(128, return_sequences=True, kernel_regularizer=l2(1e-4))
    )
    features = recurrent_layer(sequence_input)
    features = Dropout(0.4)(features)

    # Collapse the time axis into a single vector.
    summary = AttentionLayer()(features)

    hidden = Dense(64, activation='relu', kernel_regularizer=l2(1e-4))(summary)
    hidden = Dropout(0.3)(hidden)
    class_probs = Dense(num_classes, activation='softmax')(hidden)

    return Model(inputs=sequence_input, outputs=class_probs)

# Model dimensions are read from the prepared arrays:
# X_train is (samples, timesteps, features), Y_train is (samples, classes).
_, n_timesteps, n_features = X_train.shape
model = build_model(n_timesteps, n_features, Y_train.shape[1])

# === Compile Model ===
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# === Class Weights ===
# Balanced weights computed from the integer labels in Y_resampled; the
# resulting dict is keyed by position in the sorted unique-label array.
unique_labels = np.unique(Y_resampled)
class_weights = compute_class_weight(class_weight='balanced', classes=unique_labels, y=Y_resampled)
class_weight_dict = {index: weight for index, weight in enumerate(class_weights)}
print(f"Computed Class Weights: {class_weight_dict}")

In [None]:
import pickle
# Reload the fitted label encoder from disk and show the classes it knows.
# NOTE: pickle.load runs whatever the file contains; only load encoder files
# produced by this project.
encoder_path = "mood_encoder.pkl"
with open(encoder_path, "rb") as encoder_file:
    label_encoder = pickle.load(encoder_file)
print("Label Encoder Classes:", label_encoder.classes_)

In [None]:
# === Compile Model ===
# Learning rate decays by a factor of 0.9 every 500 optimizer steps (staircase).
lr_schedule = ExponentialDecay(
    initial_learning_rate=1e-4,
    decay_steps=500,
    decay_rate=0.9,
    staircase=True
)

optimizer = Adam(learning_rate=lr_schedule)

# Compile
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy']
)


def _make_callbacks():
    """Return fresh callback instances so no state carries over between fits."""
    # No ReduceLROnPlateau: the optimizer already follows a decay schedule.
    return [
        EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True),
        ModelCheckpoint("roberta_bilstm_attention_best.keras", save_best_only=True)
    ]


# === Validation split ===
# Early stopping and checkpoint selection previously monitored the TEST set,
# so the "test" metrics reported below were tuned on the very data they were
# measured on. A validation set is carved out of the training data instead and
# the test set is only touched for the final evaluation.
Y_train_ids = np.argmax(Y_train, axis=1)
try:
    X_fit, X_val, Y_fit, Y_val = train_test_split(
        X_train, Y_train, test_size=0.1, random_state=42, stratify=Y_train_ids
    )
except ValueError:
    # Stratification is impossible when a class is too small; split without it.
    X_fit, X_val, Y_fit, Y_val = train_test_split(
        X_train, Y_train, test_size=0.1, random_state=42
    )

callbacks = _make_callbacks()

try:
    history = model.fit(
        X_fit, Y_fit,
        validation_data=(X_val, Y_val),
        epochs=30,
        batch_size=16,
        class_weight=class_weight_dict,
        callbacks=callbacks,
        verbose=1
    )
except (tf.errors.ResourceExhaustedError, tf.errors.InternalError):
    print("GPU Memory Exhausted. Switching to CPU training with adjusted batch size...")
    tf.keras.backend.clear_session()
    # New callbacks: the previous instances may hold state from the aborted run.
    callbacks = _make_callbacks()
    with tf.device('/CPU:0'):
        model = build_model(X_train.shape[1], X_train.shape[2], Y_train.shape[1])
        model.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])
        history = model.fit(
            X_fit, Y_fit,
            validation_data=(X_val, Y_val),
            epochs=15,
            batch_size=32,
            class_weight=class_weight_dict,
            callbacks=callbacks,
            verbose=1
        )

# === Save Final Model ===
model.save("roberta_bilstm_attention_final.keras")

# === Evaluate Model ===
print("\n--- Evaluating Model ---")
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# === Classification Report and Confusion Matrix ===
Y_pred_probs = model.predict(X_test)
Y_pred = np.argmax(Y_pred_probs, axis=1)
Y_true = np.argmax(Y_test, axis=1)

# Use the full, fixed class list for both reports. Deriving the names from
# np.unique(Y_true) breaks (or mislabels rows) whenever the model predicts a
# class that is absent from Y_true or a class is missing from the test set.
class_ids = np.arange(Y_test.shape[1])
target_names = [str(name) for name in label_encoder.inverse_transform(class_ids)]

print("\n--- Classification Report ---")
print(classification_report(Y_true, Y_pred, labels=class_ids,
                            target_names=target_names, zero_division=0))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix(Y_true, Y_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Side-by-side training curves: accuracy on the left, loss on the right.
plt.figure(figsize=(12, 5))

# (history key, display name) for each panel, in left-to-right order.
panels = (("accuracy", "Accuracy"), ("loss", "Loss"))
for position, (metric_key, display_name) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric_key], label=f"Train {display_name}")
    plt.plot(history.history[f"val_{metric_key}"], label=f"Val {display_name}")
    plt.xlabel("Epochs")
    plt.ylabel(display_name)
    plt.title(f"Training vs Validation {display_name}")
    plt.legend()

plt.tight_layout()
plt.show()

In [5]:
import pandas as pd

# Define input and output file paths
input_file = 'cleaned_dataset.csv'
output_file = 'Final_Chatbot_Dataset.csv'

print(f"--- Preparing '{input_file}' for chatbot response bank ---")

try:
    # Load the new dataset
    df = pd.read_csv(input_file)
    print(f"Original shape of '{input_file}': {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    # Rename columns to match chatbot's expected structure
    df.rename(columns={
        'question_text': 'Context',
        'answer_text': 'Response',
        'topics': 'category'
    }, inplace=True)
    print(f"Columns after renaming: {df.columns.tolist()}")

    # Drop rows where 'Context', 'Response', or 'category' are missing
    initial_rows = df.shape[0]
    df_cleaned_for_chatbot = df.dropna(subset=['Context', 'Response', 'category'])
    removed_rows_na = initial_rows - df_cleaned_for_chatbot.shape[0]
    print(f"Removed {removed_rows_na} rows due to missing 'Context', 'Response', or 'category'.")

    # Optionally, remove duplicates again based on 'Context' and 'Response'
    initial_rows_after_na = df_cleaned_for_chatbot.shape[0]
    df_cleaned_for_chatbot.drop_duplicates(subset=['Context', 'Response'], keep='first', inplace=True)
    removed_rows_duplicates = initial_rows_after_na - df_cleaned_for_chatbot.shape[0]
    print(f"Removed {removed_rows_duplicates} duplicate rows based on 'Context' and 'Response'.")

    # === CRITICAL: Define the mapping for categories ===
    # Map granular categories from your CSV's 'topics' column
    # to the broader categories your LabelEncoder understands.
    # Adjust this mapping as needed based on your model's training.
    category_mapping = {
        # Depression & Anxiety
        'depression': 'depression & anxiety',
        'anxiety': 'depression & anxiety',
        'Anxiety': 'depression & anxiety', # Handle capitalization
        'Depression': 'depression & anxiety',
        'depression & anxiety': 'depression & anxiety', # If already combined
        'Depression,Anxiety': 'depression & anxiety', # If comma-separated
        'Anxiety,Depression': 'depression & anxiety',
        'Depression,Anxiety,Relationships': 'depression & anxiety', # Prioritize core categories
        'Anxiety,Depression,Behavioral Change': 'depression & anxiety',
        'Anxiety,Depression,Stress,Relationships': 'depression & anxiety',
        'Anxiety,Depression,Sleep Improvement': 'depression & anxiety',
        'Anxiety,Depression,Self-esteem': 'depression & anxiety',
        'Anxiety,Depression,Legal & Regulatory': 'depression & anxiety',
        'Depression,Anxiety,Diagnosis': 'depression & anxiety',
        'Depression,Anxiety': 'depression & anxiety', # Adding again for robustness
        'Depression,Anxiety,Behavioral Change,Marriage': 'depression & anxiety',
        'Depression,Anxiety,Sleep Improvement': 'depression & anxiety',


        # Stress & Coping
        'stress': 'stress & coping',
        'Stress': 'stress & coping',
        'stress & coping': 'stress & coping',
        'Anger Management': 'stress & coping',
        'anger-management': 'stress & coping',
        'Sleep Improvement': 'stress & coping',
        'sleep-improvement': 'stress & coping',
        'Behavioral Change': 'stress & coping',
        'behavioral-change': 'stress & coping',
        'Stress,Eating Disorders': 'stress & coping', # Prioritize core categories
        'Stress,Family Conflict': 'stress & coping',
        'Stress,Workplace Relationships': 'stress & coping',
        'Stress,Anxiety': 'stress & coping', # If linked with anxiety, map to broader
        'Anxiety,Stress': 'stress & coping',
        'Anger Management,Sleep Improvement': 'stress & coping',
        'Anger Management,Behavioral Change': 'stress & coping',
        'Behavioral Change,Anxiety': 'stress & coping',
        'Behavioral Change,Sleep Improvement': 'stress & coping',
        'Depression,Stress': 'stress & coping',


        # Trauma & PTSD
        'trauma': 'trauma & ptsd',
        'Trauma': 'trauma & ptsd',
        'trauma & ptsd': 'trauma & ptsd',
        'grief-and-loss': 'trauma & ptsd',
        'Grief and Loss': 'trauma & ptsd',
        'domestic-violence': 'trauma & ptsd',
        'Domestic Violence': 'trauma & ptsd',
        'military-issues': 'trauma & ptsd',
        'Military Issues ': 'trauma & ptsd',
        'Trauma,Relationships': 'trauma & ptsd', # Prioritize core categories
        'Trauma,Human Sexuality': 'trauma & ptsd',
        'Trauma,Anxiety': 'trauma & ptsd',
        'Trauma,Depression': 'trauma & ptsd',
        'Trauma,Depression,Anxiety': 'trauma & ptsd',
        'Grief and Loss,Trauma,Anxiety': 'trauma & ptsd',
        'Trauma,Grief and Loss': 'trauma & ptsd',
        'Domestic Violence,Anger Management': 'trauma & ptsd',
        'Domestic Violence,Relationships': 'trauma & ptsd',
        'Domestic Violence,Family Conflict': 'trauma & ptsd',
        'Domestic Violence,Sleep Improvement': 'trauma & ptsd',
        'Domestic Violence,Legal & Regulatory': 'trauma & ptsd',
        'Trauma,Stress,Anxiety,Anger Management': 'trauma & ptsd',
        'Trauma,Family Conflict': 'trauma & ptsd',
        'Trauma,Self-esteem,Relationship Dissolution ': 'trauma & ptsd',
        'Intimacy,Trauma': 'trauma & ptsd',
        'Trauma,Military Issues ': 'trauma & ptsd',
        'Trauma,Depression,Relationships,Intimacy': 'trauma & ptsd',
        'Family Conflict,Trauma': 'trauma & ptsd',
        'Grief and Loss,Substance Abuse,Trauma': 'trauma & ptsd',
        'Human Sexuality,Trauma,Intimacy': 'trauma & ptsd',
        'Human Sexuality,Trauma,Intimacy,Relationships': 'trauma & ptsd',


        # Personality & Behaviour (catch-all for other related categories)
        'personality & behaviour': 'personality & behaviour',
        'parenting': 'personality & behaviour',
        'self-esteem': 'personality & behaviour',
        'relationship-dissolution': 'personality & behaviour',
        'workplace-relationships': 'personality & behaviour',
        'spirituality': 'personality & behaviour',
        'intimacy': 'personality & behaviour',
        'substance-abuse': 'personality & behaviour',
        'family-conflict': 'personality & behaviour',
        'marriage': 'personality & behaviour',
        'eating-disorders': 'personality & behaviour',
        'relationships': 'personality & behaviour',
        'lgbtq': 'personality & behaviour',
        'addiction': 'personality & behaviour',
        'legal-regulatory': 'personality & behaviour',
        'professional-ethics': 'personality & behaviour',
        'human-sexuality': 'personality & behaviour',
        'social-relationships': 'personality & behaviour',
        'children-adolescents': 'personality & behaviour',
        'diagnosis': 'personality & behaviour',
        'counseling-fundamentals': 'personality & behaviour',
        'Family Conflict': 'personality & behaviour', # Capitalization
        'Self-esteem': 'personality & behaviour',
        'Parenting': 'personality & behaviour',
        'Relationship Dissolution ': 'personality & behaviour',
        'Workplace Relationships': 'personality & behaviour',
        'Spirituality': 'personality & behaviour',
        'Intimacy': 'personality & behaviour',
        'Substance Abuse': 'personality & behaviour',
        'Marriage': 'personality & behaviour',
        'Eating Disorders': 'personality & behaviour',
        'Relationships': 'personality & behaviour',
        'LGBTQ': 'personality & behaviour',
        'Addiction': 'personality & behaviour',
        'Legal & Regulatory': 'personality & behaviour',
        'Professional Ethics': 'personality & behaviour',
        'Human Sexuality': 'personality & behaviour',
        'Social Relationships': 'personality & behaviour',
        'Children & Adolescents': 'personality & behaviour',
        'Diagnosis': 'personality & behaviour',
        'Counseling Fundamentals ': 'personality & behaviour',

        # Compound Categories (will be mapped to one of the 4 main ones)
        'Substance Abuse,Addiction': 'personality & behaviour',
        'Behavioral Change,Social Relationships': 'personality & behaviour',
        'Professional Ethics,Legal & Regulatory': 'personality & behaviour',
        'Relationships,Marriage': 'personality & behaviour',
        'Marriage,Intimacy': 'personality & behaviour',
        'Family Conflict,Children & Adolescents': 'personality & behaviour',
        'Marriage,Relationship Dissolution ': 'personality & behaviour',
        'Relationships,Intimacy': 'personality & behaviour',
        'Anger Management,Parenting': 'personality & behaviour',
        'Family Conflict,Self-esteem,Parenting,Anxiety': 'personality & behaviour',
        'Human Sexuality,Marriage': 'personality & behaviour',
        'Spirituality,Family Conflict': 'personality & behaviour',
        'Social Relationships,Anxiety,Depression': 'personality & behaviour',
        'Family Conflict,Relationships': 'personality & behaviour',
        'Self-esteem,Relationships': 'personality & behaviour',
        'Family Conflict,Marriage': 'personality & behaviour',
        'Family Conflict,Self-esteem': 'personality & behaviour',
        'Parenting,Relationships': 'personality & behaviour',
        'Anxiety,Career Counseling': 'personality & behaviour',
        'Relationships,Self-esteem': 'personality & behaviour',
        'Relationships,Anxiety': 'personality & behaviour',
        'Eating Disorders,Addiction': 'personality & behaviour',
        'Workplace Relationships,Professional Ethics': 'personality & behaviour',
        'Anxiety,Spirituality': 'personality & behaviour',
        'Relationship Dissolution ,Relationships,Domestic Violence': 'personality & behaviour',
        'Parenting,Substance Abuse,Spirituality': 'personality & behaviour',
        'Self-esteem,Relationship Dissolution ': 'personality & behaviour',
        'Relationship Dissolution ,Marriage': 'personality & behaviour',
        'Relationship Dissolution ,Depression,Self-esteem': 'personality & behaviour',
        'Depression,Anger Management': 'personality & behaviour',
        'Parenting,Anger Management,Family Conflict': 'personality & behaviour',
        'Marriage,Family Conflict,Professional Ethics,Legal & Regulatory': 'personality & behaviour',
        'Relationships,Human Sexuality,LGBTQ': 'personality & behaviour',
        'Relationships,Parenting,Family Conflict': 'personality & behaviour',
        'LGBTQ,Intimacy': 'personality & behaviour',
        'Relationship Dissolution ,Depression': 'personality & behaviour',
        'Anger Management,Relationships': 'personality & behaviour',
        'Substance Abuse,Family Conflict': 'personality & behaviour',
        'Anxiety,Social Relationships,Self-esteem': 'personality & behaviour',
        'Self-esteem,Marriage,Trauma,Intimacy': 'personality & behaviour',
        'Marriage,Addiction': 'personality & behaviour',
        'Relationships,Legal & Regulatory': 'personality & behaviour',
        'Human Sexuality,Relationships': 'personality & behaviour',
        'Family Conflict,Relationships,Marriage': 'personality & behaviour',
        'Marriage,Anger Management': 'personality & behaviour',
        'Relationships,Family Conflict': 'personality & behaviour',
        'Anxiety,Behavioral Change': 'personality & behaviour',
        'Relationships,Depression': 'personality & behaviour',
        'Human Sexuality,Social Relationships': 'personality & behaviour',
        'Self-esteem,Eating Disorders': 'personality & behaviour',
        'Career Counseling,Professional Ethics': 'personality & behaviour',
        'Marriage,Grief and Loss': 'personality & behaviour',
        'Self-esteem,Social Relationships': 'personality & behaviour',
        'Depression,Relationships': 'personality & behaviour',
        'Addiction,Substance Abuse': 'personality & behaviour',
        'Workplace Relationships,Social Relationships': 'personality & behaviour',
        'Eating Disorders,Human Sexuality,Addiction': 'personality & behaviour',
        'Intimacy,Relationships': 'personality & behaviour',
        'Depression,Family Conflict': 'personality & behaviour',
        'Depression,Social Relationships': 'personality & behaviour',
        'Relationships,Self-esteem,Human Sexuality': 'personality & behaviour',
        'Behavioral Change,Depression': 'personality & behaviour',
        'Relationships,Human Sexuality': 'personality & behaviour',
        'Marriage,Family Conflict': 'personality & behaviour',
        'Relationships,Self-esteem,Anxiety': 'personality & behaviour',
        'Anxiety,Self-esteem,Workplace Relationships': 'personality & behaviour',
        'Human Sexuality,Intimacy,Marriage': 'personality & behaviour',
        'Relationships,Parenting': 'personality & behaviour',
        'Relationships,Family Conflict,Parenting': 'personality & behaviour',
        'Relationships,Social Relationships,Intimacy': 'personality & behaviour',
        'Anger Management,Domestic Violence': 'personality & behaviour',
        'Parenting,Family Conflict': 'personality & behaviour',
        'Anxiety,Parenting': 'personality & behaviour',
        'Family Conflict,Legal & Regulatory': 'personality & behaviour',
        'Domestic Violence,Marriage': 'personality & behaviour',
        'Intimacy,Human Sexuality,Relationships': 'personality & behaviour',
        'Intimacy,Social Relationships': 'personality & behaviour',
        'Anger Management,Relationships,Social Relationships': 'personality & behaviour',
        'Relationships,Relationship Dissolution ,Intimacy': 'personality & behaviour',
        'Relationships,Behavioral Change,Anxiety': 'personality & behaviour',
        'Family Conflict,Depression': 'personality & behaviour',
        'Parenting,Depression,Behavioral Change,Stress': 'personality & behaviour',
        'Family Conflict,Parenting,Marriage': 'personality & behaviour',
        'Marriage,Intimacy,Human Sexuality': 'personality & behaviour',
        'Self-esteem,Depression,Anxiety': 'personality & behaviour',
        'Marriage,Relationships,Intimacy': 'personality & behaviour',
        'Relationships,Human Sexuality,Family Conflict,Spirituality': 'personality & behaviour',
        'Relationships,Intimacy,Human Sexuality': 'personality & behaviour',
        'Parenting,Relationship Dissolution ,Family Conflict': 'personality & behaviour',
        'Addiction,Marriage,Intimacy': 'personality & behaviour',
        'Family Conflict,Social Relationships': 'personality & behaviour',
        'Anger Management,Depression,Relationships': 'personality & behaviour',
        'Social Relationships,Relationships,Intimacy': 'personality & behaviour',
        'Relationship Dissolution ,Depression,Social Relationships': 'personality & behaviour',
        'LGBTQ,Relationships,Intimacy': 'personality & behaviour',
        'Anger Management,Social Relationships,Relationships': 'personality & behaviour',
        'Anxiety,Social Relationships': 'personality & behaviour',
        'Intimacy,Marriage': 'personality & behaviour',
        'Addiction,Depression,Self-harm': 'personality & behaviour',
        'Addiction,Substance Abuse,Anxiety': 'personality & behaviour',
        'Depression,Grief and Loss': 'personality & behaviour',
        'Relationships,Anxiety,Self-esteem': 'personality & behaviour',
        'Anxiety,Relationships,Behavioral Change': 'personality & behaviour',
        'Family Conflict,Relationships,Intimacy': 'personality & behaviour',
        'Marriage,Relationship Dissolution ,Intimacy': 'personality & behaviour',
        'Family Conflict,LGBTQ': 'personality & behaviour',
        'Anxiety,Family Conflict': 'personality & behaviour',
        'Self-esteem,Stress,Anger Management': 'personality & behaviour',
        'Spirituality,Relationships': 'personality & behaviour',
        'Sleep Improvement,Anxiety': 'personality & behaviour',
        'Relationships,Workplace Relationships': 'personality & behaviour',
        'Human Sexuality,Intimacy,Relationships': 'personality & behaviour',
        'Human Sexuality,Intimacy': 'personality & behaviour',
        'Grief and Loss,Depression': 'personality & behaviour',
        'Marriage,Relationship Dissolution ,Behavioral Change': 'personality & behaviour',
        'Professional Ethics,Legal & Regulatory,Addiction': 'personality & behaviour',
        'Social Relationships,Substance Abuse': 'personality & behaviour',
        'Human Sexuality,Anxiety': 'personality & behaviour',
        'Relationships,Social Relationships': 'personality & behaviour',
        'Intimacy,Human Sexuality,Marriage': 'personality & behaviour',
        'Self-esteem,Sleep Improvement': 'personality & behaviour',
        'Social Relationships,Self-esteem': 'personality & behaviour',
        'Family Conflict,Stress,Sleep Improvement': 'personality & behaviour',
        'LGBTQ,Family Conflict': 'personality & behaviour',
        'Social Relationships,Children & Adolescents': 'personality & behaviour',
        'Behavioral Change,LGBTQ': 'personality & behaviour',
        'Grief and Loss,Family Conflict': 'personality & behaviour',
        'Self-esteem,Behavioral Change': 'personality & behaviour',
        'Social Relationships,Relationships,Addiction': 'personality & behaviour',
        'Social Relationships,LGBTQ': 'personality & behaviour',
        'Self-esteem,LGBTQ': 'personality & behaviour',
        'Legal & Regulatory,Professional Ethics': 'personality & behaviour',
        'Relationships,Professional Ethics,Parenting,Legal & Regulatory': 'personality & behaviour',
        'Eric Ström, JD, MA, LMHC': 'personality & behaviour', # This looks like a specific entity, not a category
        'Relationships,Trauma': 'personality & behaviour',
        'Domestic Violence,Sleep Improvement': 'personality & behaviour',
        'Human Sexuality,LGBTQ': 'personality & behaviour',
        'Social Relationships,Depression': 'personality & behaviour',
        'Relationship Dissolution ,Social Relationships': 'personality & behaviour',
        'Intimacy,Relationships,Human Sexuality': 'personality & behaviour',
        'Workplace Relationships,Career Counseling': 'personality & behaviour',
        'Behavioral Change,Relationships': 'personality & behaviour',
        "Alzheimer's,Family Conflict": 'personality & behaviour',
        'Self-esteem,Depression': 'personality & behaviour',
        'Human Sexuality,Relationships,Intimacy': 'personality & behaviour',
        'Social Relationships,Family Conflict': 'personality & behaviour',
        'Family Conflict,Anger Management': 'personality & behaviour',
        'Human Sexuality': 'personality & behaviour', # Added this if not mapped
        ' trauma history or having a cold the time it occurred.</span><span style=""line-height: 1.42857;"">&nbsp;&nbsp;</span>Working with someone who utilizes a behavioral approach': 'personality & behaviour', # Catch-all for very unusual entries

        # Default for unmapped categories
        'Unknown': 'personality & behaviour' # Assign unmapped to a default broad category
    }

    # Apply the mapping. Use .get() with a default value to handle unmapped categories.
    df_cleaned_for_chatbot['category'] = df_cleaned_for_chatbot['category'].apply(
        lambda x: category_mapping.get(x, 'personality & behaviour') # Default to personality & behaviour
    )

    print(f"Final shape of data for chatbot: {df_cleaned_for_chatbot.shape}")

    # Save the prepared dataset
    df_cleaned_for_chatbot.to_csv(output_file, index=False)
    print(f"Prepared dataset saved to: '{output_file}'")

    print("\nUnique categories found in the prepared dataset (after mapping):")
    unique_categories_after_mapping = df_cleaned_for_chatbot['category'].unique()
    print(unique_categories_after_mapping)

except FileNotFoundError:
    print(f"Error: The file '{input_file}' was not found. Please ensure it's uploaded.")
except Exception as e:
    print(f"An error occurred during preparation: {e}")

--- Preparing 'cleaned_dataset.csv' for chatbot response bank ---
Original shape of 'cleaned_dataset.csv': (10485, 5)
Columns: ['question_title', 'question_text', 'topics', 'answer_text', 'split']
Columns after renaming: ['question_title', 'Context', 'category', 'Response', 'split']
Removed 6979 rows due to missing 'Context', 'Response', or 'category'.
Removed 123 duplicate rows based on 'Context' and 'Response'.
Final shape of data for chatbot: (3383, 5)
Prepared dataset saved to: 'Final_Chatbot_Dataset.csv'

Unique categories found in the prepared dataset (after mapping):
['depression & anxiety' 'personality & behaviour' 'trauma & ptsd'
 'stress & coping']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_for_chatbot.drop_duplicates(subset=['Context', 'Response'], keep='first', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned_for_chatbot['category'] = df_cleaned_for_chatbot['category'].apply(


In [6]:
import json
import random
import pickle
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Layer
import stanza
import pandas as pd

# === Custom Attention Layer (for model loading) ===
class AttentionLayer(Layer):
    """Attention pooling that collapses axis 1 of its input into one vector.

    A score is computed per position along axis 1, normalised with a softmax
    over that axis, and used to take a weighted sum of the inputs.
    The class must be supplied through ``custom_objects`` when the saved
    model is loaded.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weight and the per-position bias."""
        # presumably input_shape is (batch, steps, features) — TODO confirm
        feature_dim = input_shape[-1]
        step_count = input_shape[1]
        # Creation order and names are kept stable so saved weights still map.
        self.W = self.add_weight(
            name="att_weight",
            shape=(feature_dim, 1),
            initializer="glorot_uniform",
            trainable=True,
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(step_count, 1),
            initializer="zeros",
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over axis 1."""
        projected = tf.tensordot(inputs, self.W, axes=1)
        score = tf.nn.tanh(projected + self.b)
        attention_weights = tf.nn.softmax(score, axis=1)
        return tf.reduce_sum(attention_weights * inputs, axis=1)

# === Load Model and Encoder ===
# Binds two module-level names used by the rest of the cell:
#   model         - the Keras classifier read from 'roberta_bilstm_attention_final.keras'
#   label_encoder - the object unpickled from 'mood_encoder.pkl'
# It then prints the encoder's type and classes as a sanity check.
try:
    # Ensure the custom AttentionLayer is passed when loading the model
    model = load_model("roberta_bilstm_attention_final.keras", custom_objects={"AttentionLayer": AttentionLayer})
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # 'mood_encoder.pkl' that this project produced itself.
    with open("mood_encoder.pkl", "rb") as f:
        label_encoder = pickle.load(f)

    print("\n--- Label Encoder Information ---")
    print(f"Type of Label Encoder: {type(label_encoder)}")
    print(f"Classes known by the Label Encoder (these should be your category NAMES):")
    try:
        print(label_encoder.classes_)
    except AttributeError:
        # The unpickled object has no .classes_, so it is not a fitted LabelEncoder.
        print("    (Could not access .classes_ attribute. Is it a scikit-learn LabelEncoder?)")
    print("---------------------------------\n")

except FileNotFoundError as e:
    # Only FileNotFoundError is handled here; any other load failure propagates.
    # NOTE(review): some Keras versions may report a missing .keras path with a
    # different exception type (e.g. ValueError) — confirm against the installed version.
    # NOTE(review): exit() is the interactive-site helper; its effect inside a
    # notebook kernel differs from a plain script — confirm this is intended.
    print(f"Error loading model or encoder: {e}. Make sure 'roberta_bilstm_attention_final.keras' and 'mood_encoder.pkl' are in the correct directory.")
    exit()

# === Load RoBERTa ===
# Binds the module-level names `device`, `tokenizer` and `roberta_model`
# that embed_text() relies on.
# Check if CUDA (GPU) is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# NOTE(review): the load-time warning captured in this notebook reports
# 'roberta.pooler.dense.*' as newly initialized, i.e. not taken from the
# checkpoint — confirm this is acceptable for anything that reads pooler_output.
roberta_model = RobertaModel.from_pretrained("roberta-base").to(device)
roberta_model.eval() # Set model to evaluation mode

# === Function to Embed Text ===
def embed_text(text):
    """Encode ``text`` with RoBERTa and return it as a (1, 1, -1) float32 array.

    The text is tokenized as a one-item batch (truncated to 128 tokens),
    run through ``roberta_model`` without gradient tracking, and the model's
    ``pooler_output`` is reshaped for the Keras classifier.

    NOTE(review): the warning printed when the model was loaded in this
    notebook lists the pooler weights as newly initialized, so the values
    returned here may differ between sessions — confirm this matches how the
    training embeddings were produced.
    """
    encoded = tokenizer([text], padding=True, truncation=True, max_length=128, return_tensors='pt')
    # Every tensor has to live on the same device as the model.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():  # inference only, no autograd graph needed
        sentence_vector = roberta_model(**encoded).pooler_output
    as_numpy = sentence_vector.cpu().numpy()
    return as_numpy.reshape(1, 1, -1).astype("float32")

# === Context Memory ===
class ContextMemory:
    """Sliding window holding the most recent user inputs.

    ``window`` is a plain list ordered oldest-first; once it grows past
    ``max_len`` the oldest entry is discarded.
    """

    def __init__(self, max_len=3):
        self.max_len = max_len
        self.window = []

    def add(self, user_input):
        """Append ``user_input`` and drop the oldest entry on overflow."""
        self.window.append(user_input)
        if len(self.window) <= self.max_len:
            return
        del self.window[0]  # evict the oldest input

    def get_context(self):
        """Return the remembered inputs joined by single spaces."""
        return " ".join(self.window)


context_memory = ContextMemory()

# === Load Response Bank from Cleaned CSV ===
def load_responses_from_cleaned_csv(file_path="Final_Chatbot_Dataset.csv"):
    """
    Build the response bank from the prepared CSV file.

    The CSV must contain 'Context', 'Response' and 'category' columns.
    Responses are grouped by their (already mapped) category string.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        dict mapping category -> list of response strings. An empty dict is
        returned when the file is missing, lacks a required column, or any
        other error occurs while reading it (the error is printed).
    """
    print(f"\n--- Loading responses from CSV: {file_path} ---")
    required_columns = ['Context', 'Response', 'category']
    try:
        df = pd.read_csv(file_path)

        # Check for required columns
        if not all(col in df.columns for col in required_columns):
            print(f"Error: CSV file '{file_path}' must contain all of {required_columns}.")
            return {}

        # read_csv turns empty cells into NaN, and str(NaN) is the non-empty
        # text 'nan'. Drop those rows first so a blank response/category is
        # never stored as the literal string "nan".
        usable = df.dropna(subset=['category', 'Response'])

        topic_to_responses = {}
        loaded = 0
        for raw_category, raw_response in zip(usable['category'], usable['Response']):
            category = str(raw_category).strip()
            response = str(raw_response).strip()

            # Skip rows whose category or response is only whitespace
            if not category or not response:
                continue

            # The 'category' column is already mapped to the label encoder's
            # class names, so it is used as the key unchanged.
            topic_to_responses.setdefault(category, []).append(response)
            loaded += 1

        # Report the rows actually kept, not the raw row count of the file.
        print(f"Successfully loaded {loaded} entries from '{file_path}'.")
        print(f"Categories loaded from CSV: {list(topic_to_responses.keys())}")
        return topic_to_responses

    except FileNotFoundError:
        print(f"Error: Cleaned CSV response bank file '{file_path}' not found. Ensure it's in the correct directory.")
        return {}
    except Exception as e:
        # Best-effort loader: any other read/parse problem yields an empty bank.
        print(f"An error occurred loading CSV response bank from '{file_path}': {e}.")
        return {}

# === Load Response Bank from JSON (Original Function - for reference/fallback) ===
# You can uncomment the line below and comment the CSV loading line if you prefer to use JSON
def load_responses_from_json(file_path="cleaned_questions.json"):
    """
    Load a response bank from a JSON file.

    The file is expected to hold a list of entries shaped like
    ``{"category": "...", "questions": [{"questions": [...]}, ...]}``.
    The inner "questions" lists are used as the responses for the category.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        dict mapping category -> list of responses. Entries without a
        "category" key are grouped under "Unknown". An empty dict is returned
        when the file is missing or is not valid JSON (a warning is printed).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Warning: Response bank JSON file '{file_path}' not found. Using an empty response bank.")
        return {}
    except json.JSONDecodeError:
        print(f"Warning: Could not decode JSON from '{file_path}'. Check file format.")
        return {}

    topic_to_responses = {}
    for entry in data:
        category = entry.get("category", "Unknown")
        # Accumulate instead of assigning: several entries may share a
        # category (notably the "Unknown" default) and a plain assignment
        # would silently discard everything but the last one.
        bucket = topic_to_responses.setdefault(category, [])
        for section in entry.get("questions", []):
            bucket.extend(section.get("questions", []))
    return topic_to_responses

# === Response bank source ===
# Default source: the prepared CSV produced by the dataset-preparation cell.
response_bank = load_responses_from_cleaned_csv(file_path="Final_Chatbot_Dataset.csv")

# To use the JSON file instead, comment out the assignment above and
# uncomment the one below:
# response_bank = load_responses_from_json(file_path="cleaned_questions.json")

# --- Diagnostic output: the category keys the chatbot can look up ---
available_categories = list(response_bank)
print("\n--- Response Bank Categories (from your chosen source) ---")
print("These are the actual string categories available to the chatbot:")
print(available_categories)
print("----------------------------------------------------------\n")


# === NLP Enrichment with Stanza ===
# Binds the module-level `nlp` pipeline used by enrich_with_nlp().
# If the pipeline cannot be created, `nlp` is set to None and the error is
# printed, so the chatbot keeps running without entity extraction.
try:
    # stanza.download('en') # Uncomment this line if you're running for the first time
    # and Stanza models are not yet downloaded. This may take some time.
    nlp = stanza.Pipeline('en')
except Exception as e:
    print(f"Error initializing Stanza: {e}. NLP enrichment might not work.")
    nlp = None

def enrich_with_nlp(text):
    """
    Run the Stanza pipeline on ``text`` and summarise the named entities.

    Returns a one-line insight string listing ``(entity text, entity type)``
    pairs, or an empty string when the pipeline is unavailable, no entity is
    found, or the pipeline raises (the error is printed).
    """
    if not nlp:
        return ""

    try:
        parsed = nlp(text)
        entities = []
        for sentence in parsed.sentences:
            for ent in sentence.ents:
                entities.append((ent.text, ent.type))
        if entities:
            return f"🧠 NLP Insight — Entities: {entities}"
    except Exception as e:
        print(f"Error during Stanza NLP enrichment: {e}")
        return ""

    return ""

# === Select a Response ===
def select_response(predicted_topic, response_bank):
    """
    Pick one response at random for ``predicted_topic``.

    When the topic is absent from ``response_bank`` or has no responses, a
    generic supportive prompt is returned instead.
    """
    candidates = response_bank.get(predicted_topic, [])
    if not candidates:
        # Nothing stored for this topic: fall back to the generic prompt.
        return "I'm here to support you. Can you share more about how you're feeling?"
    return random.choice(candidates)

# === Final Chatbot Function ===
def chatbot_reply(user_input):
    """
    Produce the chatbot's reply for one user message.

    The message is pushed into the shared context memory, the joined context
    is embedded and classified, the predicted label is matched
    case-insensitively against the response-bank keys, and a response for that
    topic is returned together with any NLP insight for the raw message.
    Diagnostic details for the prediction are printed along the way.
    """
    # Remember the message, then classify the whole recent context.
    context_memory.add(user_input)
    context = context_memory.get_context()

    embedding = embed_text(context)
    prediction = model.predict(embedding, verbose=0)[0]
    predicted_index = np.argmax(prediction)
    raw_predicted_topic = label_encoder.inverse_transform([predicted_index])[0]

    # Case-insensitive match so the exact response_bank key is used for lookup.
    wanted = str(raw_predicted_topic).lower()
    predicted_topic_normalized = next(
        (bank_key for bank_key in response_bank if bank_key.lower() == wanted),
        None,
    )

    if predicted_topic_normalized is None:
        # No matching key: keep the encoder's label as a plain string.
        predicted_topic_normalized = str(raw_predicted_topic)
        print(f"Warning: Could not find a normalized key in response_bank for: '{raw_predicted_topic}'. This might lead to generic responses.")

    topic_is_known = predicted_topic_normalized in response_bank

    # --- Diagnostic output for this prediction ---
    print("\n--- Current Prediction Debugging ---")
    print(f"Input for embedding: '{context}'")
    print(f"Model raw prediction (softmax output): {prediction}")
    print(f"Predicted index from model output: {predicted_index}")
    print(f"Predicted topic from label_encoder: '{raw_predicted_topic}' (Type: {type(raw_predicted_topic)})")
    print(f"Normalized topic for lookup: '{predicted_topic_normalized}' (Type: {type(predicted_topic_normalized)})")
    print(f"Is '{predicted_topic_normalized}' found in response_bank keys? {topic_is_known}")
    if not topic_is_known:
        print("     -> WARNING: Predicted topic does NOT match any category in your response bank.")
        print("     -> This is likely why you are getting the generic fallback response.")
    print("------------------------------------\n")

    # Entities are extracted from the latest message only, not the whole context.
    response = select_response(predicted_topic_normalized, response_bank)
    nlp_info = enrich_with_nlp(user_input)

    return f"📘 Detected Topic: {predicted_topic_normalized}\n💬 Suggested Response: {response}\n{nlp_info}"

# === Interactive Chat Loop ===
# Read user messages until 'quit' (any case, surrounding whitespace ignored),
# end of input, or Ctrl+C; print the bot's reply for every non-blank message.
print("Chatbot initialized. Type 'quit' to exit.")
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # stdin closed or the user interrupted: leave cleanly, no traceback.
        print("\nExiting chatbot. Goodbye!")
        break

    command = user_input.strip()
    if command.lower() == 'quit':
        print("Exiting chatbot. Goodbye!")
        break
    if not command:
        # A blank line carries nothing to classify and would only pad the
        # context window, so ask again.
        continue

    reply = chatbot_reply(user_input)
    print("Bot:", reply)


--- Label Encoder Information ---
Type of Label Encoder: <class 'sklearn.preprocessing._label.LabelEncoder'>
Classes known by the Label Encoder (these should be your category NAMES):
['depression & anxiety' 'personality & behaviour' 'stress & coping'
 'trauma & ptsd']
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-30 20:59:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES



--- Loading responses from CSV: Final_Chatbot_Dataset.csv ---
Successfully loaded 3383 entries from 'Final_Chatbot_Dataset.csv'.
Categories loaded from CSV: ['depression & anxiety', 'personality & behaviour', 'trauma & ptsd', 'stress & coping']

--- Response Bank Categories (from your chosen source) ---
These are the actual string categories available to the chatbot:
['depression & anxiety', 'personality & behaviour', 'trauma & ptsd', 'stress & coping']
----------------------------------------------------------



Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 428kB [00:00, 5.45MB/s]                    
2025-05-30 20:59:24 INFO: Downloaded file to C:\Users\soham\stanza_resources\resources.json
2025-05-30 20:59:25 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2025-05-30 20:59:25 INFO: Using device: cpu
2025-05-30 20:59:25 INFO: Loading: tokenize
2025-05-30 20:59:25 INFO: Loading: mwt
2025-05-30 20:59:25 INFO: Loading: pos
2025-05-30 20:59:27 INFO: Loading: lemma
2025-05-30 20:59:28 INFO: Lo

Chatbot initialized. Type 'quit' to exit.

--- Current Prediction Debugging ---
Input for embedding: 'Hello '
Model raw prediction (softmax output): [0.32846504 0.07460541 0.09264397 0.5042856 ]
Predicted index from model output: 3
Predicted topic from label_encoder: 'trauma & ptsd' (Type: <class 'numpy.str_'>)
Normalized topic for lookup: 'trauma & ptsd' (Type: <class 'str'>)
Is 'trauma & ptsd' found in response_bank keys? True
------------------------------------

Bot: 📘 Detected Topic: trauma & ptsd
💬 Suggested Response: I am so sorry that this happened to you and am so glad that you were able to get away.  Your body is yours and yours alone.  I would highly recommend that you find a therapist who specializes in treating trauma in order to help you to heal from your ordeal.

Exiting chatbot. Goodbye!
