In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.initializers import Constant
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
import warnings

# Suppress warnings for cleaner output
# NOTE: no category is given, so this ignores every warning raised after this
# point, including deprecation warnings from the libraries imported above.
warnings.filterwarnings('ignore')

# Constants
# MAX_LEN: length every token sequence is padded/truncated to (pad_sequences
#   maxlen and the Embedding input_length).
# EMBEDDING_DIM: width of one word vector; must match the GloVe file that
#   create_embeddings() reads (glove.6B.300d.txt).
# VOCAB_SIZE: Tokenizer num_words and the number of rows in the embedding matrix.
MAX_LEN = 150
EMBEDDING_DIM = 300
VOCAB_SIZE = 20000

def load_data(file_path):
    """Load a CSV of utterances and return lower-cased texts with integer labels.

    Rows with a missing ``Utterance`` or ``Emotion`` are dropped, and so are
    rows whose emotion is not one of the seven known classes, so the two
    returned Series always have the same length and index.

    Args:
        file_path: Path to a CSV file with ``Utterance`` and ``Emotion`` columns.

    Returns:
        Tuple ``(texts, labels)`` of pandas Series: ``texts`` (named
        ``Utterance``) holds the lower-cased utterances and ``labels`` (named
        ``label``) the matching integer class ids.
    """
    df = pd.read_csv(file_path)

    # Clean data
    df = df.dropna(subset=['Utterance', 'Emotion'])
    texts = df['Utterance'].str.lower()

    # Emotion mapping
    emotion_map = {'neutral':0, 'joy':1, 'sadness':2, 'anger':3,
                   'surprise':4, 'disgust':5, 'fear':6}
    mapped = df['Emotion'].map(emotion_map)

    # Emotions outside the map come back as NaN. Filter texts and labels with
    # the same mask; dropping NaN from the label column alone would be undone
    # by index alignment and leave NaN labels next to their texts.
    known = mapped.notna()
    labels = mapped[known].astype(int).rename('label')

    return texts[known], labels

def create_embeddings(tokenizer):
    """Build the embedding matrix for the tokenizer's vocabulary from GloVe.

    The GloVe file is streamed line by line and only the vectors of words
    that will occupy a row of the matrix are converted to floats, so the
    full set of pre-trained vectors is never held in memory.

    Args:
        tokenizer: Fitted tokenizer exposing ``word_index`` (word -> integer
            index).

    Returns:
        ``numpy.ndarray`` of shape ``(VOCAB_SIZE, EMBEDDING_DIM)``. Row ``i``
        holds the GloVe vector of the word whose index is ``i``; rows of
        words missing from the GloVe file stay all-zero.

    Raises:
        FileNotFoundError: If ``glove.6B.300d.txt`` is not in the working
            directory.
    """
    # Only indices below VOCAB_SIZE have a row in the matrix.
    wanted_rows = {
        word: index
        for word, index in tokenizer.word_index.items()
        if index < VOCAB_SIZE
    }

    embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    with open('glove.6B.300d.txt', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            row = wanted_rows.get(values[0])
            if row is None:
                continue
            embedding_matrix[row] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

def build_cnn_bilstm(embedding_matrix):
    """Assemble and compile the CNN-BiLSTM emotion classifier.

    Args:
        embedding_matrix: Pre-trained word vectors used to initialise the
            frozen embedding layer.

    Returns:
        A compiled Keras ``Sequential`` model ending in a 7-way softmax.
    """
    # Pre-trained vectors are loaded as constants and kept frozen.
    frozen_embedding = Embedding(
        VOCAB_SIZE,
        EMBEDDING_DIM,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=MAX_LEN,
        trainable=False,
    )

    model = Sequential([
        frozen_embedding,
        Conv1D(128, 5, activation='relu'),
        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(LSTM(64)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(7, activation='softmax'),
    ])

    # Integer labels are used, hence the sparse variant of the loss.
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

def train_random_forest(X_train, X_test, y_train, y_test):
    """Fit a TF-IDF + Random Forest baseline and print its test metrics.

    Args:
        X_train: Training texts.
        X_test: Held-out texts.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Returns:
        Tuple ``(tfidf, rf)``: the fitted vectorizer and the fitted classifier.
    """
    vectorizer = TfidfVectorizer(max_features=5000)
    train_features = vectorizer.fit_transform(X_train)
    test_features = vectorizer.transform(X_test)

    forest = RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        n_jobs=-1,
    )
    forest.fit(train_features, y_train)
    predictions = forest.predict(test_features)

    print("Random Forest Performance:")
    print(classification_report(y_test, predictions))
    macro_f1 = f1_score(y_test, predictions, average='macro')
    print(f"Macro F1: {macro_f1:.4f}\n")
    return vectorizer, forest

def test_new_dataset(model, tokenizer, tfidf_vectorizer, rf_model, test_file_path):
    """Score both trained models on a separate labelled CSV file.

    Args:
        model: Trained CNN-BiLSTM model.
        tokenizer: Tokenizer fitted on the training texts.
        tfidf_vectorizer: Fitted TF-IDF vectorizer for the Random Forest.
        rf_model: Trained Random Forest classifier.
        test_file_path: CSV file to evaluate on, read through ``load_data``.
    """
    def _print_scores(y_true, y_hat):
        # Per-class report followed by the macro-averaged F1.
        print(classification_report(y_true, y_hat))
        print(f"Macro F1: {f1_score(y_true, y_hat, average='macro'):.4f}")

    test_texts, test_labels = load_data(test_file_path)

    # Neural model: same tokenizer and padding length as during training.
    padded = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=MAX_LEN)
    print("\nCNN-BiLSTM Performance on New Dataset:")
    probabilities = model.predict(padded)
    _print_scores(test_labels, np.argmax(probabilities, axis=1))

    # Baseline: reuse the already-fitted vectorizer.
    print("\nRandom Forest Performance on New Dataset:")
    features = tfidf_vectorizer.transform(test_texts)
    _print_scores(test_labels, rf_model.predict(features))

def main():
    """Train and evaluate both classifiers, then score them on the MELD test file.

    Loads ``filtered_emotions_trimmed1.csv``, holds out 20% as a test split,
    fits the tokenizer on the training texts only, trains the CNN-BiLSTM with
    balanced class weights, prints its test-split metrics, trains the TF-IDF +
    Random Forest baseline and finally evaluates both models on
    ``MELD Test Sent Emo.csv``.

    Returns:
        Tuple ``(model, tokenizer, tfidf, rf)``: the trained CNN-BiLSTM, the
        fitted tokenizer, the fitted TF-IDF vectorizer and the trained
        Random Forest.
    """
    # Load and prepare data
    texts, labels = load_data('filtered_emotions_trimmed1.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Tokenization and sequencing
    tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    tokenizer.fit_on_texts(X_train)

    train_seq = tokenizer.texts_to_sequences(X_train)
    test_seq = tokenizer.texts_to_sequences(X_test)

    X_train_pad = pad_sequences(train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(test_seq, maxlen=MAX_LEN)

    # Class weights for imbalance.
    # Key each weight by its actual class id rather than by its position in
    # the array, so a class that is absent from y_train cannot shift the
    # weights of the classes that come after it.
    present_classes = np.unique(y_train)
    balanced_weights = compute_class_weight('balanced',
                                            classes=present_classes,
                                            y=y_train)
    class_weights = dict(zip(present_classes.tolist(), balanced_weights.tolist()))

    # Build and train CNN-BiLSTM
    embedding_matrix = create_embeddings(tokenizer)
    model = build_cnn_bilstm(embedding_matrix)

    callbacks = [
        EarlyStopping(patience=3, restore_best_weights=True),
        ReduceLROnPlateau(factor=0.1, patience=2)
    ]

    print("Training CNN-BiLSTM Model:")
    model.fit(X_train_pad, y_train,
              epochs=15,
              batch_size=64,
              validation_split=0.1,
              class_weight=class_weights,
              callbacks=callbacks)

    # Evaluate deep learning model
    print("\nCNN-BiLSTM Performance:")
    y_probs = model.predict(X_test_pad)
    y_pred = np.argmax(y_probs, axis=1)
    print(classification_report(y_test, y_pred))
    print(f"Macro F1: {f1_score(y_test, y_pred, average='macro'):.4f}")

    # Train and evaluate Random Forest
    tfidf, rf = train_random_forest(X_train, X_test, y_train, y_test)

    # Test on new dataset
    test_new_dataset(model, tokenizer, tfidf, rf, 'MELD Test Sent Emo.csv')  # Replace with your test file path

    return model, tokenizer, tfidf, rf

if __name__ == "__main__":
    # Run the full pipeline and keep the returned model, tokenizer, TF-IDF
    # vectorizer and Random Forest bound to module-level names.
    cnn_model, tokenizer, tfidf_vectorizer, rf_model = main()


In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.initializers import Constant
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress warnings for cleaner output
# NOTE: no category is given, so this ignores every warning raised after this
# point, including deprecation warnings from the libraries imported above.
warnings.filterwarnings('ignore')

# Constants
# MAX_LEN: length every token sequence is padded/truncated to (pad_sequences
#   maxlen and the Embedding input_length).
# EMBEDDING_DIM: width of one word vector; must match the GloVe file that
#   create_embeddings() reads (glove.6B.300d.txt).
# VOCAB_SIZE: Tokenizer num_words and the number of rows in the embedding matrix.
MAX_LEN = 150
EMBEDDING_DIM = 300
VOCAB_SIZE = 20000

def load_data(file_path):
    """Read a CSV and return its lower-cased utterances with their emotions.

    Rows missing either the ``Utterance`` or the ``Emotion`` value are
    discarded; the emotion labels are returned as found in the file.

    Args:
        file_path: Path to a CSV file with ``Utterance`` and ``Emotion`` columns.

    Returns:
        Tuple ``(utterances, emotions)`` of pandas Series sharing one index.
    """
    frame = pd.read_csv(file_path).dropna(subset=['Utterance', 'Emotion'])
    frame = frame.assign(Utterance=frame['Utterance'].str.lower())
    return frame['Utterance'], frame['Emotion']

def create_embeddings(tokenizer):
    """Build the embedding matrix for the tokenizer's vocabulary from GloVe.

    The GloVe file is streamed line by line and only the vectors of words
    that will occupy a row of the matrix are converted to floats, so the
    full set of pre-trained vectors is never held in memory.

    Args:
        tokenizer: Fitted tokenizer exposing ``word_index`` (word -> integer
            index).

    Returns:
        ``numpy.ndarray`` of shape ``(VOCAB_SIZE, EMBEDDING_DIM)``. Row ``i``
        holds the GloVe vector of the word whose index is ``i``; rows of
        words missing from the GloVe file stay all-zero.

    Raises:
        FileNotFoundError: If ``glove.6B.300d.txt`` is not in the working
            directory.
    """
    # Only indices below VOCAB_SIZE have a row in the matrix.
    wanted_rows = {
        word: index
        for word, index in tokenizer.word_index.items()
        if index < VOCAB_SIZE
    }

    embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    with open('glove.6B.300d.txt', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            row = wanted_rows.get(values[0])
            if row is None:
                continue
            embedding_matrix[row] = np.asarray(values[1:], dtype='float32')
    return embedding_matrix

def build_cnn_bilstm(embedding_matrix):
    """Create the compiled CNN-BiLSTM classifier on top of frozen embeddings.

    Args:
        embedding_matrix: Pre-trained word vectors used to initialise the
            non-trainable embedding layer.

    Returns:
        A compiled Keras ``Sequential`` model with a 7-unit softmax output.
    """
    stack = (
        Embedding(VOCAB_SIZE, EMBEDDING_DIM,
                  embeddings_initializer=Constant(embedding_matrix),
                  input_length=MAX_LEN,
                  trainable=False),
        Conv1D(128, 5, activation='relu'),
        # The first recurrent layer keeps the full sequence for the second one.
        Bidirectional(LSTM(64, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(LSTM(64)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(7, activation='softmax'),
    )

    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

def evaluate_model(model, X, y, model_name=""):
    """Print classification report, macro F1 and confusion matrix for a model.

    Args:
        model: Object whose ``predict(X)`` returns per-class scores.
        X: Model inputs.
        y: True integer labels for ``X``.
        model_name: Prefix used in every printed heading.
    """
    class_scores = model.predict(X)
    predicted = np.argmax(class_scores, axis=1)

    print(f"\n{model_name} Classification Report:")
    report = classification_report(y, predicted)
    print(report)

    macro_f1 = f1_score(y, predicted, average='macro')
    print(f"{model_name} Macro F1: {macro_f1:.4f}")

    print(f"{model_name} Confusion Matrix:")
    matrix = confusion_matrix(y, predicted)
    print(matrix)

def test_new_dataset(model, tokenizer, label_encoder, tfidf_vectorizer=None, rf_model=None,
                     test_file_path='MELD Test Sent Emo.csv'):
    """Evaluate the trained models on a separate labelled CSV file.

    Args:
        model: Trained CNN-BiLSTM model.
        tokenizer: Tokenizer fitted on the training texts.
        label_encoder: Fitted ``LabelEncoder`` used to turn the file's emotion
            names into the integer ids the models were trained on.
        tfidf_vectorizer: Optional fitted TF-IDF vectorizer.
        rf_model: Optional trained Random Forest. The Random Forest report is
            printed only when both it and ``tfidf_vectorizer`` are given.
        test_file_path: CSV file to evaluate on; defaults to the MELD test set.
    """
    # Load and preprocess new data
    new_texts, new_emotions = load_data(test_file_path)
    new_labels = label_encoder.transform(new_emotions)

    # CNN-BiLSTM evaluation
    new_seq = tokenizer.texts_to_sequences(new_texts)
    new_pad = pad_sequences(new_seq, maxlen=MAX_LEN)
    evaluate_model(model, new_pad, new_labels, "CNN-BiLSTM - New Dataset")

    # Random Forest evaluation.
    # Compare against None explicitly: scikit-learn ensembles define __len__,
    # so a plain truthiness test would depend on the forest's fitted state
    # instead of on whether an estimator was passed at all.
    if rf_model is not None and tfidf_vectorizer is not None:
        new_tfidf = tfidf_vectorizer.transform(new_texts)
        y_pred_rf = rf_model.predict(new_tfidf)
        print("\nRandom Forest - New Dataset Classification Report:")
        print(classification_report(new_labels, y_pred_rf))
        print(f"Random Forest - New Dataset Macro F1: {f1_score(new_labels, y_pred_rf, average='macro'):.4f}")

def main():
    """Train the CNN-BiLSTM and Random Forest, then evaluate on the MELD test set.

    Loads ``filtered_emotions_trimmed1.csv``, label-encodes the emotions,
    holds out 20% as a test split, trains the CNN-BiLSTM with balanced class
    weights, trains the TF-IDF + Random Forest baseline, prints test-split
    metrics for both and finally evaluates on ``MELD Test Sent Emo.csv``.
    """
    # Load and prepare data
    texts, emotions = load_data('filtered_emotions_trimmed1.csv')
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(emotions)
    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

    # Tokenization and sequencing
    tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    tokenizer.fit_on_texts(X_train)
    train_seq = tokenizer.texts_to_sequences(X_train)
    test_seq = tokenizer.texts_to_sequences(X_test)
    X_train_pad = pad_sequences(train_seq, maxlen=MAX_LEN)
    X_test_pad = pad_sequences(test_seq, maxlen=MAX_LEN)

    # Class weights.
    # Key each weight by its actual class id rather than by its position in
    # the array, so a class that is absent from y_train cannot shift the
    # weights of the classes that come after it.
    present_classes = np.unique(y_train)
    balanced_weights = compute_class_weight('balanced', classes=present_classes, y=y_train)
    class_weights = dict(zip(present_classes.tolist(), balanced_weights.tolist()))

    # Build and train CNN-BiLSTM
    embedding_matrix = create_embeddings(tokenizer)
    model = build_cnn_bilstm(embedding_matrix)
    callbacks = [
        EarlyStopping(patience=3, restore_best_weights=True),
        ReduceLROnPlateau(factor=0.1, patience=2)
    ]
    model.fit(X_train_pad, y_train,
              epochs=15,
              batch_size=64,
              validation_split=0.1,
              class_weight=class_weights,
              callbacks=callbacks)

    # Evaluate CNN-BiLSTM on test split
    evaluate_model(model, X_test_pad, y_test, "CNN-BiLSTM - Test Split")

    # Train and evaluate Random Forest
    tfidf = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)
    rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', n_jobs=-1)
    rf.fit(X_train_tfidf, y_train)
    print("\nRandom Forest - Test Split Classification Report:")
    print(classification_report(y_test, rf.predict(X_test_tfidf)))

    # Evaluate on MELD test set
    # NOTE(review): test_new_dataset() below loads this same file by default
    # and evaluates the CNN-BiLSTM on it again, so these metrics are printed
    # twice under two different headings.
    meld_texts, meld_emotions = load_data('MELD Test Sent Emo.csv')
    meld_labels = label_encoder.transform(meld_emotions)
    meld_seq = tokenizer.texts_to_sequences(meld_texts)
    meld_pad = pad_sequences(meld_seq, maxlen=MAX_LEN)
    evaluate_model(model, meld_pad, meld_labels, "CNN-BiLSTM - MELD Test Set")

    # Test on new dataset
    test_new_dataset(model, tokenizer, label_encoder, tfidf, rf)

if __name__ == "__main__":
    # Run the pipeline only when this code is executed directly, not on import.
    main()


In [None]:
import pandas as pd
import numpy as np
import re
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Anything that is neither an ASCII letter nor ASCII whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]', flags=re.I | re.A)


def clean_text(text):
    """Strip every character that is not a letter or whitespace and lower-case.

    The flags are compiled into the pattern. Passing them as the fourth
    positional argument of ``re.sub`` would set ``count`` instead, which
    limits the number of removals and leaves the flags unapplied.

    Args:
        text: String to clean.

    Returns:
        The lower-cased text containing only ASCII letters and whitespace.
    """
    return _NON_LETTER_RE.sub('', text).lower()

def objective(trial):
    """Optuna objective: train one CNN+BiLSTM candidate and score it.

    Reads the module-level ``texts``, ``categorical_labels``,
    ``label_encoder`` and ``class_weight_dict``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Weighted F1 score of the candidate on the 20% validation split.
    """
    # Hyperparameter search space
    conv_filters = trial.suggest_categorical('conv_filters', [64, 128, 256])
    kernel_size = trial.suggest_int('kernel_size', 3, 5)
    lstm_units = trial.suggest_categorical('lstm_units', [64, 128, 256])
    dense_layers = trial.suggest_int('dense_layers', 1, 3)
    dropout_rate = trial.suggest_float('dropout_rate', 0.3, 0.6)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    max_words = trial.suggest_int('max_words', 8000, 12000)
    max_len = trial.suggest_int('max_len', 80, 120)

    # Sequence preprocessing
    trial_tokenizer = Tokenizer(num_words=max_words, oov_token="<UNK>")
    trial_tokenizer.fit_on_texts(texts)
    padded = pad_sequences(
        trial_tokenizer.texts_to_sequences(texts),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )

    # Train-val split
    X_train, X_val, y_train, y_val = train_test_split(
        padded,
        categorical_labels,
        test_size=0.2,
        random_state=42,
        stratify=categorical_labels,
    )

    # CNN+BiLSTM architecture
    inputs = Input(shape=(max_len,))
    hidden = Embedding(input_dim=max_words, output_dim=128)(inputs)
    hidden = Conv1D(filters=conv_filters, kernel_size=kernel_size, activation='relu')(hidden)
    hidden = Bidirectional(LSTM(lstm_units))(hidden)

    # Each dense block gets its own sampled width.
    for layer_idx in range(dense_layers):
        units = trial.suggest_categorical(f'dense_units_{layer_idx}', [64, 128, 256])
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    outputs = Dense(len(label_encoder.classes_), activation='softmax')(hidden)
    candidate = Model(inputs=inputs, outputs=outputs)

    # Compile and train
    candidate.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    candidate.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=[early_stop],
        class_weight=class_weight_dict,
        verbose=0,
    )

    # Validation metrics: the labels are one-hot, so argmax recovers class ids.
    predicted_ids = np.argmax(candidate.predict(X_val, verbose=0), axis=1)
    true_ids = np.argmax(y_val, axis=1)
    return f1_score(true_ids, predicted_ids, average='weighted')

# Main execution
# Tunes the CNN+BiLSTM with Optuna, retrains it with the best hyperparameters
# and reports its performance on the MELD test file. The names texts,
# categorical_labels, label_encoder and class_weight_dict are read by
# objective() as module-level globals, so they must keep these names.
if __name__ == "__main__":
    # Load and preprocess data
    df = pd.read_csv('filtered_emotions_trimmed.csv', low_memory=False)
    # clean_text() needs real strings and the encoder needs real labels, so
    # rows missing either value are dropped first.
    df = df.dropna(subset=['Utterance', 'Emotion'])
    df['Utterance'] = df['Utterance'].apply(clean_text)
    texts = df['Utterance'].values
    labels = df['Emotion'].values

    # Label encoding and class weights
    label_encoder = LabelEncoder()
    integer_labels = label_encoder.fit_transform(labels)
    categorical_labels = to_categorical(integer_labels)

    class_weights = compute_class_weight('balanced',
                                        classes=np.unique(integer_labels),
                                        y=integer_labels)
    class_weight_dict = dict(zip(range(len(class_weights)), class_weights))

    # Hyperparameter optimization
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20)
    best_params = study.best_params

    # Final preprocessing with best parameters
    tokenizer = Tokenizer(num_words=best_params['max_words'], oov_token="<UNK>")
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=best_params['max_len'], padding='post', truncating='post')

    # Final model training
    X_train, X_val, y_train, y_val = train_test_split(
        padded_sequences,
        categorical_labels,
        test_size=0.2,
        random_state=42,
        stratify=categorical_labels
    )

    # Build final model
    input_layer = Input(shape=(best_params['max_len'],))
    embedding = Embedding(input_dim=best_params['max_words'], output_dim=128)(input_layer)
    conv = Conv1D(filters=best_params['conv_filters'], kernel_size=best_params['kernel_size'], activation='relu')(embedding)
    bilstm = Bidirectional(LSTM(best_params['lstm_units']))(conv)

    x = bilstm
    for layer_idx in range(best_params['dense_layers']):
        x = Dense(best_params[f'dense_units_{layer_idx}'], activation='relu')(x)
        x = Dropout(best_params['dropout_rate'])(x)

    output_layer = Dense(len(label_encoder.classes_), activation='softmax')(x)
    final_model = Model(inputs=input_layer, outputs=output_layer)

    final_model.compile(
        optimizer=Adam(learning_rate=best_params['learning_rate']),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # restore_best_weights=True keeps the weights of the best val_loss epoch
    # instead of those reached after the 15 patience epochs that follow it;
    # this matches the setting used inside objective().
    final_model.fit(
        X_train,
        y_train,
        epochs=100,
        batch_size=best_params['batch_size'],
        validation_data=(X_val, y_val),
        callbacks=[EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)],
        class_weight=class_weight_dict
    )

    # Test evaluation
    test_df = pd.read_csv('MELD Test Sent Emo.csv')
    test_df = test_df.dropna(subset=['Utterance', 'Emotion'])
    test_df['Utterance'] = test_df['Utterance'].apply(clean_text)
    test_sequences = tokenizer.texts_to_sequences(test_df['Utterance'])
    test_padded = pad_sequences(test_sequences, maxlen=best_params['max_len'], padding='post', truncating='post')
    test_labels = label_encoder.transform(test_df['Emotion'])

    test_preds = final_model.predict(test_padded)
    test_pred_classes = np.argmax(test_preds, axis=1)

    # Pass the full list of class ids to every metric so that the rows stay
    # aligned with label_encoder.classes_ even when a class never occurs in
    # the test labels or in the predictions.
    all_class_ids = np.arange(len(label_encoder.classes_))

    print("\nTest Classification Report:")
    print(classification_report(test_labels, test_pred_classes,
                                labels=all_class_ids,
                                target_names=label_encoder.classes_))

    print("\nTest F1 Scores:")
    test_f1s = f1_score(test_labels, test_pred_classes, labels=all_class_ids, average=None)
    for label, score in zip(label_encoder.classes_, test_f1s):
        print(f"{label}: {score:.4f}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(test_labels, test_pred_classes, labels=all_class_ids))


[I 2025-05-04 05:14:41,293] A new study created in memory with name: no-name-e7cca019-0250-4816-a43d-57edcaf7fbb0
[I 2025-05-04 05:20:47,041] Trial 0 finished with value: 0.7765661739548247 and parameters: {'conv_filters': 64, 'kernel_size': 3, 'lstm_units': 128, 'dense_layers': 2, 'dropout_rate': 0.46566151330966354, 'learning_rate': 0.0001997430545776611, 'batch_size': 32, 'max_words': 9579, 'max_len': 95, 'dense_units_0': 256, 'dense_units_1': 128}. Best is trial 0 with value: 0.7765661739548247.
[I 2025-05-04 05:26:12,208] Trial 1 finished with value: 0.7740190534850018 and parameters: {'conv_filters': 64, 'kernel_size': 5, 'lstm_units': 256, 'dense_layers': 3, 'dropout_rate': 0.3600361346065597, 'learning_rate': 0.0006532040049129005, 'batch_size': 64, 'max_words': 8445, 'max_len': 111, 'dense_units_0': 256, 'dense_units_1': 256, 'dense_units_2': 128}. Best is trial 0 with value: 0.7765661739548247.
[I 2025-05-04 05:27:47,286] Trial 2 finished with value: 0.7838640024211949 and pa