In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import re
import os
import urllib.request
import zipfile
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


# Download and extract GloVe embeddings
glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
glove_zip_file = 'glove.6B.zip'

if not os.path.exists(glove_zip_file):
    print("Downloading GloVe embeddings...")
    urllib.request.urlretrieve(glove_url, glove_zip_file)

glove_embedding_file = 'glove.6B.300d.txt'

if not os.path.exists(glove_embedding_file):
    print("Extracting GloVe embeddings...")
    with zipfile.ZipFile(glove_zip_file, 'r') as z:
        z.extractall()

print("GloVe embeddings are ready!")

Downloading GloVe embeddings...
Extracting GloVe embeddings...
GloVe embeddings are ready!


In [35]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Embedding, TimeDistributed, Lambda, Conv1D, GlobalMaxPooling1D, Bidirectional, SpatialDropout1D, Flatten, GRU, MaxPooling1D, Concatenate, Attention, GlobalAveragePooling1D, MultiHeadAttention, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [2]:
pip install --upgrade tensorflow


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
  import nltk
  nltk.download('punkt')
  nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
# Step 2: Load and preprocess the data

def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove mentions and hashtags
    text = re.sub(r'\@\w+|\#','', text)
    
    # Remove non-ASCII characters and convert to lowercase
    text = text.encode("ascii", errors="ignore").decode().lower()

    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords and stem the remaining words
    stemmer = SnowballStemmer("english")
    stop_words = set(stopwords.words("english"))
    tokens = [stemmer.stem(token) for token in tokens if token.isalpha() and token not in stop_words]
    
    return " ".join(tokens)

def load_and_preprocess_data(filepath):
    data = pd.read_csv(filepath)
    data['Text'] = data['Text'].apply(clean_text)
    return data


data = load_and_preprocess_data('/content/drive/MyDrive/new_data.csv')



In [50]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Embedding, TimeDistributed, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential


In [27]:
# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Text'], data['Label'], test_size=0.2, random_state=42)

# Define num_classes and encode labels
num_classes = len(np.unique(y_train))

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [28]:
# Step 4: Tokenize and pad sequences
max_features = 10000
max_length = 100

# Custom filter to remove special characters
custom_filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

tokenizer = Tokenizer(
    num_words=max_features,
    filters=custom_filters,
    lower=True,
    split=" ",
    char_level=False, # Set to True for character-level tokenization
    oov_token="<OOV>", # Out-of-vocabulary token for words not in the training data
)

tokenizer.fit_on_texts(X_train)

# Save tokenizer for future use (e.g., when preprocessing new data)
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding="post", truncating="post")

In [53]:
# Step 5: Load GloVe pre-trained word embeddings
def load_glove_embeddings(embedding_file, tokenizer):
    embeddings_index = {}
    with open(embedding_file, encoding='utf8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

embedding_matrix = load_glove_embeddings('glove.6B.300d.txt', tokenizer)


In [62]:

# Step 6: Build and train different deep learning models
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Bidirectional, SpatialDropout1D, TimeDistributed, Flatten, GRU, MaxPooling1D
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, LSTM, Conv1D, GlobalMaxPooling1D, Bidirectional, SpatialDropout1D, TimeDistributed, Flatten, MaxPooling1D, Concatenate, Attention, GlobalAveragePooling1D
from tensorflow.keras.models import Model

from tensorflow.keras.layers import MultiHeadAttention

# Transformer model
def create_transformer_model(embedding_matrix):
    input_layer = Input(shape=(max_length,))
    embedding_layer = Embedding(len(embedding_matrix), 300, weights=[embedding_matrix], input_length=max_length, trainable=False)(input_layer)
    transformer_block = MultiHeadAttention(num_heads=8, key_dim=300)(embedding_layer, embedding_layer)
    x = transformer_block
    for _ in range(2):
        x = MultiHeadAttention(num_heads=8, key_dim=300)(x, x)
    
    pooling_layer = GlobalAveragePooling1D()(x)
    dense_layer = Dense(64, activation='relu')(pooling_layer)
    output_layer = Dense(num_classes, activation='softmax')(dense_layer)
    return Model(inputs=input_layer, outputs=output_layer)
  

# CRNN model
def create_crnn_model(embedding_matrix):
    input_layer = Input(shape=(max_length,))
    embedding_layer = Embedding(len(embedding_matrix), 300, weights=[embedding_matrix], input_length=max_length, trainable=False)(input_layer)
    conv1d_layer = Conv1D(256, 5, activation='relu')(embedding_layer)
    max_pooling_layer = MaxPooling1D(pool_size=2)(conv1d_layer)
    gru_layer1 = GRU(256, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)(max_pooling_layer)
    gru_layer2 = GRU(128, dropout=0.3, recurrent_dropout=0.3)(gru_layer1)
    dense_layer1 = Dense(64, activation='relu')(gru_layer2)
    output_layer = Dense(num_classes, activation='softmax')(dense_layer1)
    return Model(inputs=input_layer, outputs=output_layer)

# Hierarchical Attention Network (HAN) model
def create_han_model(embedding_matrix):
    input_layer = Input(shape=(max_length,))
    embedding_layer = Embedding(len(embedding_matrix), embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False)(input_layer)
    x = Bidirectional(GRU(64, return_sequences=True))(embedding_layer)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    concatenated = Concatenate()([avg_pool, max_pool])
    output_layer = Dense(num_classes, activation='softmax')(concatenated)

    return Model(inputs=input_layer, outputs=output_layer)
# TextCNN model
def create_textcnn_model(embedding_matrix):
    input_layer = Input(shape=(max_length,))
    embedding_layer = Embedding(len(embedding_matrix), 300, weights=[embedding_matrix], input_length=max_length, trainable=False)(input_layer)
    conv_blocks = []
    filter_sizes = [2, 3, 4]

    for filter_size in filter_sizes:
        conv = Conv1D(filters=256, kernel_size=filter_size, padding='same', activation='relu', strides=1)(embedding_layer)
        conv = GlobalMaxPooling1D()(conv)
        conv_blocks.append(conv)
    concat = Concatenate()(conv_blocks)
    dropout = Dropout(0.3)(concat)
    dense_layer = Dense(64, activation='relu')(dropout)
    output_layer = Dense(num_classes, activation='softmax')(dense_layer)

    return Model(inputs=input_layer, outputs=output_layer)


# LSTM Model
def create_lstm_model(embedding_matrix):
    model = Sequential()
    model.add(Embedding(len(embedding_matrix), 300, weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(SpatialDropout1D(0.3))
    model.add(LSTM(256, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
    model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    return model

# 1D CNN Model
def create_cnn_model(embedding_matrix):
    model = Sequential()
    model.add(Embedding(len(embedding_matrix), 300, weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(Conv1D(256, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    return model

# Bidirectional LSTM Model
def create_bidirectional_lstm_model(embedding_matrix):
    model = Sequential()
    model.add(Embedding(len(embedding_matrix), 300, weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(256, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)))
    model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    return model

# CNN-LSTM Model
def create_cnn_lstm_model(embedding_matrix):
    model = Sequential()
    model.add(Embedding(len(embedding_matrix), 300, weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(Conv1D(256, 5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(256, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
    model.add(TimeDistributed(Dense(128, activation='relu')))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    return model

# GRU Model
def create_gru_model(embedding_matrix):
    model = Sequential()
    model.add(Embedding(len(embedding_matrix), 300, weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(SpatialDropout1D(0.3))
    model.add(GRU(256, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
    model.add(GRU(128, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    return model

    


models = {
    'Transformer': create_transformer_model,
    'CRNN': create_crnn_model,
    'Hierarchical Attention Network': create_han_model,
    'TextCNN': create_textcnn_model,
    'LSTM': create_lstm_model,
    'CNN': create_cnn_model,
    'Bidirectional LSTM': create_bidirectional_lstm_model,
    'CNN-LSTM': create_cnn_lstm_model,
    'GRU': create_gru_model,

}


In [66]:
from tensorflow.keras.callbacks import EarlyStopping

def train_and_evaluate(model, model_name):
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(f"Training {model_name} model...")
    model.fit(X_train_pad, y_train, validation_split=0.2, epochs=6, batch_size=32, verbose=1)
    _, test_acc = model.evaluate(X_test_pad, y_test, verbose=0)
    print(f"{model_name} model accuracy: {test_acc * 100:.2f}%")
    return test_acc

model_selection_results = []

for model_name, model_builder in models.items():
    model = model_builder(embedding_matrix)
    test_acc = train_and_evaluate(model, model_name)
    model_selection_results.append((model_name, test_acc))

best_model_name, _ = max(model_selection_results, key=lambda x: x[1])
print(f"Training and evaluating the best model: {best_model_name}...")
best_model_builder = models[best_model_name]
best_model = best_model_builder(embedding_matrix)

# Train the best model for more epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
best_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
best_model.fit(X_train_pad, y_train, validation_split=0.2, epochs=100, batch_size=32, verbose=1, callbacks=[early_stopping])

# Evaluate the best model
_, test_acc = best_model.evaluate(X_test_pad, y_test, verbose=0)
print(f"Best model ({best_model_name}) final accuracy: {test_acc * 100:.2f}%")

# Save the best model
best_model.save(f"{best_model_name}_model.h5")

# Load the saved model to make new predictions
from tensorflow.keras.models import load_model

loaded_model = load_model(f"{best_model_name}_model.h5")
predictions = loaded_model.predict(X_test_pad)

# Generate classification report on the best model
from sklearn.metrics import classification_report, confusion_matrix

predicted_labels = np.argmax(predictions, axis=1)
print("Confusion Matrix:")
print(confusion_matrix(y_test, predicted_labels))
print("\nClassification Report:")
print(classification_report(y_test, predicted_labels, target_names=label_encoder.classes_))

Training Transformer model...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6




Transformer model accuracy: 61.81%
Training CRNN model...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
CRNN model accuracy: 62.21%
Training Hierarchical Attention Network model...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Hierarchical Attention Network model accuracy: 65.98%
Training TextCNN model...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6




TextCNN model accuracy: 62.94%




Training LSTM model...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
LSTM model accuracy: 62.21%
Training CNN model...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6




CNN model accuracy: 62.58%




Training Bidirectional LSTM model...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6




Bidirectional LSTM model accuracy: 65.69%
Training CNN-LSTM model...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6




CNN-LSTM model accuracy: 61.52%
Training GRU model...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
GRU model accuracy: 62.21%
Training and evaluating the best model: Hierarchical Attention Network...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Best model (Hierarchical Attention Network) final accuracy: 65.61%
Confusion Matrix:
[[1423   31   37   16   24    5]
 [  65   26   19    2   13    1]
 [ 178   21   33    3   25    1]
 [  70   23    9   37   12    2]
 [ 133   37   14    9   86    3]
 [  70    7    4    4   11   15]]

Classification Report:
              precision    recall  f1-score   support

     alarmed       0.73      0.93      0.82      1536
    cautious       0.18      0.21      0.19       126
   concerned       0.28      0.13      0.18       261
  disengaged       0.52      0.24      0.33       153
  dismissive       0.50      0.30      0.38       282
    doubtful       0.56      0.14      0.22       111

    accuracy 