In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Load the dataset
df = pd.read_csv('text_emotion.csv')

# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs, mentions, hashtags, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

# Apply preprocessing to the content column
df['content'] = df['content'].apply(preprocess_text)

# Stopword removal and lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply stopword removal and lemmatization to the content column
df['processed_content'] = df['content'].apply(remove_stopwords_and_lemmatize)

In [3]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['processed_content'])
sequences = tokenizer.texts_to_sequences(df['processed_content'])

# Padding
max_sequence_length = max(len(x) for x in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Encoding labels
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(df['sentiment'])
# Assuming sentiment is the emotion intensity, if not, replace 'sentiment' with the correct column
Y = to_categorical(encoded_Y)

# Splitting the dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [5]:
# RNN & LSTM Model Building
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(len(np.unique(encoded_Y)), activation='softmax'))  # Output layer

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Model summary
print(model.summary())

# Model Training
model.fit(X_train, Y_train, batch_size=32, epochs=10, validation_split=0.2)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 100)           3180500   
                                                                 
 bidirectional_2 (Bidirecti  (None, 25, 128)           84480     
 onal)                                                           
                                                                 
 bidirectional_3 (Bidirecti  (None, 64)                41216     
 onal)                                                           
                                                                 
 dense_1 (Dense)             (None, 13)                845       
                                                                 
Total params: 3307041 (12.62 MB)
Trainable params: 3307041 (12.62 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/1

<keras.src.callbacks.History at 0x23b4f548c70>

In [6]:
# Model Evaluation
# Predictions
Y_pred = model.predict(X_test)

# Converting predictions and labels to highest probable emotions
Y_pred_max = np.argmax(Y_pred, axis=1)
Y_test_max = np.argmax(Y_test, axis=1)

# Mean Squared Error
mse = mean_squared_error(Y_test_max, Y_pred_max)
print(f"Mean Squared Error: {mse}")

# Pearson Correlation Coefficient
pearson_corr, _ = pearsonr(Y_test_max, Y_pred_max)
print(f"Pearson Correlation Coefficient: {pearson_corr}")

# Concordance Correlation Coefficient
def concordance_correlation_coefficient(y_true, y_pred):
    correlation = np.corrcoef(y_true, y_pred)[0,1]
    mean_true = np.mean(y_true)
    mean_pred = np.mean(y_pred)
    var_true = np.var(y_true)
    var_pred = np.var(y_pred)
    sd_true = np.std(y_true)
    sd_pred = np.std(y_pred)
    numerator = 2 * correlation * sd_true * sd_pred
    denominator = var_true + var_pred + (mean_true - mean_pred)**2
    return numerator / denominator

ccc = concordance_correlation_coefficient(Y_test_max, Y_pred_max)
print(f"Concordance Correlation Coefficient: {ccc}")

Mean Squared Error: 12.388
Pearson Correlation Coefficient: 0.184747345297376
Concordance Correlation Coefficient: 0.1845152073680443


### Further tune hyperparameters and convulution techniques: Gated convolutions and attention augmented convolutions

## BiLSTM and Attention

In [7]:
from tensorflow.keras.layers import Input, Concatenate, Permute, Reshape, Multiply, Lambda, Dropout
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

# Attention Layer
def attention_layer(inputs, neurons):
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, neurons))(a)
    a = Dense(neurons, activation='softmax')(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention = Multiply()([inputs, a_probs])
    return output_attention

# Model Building with BiLSTM and Attention
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length)(input_layer)
bilstm = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
attention = attention_layer(bilstm, neurons=max_sequence_length)
bilstm_with_attention = Bidirectional(LSTM(32, return_sequences=False))(attention)
dropout = Dropout(0.5)(bilstm_with_attention)
output_layer = Dense(len(np.unique(encoded_Y)), activation='softmax')(dropout)

model_with_attention = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model_with_attention.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Model summary
print(model_with_attention.summary())

# Model Training
model_with_attention.fit(X_train, Y_train, batch_size=32, epochs=10, validation_split=0.2)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 25)]                 0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 25, 100)              3180500   ['input_1[0][0]']             
                                                                                                  
 bidirectional_4 (Bidirecti  (None, 25, 128)              84480     ['embedding_2[0][0]']         
 onal)                                                                                            
                                                                                                  
 permute (Permute)           (None, 128, 25)              0         ['bidirectional_4[0][0]'] 

<keras.src.callbacks.History at 0x23b5965fa60>

In [8]:
# Model Evaluation
# Predictions
Y_pred = model.predict(X_test)

# Converting predictions and labels to highest probable emotions
Y_pred_max = np.argmax(Y_pred, axis=1)
Y_test_max = np.argmax(Y_test, axis=1)

# Mean Squared Error
mse = mean_squared_error(Y_test_max, Y_pred_max)
print(f"Mean Squared Error: {mse}")

# Pearson Correlation Coefficient
pearson_corr, _ = pearsonr(Y_test_max, Y_pred_max)
print(f"Pearson Correlation Coefficient: {pearson_corr}")

# Concordance Correlation Coefficient
def concordance_correlation_coefficient(y_true, y_pred):
    correlation = np.corrcoef(y_true, y_pred)[0,1]
    mean_true = np.mean(y_true)
    mean_pred = np.mean(y_pred)
    var_true = np.var(y_true)
    var_pred = np.var(y_pred)
    sd_true = np.std(y_true)
    sd_pred = np.std(y_pred)
    numerator = 2 * correlation * sd_true * sd_pred
    denominator = var_true + var_pred + (mean_true - mean_pred)**2
    return numerator / denominator

ccc = concordance_correlation_coefficient(Y_test_max, Y_pred_max)
print(f"Concordance Correlation Coefficient: {ccc}")

Mean Squared Error: 12.388
Pearson Correlation Coefficient: 0.184747345297376
Concordance Correlation Coefficient: 0.1845152073680443
