In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the dataset; later cells read its 'content' (text) and 'sentiment' (label) columns
DATASET_PATH = 'text_emotion.csv'
df = pd.read_csv(DATASET_PATH)

# Preprocessing function
def preprocess_text(text):
    """Normalise one raw text entry before tokenisation.

    Steps, in order: lowercase; drop URL-like tokens (starting with ``http``
    or ``www``); drop ``@mentions`` and the ``#`` symbol (the hashtag word
    itself is kept); drop every character that is neither a word character
    nor whitespace; drop digit runs. Whitespace is not collapsed, so removed
    tokens can leave consecutive spaces behind.

    Args:
        text: The raw entry. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The cleaned text.
    """
    # Missing cells arrive as NaN (a float); calling .lower() on them would raise.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove URLs, mentions, hashtags, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

# Clean the raw text; the 'content' column is overwritten with the cleaned version
cleaned_content = df['content'].apply(preprocess_text)
df['content'] = cleaned_content

# Shared resources for stopword removal and lemmatization
# NOTE(review): preprocess_text has already stripped apostrophes from the text,
# so any stop-word entry that contains one (e.g. a contraction) can never match
# a token here -- confirm against the NLTK English list.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def remove_stopwords_and_lemmatize(text):
    """Drop stop words from ``text`` and lemmatize what remains.

    The text is split on whitespace; tokens found in the module-level
    ``stop_words`` set are discarded and every other token is passed through
    the module-level ``lemmatizer``. The surviving tokens are re-joined with
    single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Derive the model-ready text column: stop words removed, remaining tokens lemmatized
processed_series = df['content'].apply(remove_stopwords_and_lemmatize)
df['processed_content'] = processed_series

In [3]:
# Tokenizing and padding the sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap for the tokenizer and the fixed length of every model input
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 250

# Build the word index from the processed text, then encode that same text
processed_texts = df['processed_content']
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(processed_texts)
sequences = tokenizer.texts_to_sequences(processed_texts)

# Bring every encoded text to MAX_SEQUENCE_LENGTH so they stack into one 2-D array
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print("Shape of data tensor:", data.shape)


Shape of data tensor: (40000, 250)


In [5]:
# CNN Model building
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Model parameters
EMBEDDING_DIM = 100  # Dimension of the embedding layer

# Building the CNN model: embedding -> 1D convolution -> global max pool -> one linear unit
# NOTE(review): this regression-style model is rebound by the later residual
# model cell before any training happens, so it is only ever summarised.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='linear'))  # Assuming a single output for regression

model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=['mean_squared_error'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 100)          1000000   
                                                                 
 conv1d (Conv1D)             (None, 246, 128)          64128     
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1064257 (4.06 MB)
Trainable params: 1064257 (4.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [11]:
# Adding a residual connection to the CNN
from tensorflow.keras.layers import Input, add
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Convert the emotion labels to numeric form using one-hot encoding
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['sentiment'])
y = to_categorical(integer_encoded)

# Update the number of output neurons to match the number of emotion categories
num_emotions = y.shape[1]

# Redefine the model with softmax output layer
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input_layer)
# No padding here, so this convolution shortens the sequence axis.
conv_layer = Conv1D(128, 5, activation='relu')(embedding_layer)

# Adding residual connection: both convolutions use padding='same' and 128
# filters so their output keeps conv_layer's shape and the two can be summed.
residual = Conv1D(128, 5, activation='relu', padding='same')(conv_layer)
residual = Conv1D(128, 5, activation='relu', padding='same')(residual)
residual = add([conv_layer, residual])

pooling_layer = GlobalMaxPooling1D()(residual)
output_layer = Dense(num_emotions, activation='softmax')(pooling_layer)

model = Model(inputs=input_layer, outputs=output_layer)

# One-hot targets + softmax output -> categorical cross-entropy
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 250)]                0         []                            
                                                                                                  
 embedding_4 (Embedding)     (None, 250, 100)             1000000   ['input_4[0][0]']             
                                                                                                  
 conv1d_12 (Conv1D)          (None, 246, 128)             64128     ['embedding_4[0][0]']         
                                                                                                  
 conv1d_13 (Conv1D)          (None, 246, 128)             82048     ['conv1d_12[0][0]']           
                                                                                            

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion; 10% of it is set aside by Keras for validation
fit_options = dict(batch_size=128, epochs=10, validation_split=0.1, verbose=1)
model.fit(X_train, y_train, **fit_options)

# Class-probability rows for the held-out samples
y_pred = model.predict(X_test)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.26025


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 13 and the array at index 1 has size 104000

In [15]:
# Evaluate the model
# Collapse one-hot targets and predicted probability rows to class indices once
y_test_labels = np.argmax(y_test, axis=1)
y_pred_labels = np.argmax(y_pred, axis=1)

# Fraction of held-out samples whose predicted class matches the true class
accuracy = np.mean(y_test_labels == y_pred_labels)
print(f"Accuracy: {accuracy}")

# MSE is taken between the one-hot targets and the predicted probability rows
mse = mean_squared_error(y_test, y_pred)

# Pearson correlation between the true and predicted class indices
# NOTE(review): the indices are LabelEncoder codes for nominal categories, so
# this value depends on the encoding order -- interpret with care.
correlation_matrix = np.corrcoef(y_test_labels, y_pred_labels)
pearson_r = correlation_matrix[0, 1]

# Print the evaluation metrics
print(f"Mean Squared Error: {mse}")
print(f"Pearson Correlation Coefficient: {pearson_r}")

# Concordance Correlation Coefficient requires a custom implementation
def concordance_correlation_coefficient(y_true, y_pred):
    """Concordance correlation coefficient between two label vectors.

    Args:
        y_true: Reference values. An array with two or more dimensions (for
            example one-hot rows) is reduced to class indices with ``argmax``
            over axis 1; a 1-D array of values is used as-is.
        y_pred: Predicted values, in either of the forms accepted for
            ``y_true`` (for example rows of class probabilities).

    Returns:
        ``2 * cov / (var_true + var_pred + (mean_true - mean_pred) ** 2)``
        computed with population statistics. NaN is returned when the
        denominator is zero (both vectors constant and equal), where the
        coefficient is undefined.

    Raises:
        ValueError: If the two reduced vectors do not have the same shape.
    """
    def _to_label_vector(values):
        # One-hot / probability rows -> class index per row; label vectors pass through.
        arr = np.asarray(values)
        if arr.ndim >= 2:
            return np.argmax(arr, axis=1)
        return arr.reshape(-1)

    y_true_flat = _to_label_vector(y_true)
    y_pred_flat = _to_label_vector(y_pred)
    if y_true_flat.shape != y_pred_flat.shape:
        raise ValueError(
            f"shape mismatch: y_true gives {y_true_flat.shape}, y_pred gives {y_pred_flat.shape}"
        )

    mean_true = np.mean(y_true_flat)
    mean_pred = np.mean(y_pred_flat)
    var_true = np.var(y_true_flat)
    var_pred = np.var(y_pred_flat)
    covariance = np.mean((y_true_flat - mean_true) * (y_pred_flat - mean_pred))

    denominator = var_true + var_pred + (mean_true - mean_pred) ** 2
    if denominator == 0:
        # Avoid a 0/0 runtime warning; the result is NaN either way.
        return np.float64(np.nan)
    ccc = (2 * covariance) / denominator
    return ccc

# Agreement between the true and predicted class indices on the held-out split
ccc = concordance_correlation_coefficient(y_test, y_pred)
print(f"Concordance Correlation Coefficient: {ccc}")


Accuracy: 0.26025
Mean Squared Error: 0.09468409419059753
Pearson Correlation Coefficient: 0.17752963312942446
Concordance Correlation Coefficient: 0.1772527530205531


### Further tune hyperparameters and implement a multi-output CNN with convolution techniques: gated convolutions and attention-augmented convolutions