# Emotion Label Detection on the GoEmotions Simplified Dataset

In [None]:
pip install datasets #install datasets for loading GoEmotion Dataset

# Loading the data

In [None]:
#Importing necessary Libraries
import pandas as pd
import torchvision
from datasets import load_dataset
import random
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Show every column whenever a frame is displayed.
pd.set_option("display.max_columns", None)

# Load GoEmotions and convert each split to pandas. `Dataset.to_pandas()` is used
# because building the frame with `pd.DataFrame(split)` took longer to execute.
go_emotion_simplified = load_dataset('go_emotions')
go_emotion_simplified_train, go_emotion_simplified_validation, go_emotion_simplified_test = (
    go_emotion_simplified[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

# Stack the three splits into one frame with a fresh 0..n-1 index.
go_emotion_simplified_all = pd.concat(
    [
        go_emotion_simplified_train,
        go_emotion_simplified_validation,
        go_emotion_simplified_test,
    ],
    ignore_index=True,
)

# Number of labels assigned to each text.
go_emotion_simplified_all['labels_count'] = go_emotion_simplified_all['labels'].map(len)
# Get the labels name

# Label ids and their emotion names, one "<id>: <name>" entry per line.
text_labels = """ 0: admiration
        1: amusement
        2: anger
        3: annoyance
        4: approval
        5: caring
        6: confusion
        7: curiosity
        8: desire
        9: disappointment
        10: disapproval
        11: disgust
        12: embarrassment
        13: excitement
        14: fear
        15: gratitude
        16: grief
        17: joy
        18: love
        19: nervousness
        20: optimism
        21: pride
        22: realization
        23: relief
        24: remorse
        25: sadness 
        26: surprise
        27: neutral"""

# Parse every line into {integer id: emotion name}; surrounding whitespace is stripped.
labels_dict = {}
for label_line in text_labels.split("\n"):
    label_fields = label_line.split(':')
    labels_dict[int(label_fields[0].strip())] = label_fields[-1].strip()

# Add one 0/1 indicator column per emotion name. Each entry of the 'labels' column is a
# NumPy array of label ids, so membership is tested element by element.
for class_id, class_label in labels_dict.items():
    go_emotion_simplified_all[class_label] = go_emotion_simplified_all['labels'].map(
        lambda row_labels, wanted=class_id: int(wanted in row_labels)
    )

go_emotion_simplified_all.head(5)



# Merging 27 emotions into 13 emotions (plus neutral)

In [None]:
# Merge the fine-grained GoEmotions columns into coarser categories.
# Each key is a merged category; its value lists the original indicator columns that are
# summed into it. 'approval' and 'neutral' are carried over unchanged.
# NOTE(review): parts are summed, so a text tagged with two emotions of the same group
# scores 2 (or 3) in that group rather than 1 -- clip to 1 if a strict 0/1 indicator
# is wanted; the later `idxmax` labelling currently relies on these sums.
MERGED_EMOTIONS = {
    'happiness': ['joy', 'amusement'],
    'sadness': ['grief', 'sadness'],
    'fear': ['fear', 'confusion', 'nervousness'],
    'surprise': ['curiosity', 'surprise'],
    'anger': ['anger', 'annoyance'],
    'disgust': ['disappointment', 'disgust', 'disapproval'],
    'anticipation': ['excitement', 'optimism'],
    'realization': ['realization', 'pride'],
    'desire': ['admiration', 'desire'],
    'shame': ['embarrassment', 'remorse'],
    'relief': ['relief', 'gratitude'],
    'love': ['love', 'caring'],
}

# Column order of the resulting frame (after 'text').
EMOTION_COLUMNS = ['happiness', 'sadness', 'fear', 'surprise', 'anger', 'disgust', 'approval',
                   'anticipation', 'realization', 'desire', 'shame', 'relief', 'love', 'neutral']

# This cell consumes the original columns, so running it twice cannot work; fail with an
# explicit message instead of an opaque KeyError.
_missing_sources = (
    {part for parts in MERGED_EMOTIONS.values() for part in parts}
    - set(go_emotion_simplified_all.columns)
)
if _missing_sources:
    raise RuntimeError(
        "Emotion columns already merged (missing: "
        f"{sorted(_missing_sources)}); re-run the data loading cell first."
    )

# Every sum is computed from the unmerged frame before anything is overwritten.
_merged_columns = {
    merged_name: go_emotion_simplified_all[parts].sum(axis=1)
    for merged_name, parts in MERGED_EMOTIONS.items()
}

# keep desired columns and drop original columns; `.copy()` makes the result an
# independent frame so later column assignments do not act on a slice.
go_emotion_simplified_all = (
    go_emotion_simplified_all.assign(**_merged_columns)[['text'] + EMOTION_COLUMNS].copy()
)
Emotion_labels = go_emotion_simplified_all[EMOTION_COLUMNS]
text = go_emotion_simplified_all[['text']]

# view the resulting dataframe
print(go_emotion_simplified_all.head(20))
print(Emotion_labels.head())


### **Data Analysis And Visualisation**


## **Bar Chart** 

In [None]:
import matplotlib.pyplot as plt



# Get the sum of each emotion column. The columns are selected by name (taken from
# `Emotion_labels`) rather than "everything after the first column", so the counts stay
# correct when this cell is re-run after extra columns have been added to the frame.
emotion_counts = go_emotion_simplified_all[Emotion_labels.columns].sum()

# Create a bar chart
plt.figure(figsize=(10, 6))
plt.bar(emotion_counts.index, emotion_counts.values)

# Set the title and labels for the plot
plt.title('Frequency of Emotion Categories')
plt.xlabel('Emotion Category')
plt.ylabel('Frequency')

# Rotate the x-axis labels to avoid overlap
plt.xticks(rotation=45)

# Show the plot
plt.show()

# Scatter Plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

 # Define colors for each emotion
# Emotion columns to plot.
emotions = ['happiness', 'sadness', 'fear', 'anger', 'surprise', 'disgust', 'approval', 'anticipation', 'realization', 'desire', 'shame', 'relief', 'love','neutral']

# One distinct colour per emotion, taken from the 20-colour 'tab20' palette. The previous
# hand-written list repeated 'purple' and contained both 'gray' and 'grey', so several
# emotions were drawn in the same colour.
_palette = plt.get_cmap('tab20')
colors = [_palette(position) for position in range(len(emotions))]

# Create a scatter plot of the emotions: sample index on x, merged emotion score on y.
fig, ax = plt.subplots(figsize=(10, 6))
for i, emotion in enumerate(emotions):
    ax.scatter(go_emotion_simplified_all.index, go_emotion_simplified_all[emotion], label=emotion, color=colors[i])

ax.legend()
ax.set_xlabel('Sample')
ax.set_ylabel('Emotion score')
ax.set_title('Emotion Scores in GoEmotions Dataset')
plt.show()

# Pie Chart for Emotions



In [None]:
# Pie chart of the overall emotion frequencies (shares of `emotion_counts`).
pie_fig, pie_ax = plt.subplots(figsize=(10, 6))
pie_ax.pie(emotion_counts.values, labels=emotion_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Frequency of Emotions in GoEmotions Dataset')
plt.show()

### **Word Cloud**


In [None]:
!pip install wordcloud matplotlib pandas
text_data = go_emotion_simplified_all['text'].values
text = ' '.join(text_data)
from wordcloud import WordCloud

# Generate a word cloud image
wordcloud = WordCloud().generate(text)

# Display the generated image:
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
print(text_data.shape)

In [None]:
# Heatmap of the pairwise correlation between the merged emotion columns.
corr = go_emotion_simplified_all[emotions].corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation between Emotions in GoEmotions Dataset')
plt.show()

In [None]:
# Length of each text, measured in whitespace-separated words.
go_emotion_simplified_all['text_len'] = go_emotion_simplified_all['text'].str.split().str.len()
print(go_emotion_simplified_all['text_len'])

In [None]:
# Split the text into individual words and count their frequency.
# `explode()` flattens the per-row token lists directly; the previous
# `split(expand=True).stack()` first built a frame as wide as the longest text, padded
# with missing values, only to discard that padding again.
word_counts = (
    go_emotion_simplified_all['text']
    .str.split()
    .explode()
    .value_counts()
)

print(word_counts)

In [None]:
# Plot the word frequency distribution as a bar chart
plt.figure(figsize=(10, 6))

# Set the number of words to display in the chart
top_n = 50

word_counts.head(top_n).plot(kind='bar')
plt.title('Top {} Most Frequent Words'.format(top_n))
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()

# Histogram of the counts themselves: the x-axis is how often a word occurs and the
# (log-scaled) y-axis is how many distinct words occur that often. The axis labels were
# previously copied from the bar chart and described the wrong quantities.
plt.hist(word_counts.values, bins=100, log=True)
plt.title('Word Frequency Distribution')
plt.xlabel('Occurrences per word')
plt.ylabel('Number of distinct words (log scale)')
plt.xticks(rotation=45)
plt.show()

**Data Preprocessing**

In [None]:
%pip install numpy sklearn nltk #installing necessary libraries

In [None]:
# `text` is the whole corpus joined into one string by the word-cloud cell.
tokens = text.split()  # splitting based on spaces
vocab = sorted(set(tokens))  # sorting and removing duplicates by using set()
vocab[:50]  # preview only: displaying the full vocabulary floods the notebook output

In [None]:
# Corpus size in tokens and number of distinct tokens.
tokens_len, vocab_len = len(tokens), len(vocab)

print(f"Tokens: {tokens_len}", f"Vocab: {vocab_len}", sep="\n")

In [None]:
import re

def remove_special_chars(text):
    """Replace punctuation/special characters with spaces and normalise whitespace.

    Every character that is neither a word character nor whitespace becomes a space,
    runs of whitespace are collapsed to a single space, and leading/trailing whitespace
    is removed. Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    without_special_chars = re.sub(r'[^\w\s]', ' ', text)
    return re.sub(r'\s+', ' ', without_special_chars).strip()


# Clean every text in place, then show the first rows of the updated frame.
go_emotion_simplified_all['text'] = go_emotion_simplified_all['text'].map(remove_special_chars)
print(go_emotion_simplified_all.head())


In [None]:
# Count the frequency of each word after removing stopwords
import nltk
from nltk.corpus import stopwords
# Fetch NLTK's stop-word lists and keep the English one as a set for fast lookups.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Lower-case and split every text, flatten to one token per row, discard stop words,
# then count how often each remaining word occurs.
_lowercase_tokens = go_emotion_simplified_all['text'].str.lower().str.split().explode()
word_counts = _lowercase_tokens[~_lowercase_tokens.isin(stop_words)].value_counts()

# Number of words shown in the chart.
top_n = 50

# Bar chart of the top N most frequent words.
plt.figure(figsize=(10, 6))
word_counts.head(top_n).plot(kind='bar')
plt.title('Top {} Most Frequent Words (Excluding Stopwords)'.format(top_n))
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()


# For comparison, show scikit-learn's built-in English stop-word list and its size.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

print(f"number of stopwords: {len(sklearn_stop_words)}")
print(sklearn_stop_words)

In [None]:
#Pos wordnet
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import functools

# Tokenizer models and the WordNet corpus needed by word_tokenize / WordNetLemmatizer.
# NOTE(review): recent NLTK releases look for 'punkt_tab' rather than 'punkt'; both are
# requested so either version finds its data -- confirm against the installed NLTK.
for _nltk_resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_nltk_resource)

# First letter of an NLTK POS tag -> the POS constant WordNet's lemmatizer expects.
_WORDNET_POS_BY_TAG_INITIAL = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV,
}


@functools.lru_cache(maxsize=None)
def _ensure_pos_tagger():
    """Download NLTK's perceptron POS tagger if it is missing (checked once per kernel)."""
    # NOTE(review): newer NLTK releases name the resource with an '_eng' suffix; both
    # names are checked so the tagger is available either way -- confirm.
    for resource in ("averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
        try:
            nltk.data.find(f"taggers/{resource}")
        except LookupError:
            nltk.download(resource)


@functools.lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Map the POS tag to the first character lemmatize() accepts.

    The word is tagged on its own (without sentence context) by NLTK's POS tagger and
    the first letter of the tag is translated to WordNet notation. Tags that are not
    adjective/noun/verb/adverb fall back to noun.

    Results are cached per word: the same tokens recur throughout the corpus, and the
    resource lookup plus tagging were previously repeated for every single token.
    """
    _ensure_pos_tagger()
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return _WORDNET_POS_BY_TAG_INITIAL.get(tag, wordnet.NOUN)



# Initialize the stemmer and lemmatizer.
# `stemmer` is not used by preprocess_text; it is kept in case other cells reference it.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# English stop words, built once instead of on every preprocess_text call.
_PREPROCESS_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Lower-case, tokenize, remove English stop words and lemmatize a text.

    Args:
        text: the raw text (a string).

    Returns:
        The remaining tokens, each lemmatized with the POS chosen by `get_wordnet_pos`,
        joined by single spaces.

    The returned text has always been built from the lemmatized tokens only; the Porter
    stems that used to be computed alongside were never used and are no longer computed.
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in tokens
        if token not in _PREPROCESS_STOP_WORDS
    )

# Replace every text with its stop-word-free, lemmatized version.
go_emotion_simplified_all['text'] = go_emotion_simplified_all['text'].map(preprocess_text)

# Show the first 10 cleaned texts.
print(go_emotion_simplified_all['text'].head(10))


In [None]:
from sklearn.preprocessing import LabelEncoder

# The 14 target categories: the 13 emotions plus 'neutral'.
emotions = ['happiness', 'sadness', 'fear', 'surprise', 'anger', 'disgust', 'approval',
            'anticipation', 'realization','desire','shame','relief','love','neutral']

# Fit a label encoder on the category names and encode them as integers.
le = LabelEncoder()
encoded_emotions = le.fit(emotions).transform(emotions)

# Show the encoded values on their own ...
print(encoded_emotions)

# ... and next to the category they belong to.
print("Emotion Labels:")
for label, encoded_value in zip(emotions, encoded_emotions):
    print(label, ":", encoded_value)

# Show the encoder object itself.
print("\nLabelEncoder:", le)

In [None]:
from nltk.tokenize import RegexpTokenizer

# Tokens are, in order of preference: runs of word characters, currency amounts such as
# "$3.50", or any other run of non-whitespace characters.
# The `$` is now escaped: unescaped it is the end-of-string anchor, so the currency
# alternative could never match. The last alternative previously required the literal
# text ">,<" and therefore dropped punctuation-only tokens.
tokenizer = RegexpTokenizer(r"\w+|\$[0-9.]+|\S+")

# `text` is the whole-corpus string built in the word-cloud cell; only a preview of the
# tokens is displayed because the full list is far too long for cell output.
tokenizer.tokenize(text)[:50]

In [None]:
# Emotion columns that compete for the single label of each text.
label_columns = ['happiness', 'sadness', 'fear', 'surprise', 'anger', 'disgust', 'approval',
                 'anticipation', 'realization', 'desire', 'shame', 'relief', 'love', 'neutral']

# Label every row with the emotion column holding its largest value; on ties the
# earliest column in `label_columns` wins. `idxmax(axis=1)` does this for the whole
# frame at once instead of calling a Python lambda per row.
# `assign` returns a new frame and overwrites an existing 'label' column, so re-running
# this cell no longer appends a second, duplicate 'label' column.
df_with_label = go_emotion_simplified_all.assign(
    label=go_emotion_simplified_all[label_columns].idxmax(axis=1)
)
go_emotion_simplified_all = df_with_label.copy()

# print the resulting dataframe
print(go_emotion_simplified_all.head(20))


In [None]:
# Keep only the necessary columns: drop the 14 per-emotion score columns by name.
# Dropping by position (columns 1-14) silently removed the wrong columns whenever the
# column layout differed, e.g. when an optional earlier cell had not been run.
emotion_columns_to_drop = ['happiness', 'sadness', 'fear', 'surprise', 'anger', 'disgust',
                           'approval', 'anticipation', 'realization', 'desire', 'shame',
                           'relief', 'love', 'neutral']
go_emotion_simplified_updated = go_emotion_simplified_all.drop(columns=emotion_columns_to_drop)
print(go_emotion_simplified_updated.head())


### CNN Model

In [None]:
!pip install tensorflow #installing tensorflow

### CNN - Variation - 1 - Model Implementation

In [None]:
#CNN - 45% - without early stopping
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalMaxPooling1D, Conv1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt



# Preprocessing: inputs are the cleaned texts, targets the emotion names encoded as
# integer class ids.
X = go_emotion_simplified_updated['text'].values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(go_emotion_simplified_updated['label'].values)

# Hold out 20% for testing, then 20% of the remainder for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the word index on the training texts only; unknown words map to '<OOV>'.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert every split to integer sequences padded to a fixed length.
maxlen = 100
X_train, X_test, X_val = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (X_train, X_test, X_val)
)

# One-hot encode the class ids for the softmax output layer.
num_classes = len(np.unique(y))
y_train, y_test, y_val = (
    to_categorical(class_ids, num_classes) for class_ids in (y_train, y_test, y_val)
)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch order --
# and therefore the reported metrics -- are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Define the model architecture:
# embedding -> 1D convolution -> global max pooling -> dense -> softmax
input_shape = (maxlen,)
input_layer = Input(shape=input_shape)
# The embedding size follows the tokenizer's vocabulary cap instead of repeating the
# literal 10000, so the two cannot drift apart.
embedding_layer = Embedding(input_dim=tokenizer.num_words, output_dim=100)(input_layer)
conv1d_layer_1 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
maxpooling1d_layer_1 = GlobalMaxPooling1D()(conv1d_layer_1)
dropout_layer_1 = Dropout(rate=0.2)(maxpooling1d_layer_1)
dense_layer_1 = Dense(units=64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(rate=0.2)(dense_layer_1)
output_layer = Dense(units=num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model for a fixed number of epochs (this variant has no early stopping).
batch_size = 64
epochs = 10
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))

# Make predictions on the test data and reduce both the predicted probabilities and the
# one-hot targets to integer class ids.
y_pred = np.argmax(model.predict(X_test), axis=1)
y_test = np.argmax(y_test, axis=1)

# Calculate the evaluation metrics. Passing the full label set and the encoder's class
# names labels every row with its emotion and keeps the matrix layout fixed even if a
# class is absent from the test split or never predicted.
class_ids = np.arange(num_classes)
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=label_encoder.classes_, zero_division=0))
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# Plot training and validation accuracy and loss
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Use the number of epochs actually recorded, as the other CNN cells do.
epochs_range = range(len(acc))

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()









## MisClassification Rate Calculation

In [None]:
import numpy as np

def misclassification_rates(conf_matrix):
    """Return the misclassification rate of every true class in a confusion matrix.

    Args:
        conf_matrix: square array-like; entry [i, j] counts samples of true class i
            that were predicted as class j.

    Returns:
        1-D float array; element i is 1 - conf_matrix[i, i] / sum(conf_matrix[i, :]).
        A class without any samples (all-zero row) yields NaN instead of a
        division-by-zero warning.
    """
    counts = np.asarray(conf_matrix, dtype=float)
    samples_per_class = counts.sum(axis=1)
    correct_share = np.divide(
        np.diag(counts),
        samples_per_class,
        out=np.full_like(samples_per_class, np.nan),
        where=samples_per_class > 0,
    )
    return 1 - correct_share


# Define the confusion matrix.
# NOTE(review): these values are hard-coded, presumably copied from the output of an
# earlier CNN run; computing `confusion_matrix(y_test, y_pred)` directly would keep this
# cell in sync with the model that was actually trained -- confirm.
conf_matrix = np.array([[391, 10, 27, 28, 90, 28, 31, 17, 243, 2, 8, 12, 8, 34],
                        [10, 173, 18, 46, 9, 8, 26, 18, 97, 2, 11, 7, 4, 21],
                        [53, 21, 143, 67, 46, 28, 28, 20, 251, 6, 17, 1, 2, 35],
                        [22, 43, 54, 468, 30, 15, 51, 27, 134, 8, 48, 11, 3, 35],
                        [95, 14, 57, 40, 172, 37, 29, 11, 275, 5, 7, 33, 7, 56],
                        [31, 9, 28, 7, 24, 139, 12, 10, 163, 6, 4, 10, 4, 31],
                        [18, 36, 16, 38, 13, 3, 629, 15, 69, 6, 12, 1, 0, 12],
                        [25, 18, 36, 43, 24, 16, 28, 241, 89, 4, 2, 14, 1, 31],
                        [212, 87, 190, 159, 210, 123, 125, 70, 1725, 27, 16, 52, 6, 173],
                        [8, 4, 12, 16, 11, 11, 6, 2, 65, 23, 5, 3, 2, 16],
                        [9, 16, 16, 31, 8, 1, 28, 4, 20, 1, 386, 3, 3, 8],
                        [25, 11, 11, 10, 29, 9, 9, 8, 65, 2, 2, 133, 19, 13],
                        [20, 1, 4, 1, 8, 4, 5, 2, 17, 2, 1, 18, 45, 4],
                        [37, 26, 32, 47, 34, 26, 25, 9, 236, 11, 14, 10, 2, 171]])

# Calculate the misclassification rate for each class
misclass_rate = misclassification_rates(conf_matrix)

print("Misclassification rate for each class:")
for i, rate in enumerate(misclass_rate):
    print(f"Class {i}: {rate:.2%}")



### CNN With Early Stopping

In [None]:
#CNN  - 45% - 49.13% accuracy and 1.92% loss with early stopping
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalMaxPooling1D, Conv1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt



# Preprocessing: inputs are the cleaned texts, targets the emotion names encoded as
# integer class ids.
X = go_emotion_simplified_updated['text'].values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(go_emotion_simplified_updated['label'].values)

# Hold out 20% for testing, then 20% of the remainder for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the word index on the training texts only; unknown words map to '<OOV>'.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert every split to integer sequences padded to a fixed length.
maxlen = 100
X_train, X_test, X_val = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (X_train, X_test, X_val)
)

# One-hot encode the class ids for the softmax output layer.
num_classes = len(np.unique(y))
y_train, y_test, y_val = (
    to_categorical(class_ids, num_classes) for class_ids in (y_train, y_test, y_val)
)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch order --
# and therefore the reported metrics -- are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Define the model architecture:
# embedding -> 1D convolution -> global max pooling -> dense -> softmax
input_shape = (maxlen,)
input_layer = Input(shape=input_shape)
# The embedding size follows the tokenizer's vocabulary cap instead of repeating the
# literal 10000, so the two cannot drift apart.
embedding_layer = Embedding(input_dim=tokenizer.num_words, output_dim=100)(input_layer)
conv1d_layer_1 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
maxpooling1d_layer_1 = GlobalMaxPooling1D()(conv1d_layer_1)
dropout_layer_1 = Dropout(rate=0.2)(maxpooling1d_layer_1)
dense_layer_1 = Dense(units=64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(rate=0.2)(dense_layer_1)
output_layer = Dense(units=num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping: stop once the validation loss has not improved for 3 epochs.
# `restore_best_weights=True` rolls the model back to its best epoch; without it the
# test set is evaluated with weights from `patience` epochs after the best one.
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# Train the model
batch_size = 64
epochs = 10
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), callbacks=[early_stop])


# Evaluate the model on the test set (y_test is still one-hot encoded at this point).
loss, accuracy = model.evaluate(X_test, y_test)

print('Test loss:', loss)
print('Test accuracy:', accuracy)

# Make predictions on the test data and reduce both the predicted probabilities and the
# one-hot targets to integer class ids.
y_pred = np.argmax(model.predict(X_test), axis=1)
y_test = np.argmax(y_test, axis=1)

# Calculate the evaluation metrics. Passing the full label set and the encoder's class
# names labels every row with its emotion and keeps the matrix layout fixed even if a
# class is absent from the test split or never predicted.
class_ids = np.arange(num_classes)
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=label_encoder.classes_, zero_division=0))
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# Plot training and validation accuracy and loss
# (from here on `loss` holds the per-epoch training loss, not the test loss above).
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Early stopping may end training before `epochs`, so use the recorded length.
epochs_range = range(len(acc))

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()










### CNN Variation 2 - Loss Function - KL Divergence loss function instead of Categorical Cross-Entropy loss function

In [None]:
#CNN - final - 45% - 49.42 with early stopping and changed loss function to KLDivergence loss function variation 2/1
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalMaxPooling1D, Conv1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt



# Preprocessing: inputs are the cleaned texts, targets the emotion names encoded as
# integer class ids.
X = go_emotion_simplified_updated['text'].values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(go_emotion_simplified_updated['label'].values)

# Hold out 20% for testing, then 20% of the remainder for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the word index on the training texts only; unknown words map to '<OOV>'.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert every split to integer sequences padded to a fixed length.
maxlen = 100
X_train, X_test, X_val = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (X_train, X_test, X_val)
)

# One-hot encode the class ids for the softmax output layer.
num_classes = len(np.unique(y))
y_train, y_test, y_val = (
    to_categorical(class_ids, num_classes) for class_ids in (y_train, y_test, y_val)
)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch order --
# and therefore the reported metrics -- are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Define the model architecture:
# embedding -> 1D convolution -> global max pooling -> dense -> softmax
input_shape = (maxlen,)
input_layer = Input(shape=input_shape)
# The embedding size follows the tokenizer's vocabulary cap instead of repeating the
# literal 10000, so the two cannot drift apart.
embedding_layer = Embedding(input_dim=tokenizer.num_words, output_dim=100)(input_layer)
conv1d_layer_1 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
maxpooling1d_layer_1 = GlobalMaxPooling1D()(conv1d_layer_1)
dropout_layer_1 = Dropout(rate=0.2)(maxpooling1d_layer_1)
dense_layer_1 = Dense(units=64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(rate=0.2)(dense_layer_1)
output_layer = Dense(units=num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)
# Variation: Kullback-Leibler divergence instead of categorical cross-entropy.
# NOTE(review): for one-hot targets KL(y || p) = CE(y, p) - H(y) with H(y) = 0, so this
# objective should match cross-entropy up to Keras' epsilon clipping; accuracy differences
# between the two variants are then expected to be run-to-run noise -- confirm.
# The loss is passed as an object because the long string alias
# 'kullback_leibler_divergence' appears not to be registered in newer Keras releases
# -- TODO confirm against the installed version.
model.compile(optimizer='adam', loss=tf.keras.losses.KLDivergence(), metrics=['accuracy'])

# Set up early stopping: stop once the validation loss has not improved for 3 epochs.
# `restore_best_weights=True` rolls the model back to its best epoch; without it the
# test set is evaluated with weights from `patience` epochs after the best one.
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# Train the model
batch_size = 64
epochs = 10
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), callbacks=[early_stop])


# Evaluate the model on the test set (y_test is still one-hot encoded at this point).
loss, accuracy = model.evaluate(X_test, y_test)

print('Test loss:', loss)
print('Test accuracy:', accuracy)

# Make predictions on the test data and reduce both the predicted probabilities and the
# one-hot targets to integer class ids.
y_pred = np.argmax(model.predict(X_test), axis=1)
y_test = np.argmax(y_test, axis=1)

# Calculate the evaluation metrics. Passing the full label set and the encoder's class
# names labels every row with its emotion and keeps the matrix layout fixed even if a
# class is absent from the test split or never predicted.
class_ids = np.arange(num_classes)
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=label_encoder.classes_, zero_division=0))
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# Plot training and validation accuracy and loss
# (from here on `loss` holds the per-epoch training loss, not the test loss above).
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Early stopping may end training before `epochs`, so use the recorded length.
epochs_range = range(len(acc))

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()










## CNN Variation 2 - KL Divergence loss function and optimizer changed from Adam to Adamax

In [None]:
#CNN - final - 45% - 49.42-49.42-53 with early stopping and changed loss function to KLDivergence loss function  and adamax optimizer from adam variation 2
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalMaxPooling1D, Conv1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt



# Preprocessing: inputs are the cleaned texts, targets the emotion names encoded as
# integer class ids.
X = go_emotion_simplified_updated['text'].values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(go_emotion_simplified_updated['label'].values)

# Hold out 20% for testing, then 20% of the remainder for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the word index on the training texts only; unknown words map to '<OOV>'.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert every split to integer sequences padded to a fixed length.
maxlen = 100
X_train, X_test, X_val = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (X_train, X_test, X_val)
)

# One-hot encode the class ids for the softmax output layer.
num_classes = len(np.unique(y))
y_train, y_test, y_val = (
    to_categorical(class_ids, num_classes) for class_ids in (y_train, y_test, y_val)
)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch order --
# and therefore the reported metrics -- are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Define the model architecture:
# embedding -> 1D convolution -> global max pooling -> dense -> softmax
input_shape = (maxlen,)
input_layer = Input(shape=input_shape)
# The embedding size follows the tokenizer's vocabulary cap instead of repeating the
# literal 10000, so the two cannot drift apart.
embedding_layer = Embedding(input_dim=tokenizer.num_words, output_dim=100)(input_layer)
conv1d_layer_1 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
maxpooling1d_layer_1 = GlobalMaxPooling1D()(conv1d_layer_1)
dropout_layer_1 = Dropout(rate=0.2)(maxpooling1d_layer_1)
dense_layer_1 = Dense(units=64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(rate=0.2)(dense_layer_1)
output_layer = Dense(units=num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)
# Variation: Adamax optimizer together with the Kullback-Leibler divergence loss.
# The loss is passed as an object because the long string alias
# 'kullback_leibler_divergence' appears not to be registered in newer Keras releases
# -- TODO confirm against the installed version.
model.compile(optimizer='adamax', loss=tf.keras.losses.KLDivergence(), metrics=['accuracy'])

# Set up early stopping: stop once the validation loss has not improved for 3 epochs.
# `restore_best_weights=True` rolls the model back to its best epoch; without it the
# test set is evaluated with weights from `patience` epochs after the best one.
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# Train the model
batch_size = 64
epochs = 10
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), callbacks=[early_stop])


# Evaluate the model on the test set (y_test is still one-hot encoded at this point).
loss, accuracy = model.evaluate(X_test, y_test)

print('Test loss:', loss)
print('Test accuracy:', accuracy)

# Make predictions on the test data and reduce both the predicted probabilities and the
# one-hot targets to integer class ids.
y_pred = np.argmax(model.predict(X_test), axis=1)
y_test = np.argmax(y_test, axis=1)

# Calculate the evaluation metrics. Passing the full label set and the encoder's class
# names labels every row with its emotion and keeps the matrix layout fixed even if a
# class is absent from the test split or never predicted.
class_ids = np.arange(num_classes)
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=label_encoder.classes_, zero_division=0))
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# Plot training and validation accuracy and loss
# (from here on `loss` holds the per-epoch training loss, not the test loss above).
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Early stopping may end training before `epochs`, so use the recorded length.
epochs_range = range(len(acc))

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()










## Misclassification Rate Calculation

In [None]:
import numpy as np

# Hard-coded 14x14 confusion matrix, one row and one column per class.
# NOTE(review): presumably pasted from the confusion_matrix output printed by the
# CNN cell above, with rows as true classes - confirm it is still current.
conf_matrix = np.array([
    [456, 4, 7, 13, 71, 8, 16, 16, 289, 0, 15, 16, 6, 12],
    [7, 201, 5, 38, 6, 3, 19, 13, 121, 0, 12, 0, 3, 22],
    [29, 14, 105, 69, 31, 13, 29, 31, 365, 0, 17, 4, 1, 10],
    [16, 24, 14, 534, 23, 8, 46, 37, 170, 0, 50, 8, 3, 16],
    [82, 14, 20, 43, 204, 16, 23, 14, 371, 1, 4, 22, 8, 16],
    [23, 6, 8, 15, 66, 118, 8, 10, 191, 0, 4, 9, 4, 16],
    [14, 15, 7, 34, 4, 0, 674, 17, 75, 0, 16, 2, 0, 10],
    [14, 17, 9, 32, 13, 5, 17, 338, 112, 0, 1, 5, 2, 7],
    [164, 28, 78, 114, 179, 32, 88, 75, 2306, 0, 11, 33, 9, 58],
    [11, 1, 11, 17, 10, 9, 6, 5, 84, 0, 7, 3, 9, 11],
    [4, 12, 5, 29, 4, 0, 31, 5, 18, 0, 422, 1, 1, 2],
    [15, 8, 4, 8, 41, 3, 9, 4, 83, 0, 3, 125, 39, 4],
    [4, 2, 1, 1, 12, 2, 2, 1, 17, 0, 3, 20, 65, 2],
    [22, 12, 8, 39, 31, 7, 22, 16, 307, 0, 16, 8, 7, 185],
])

# Per-class error: one minus (diagonal entry / row total).
samples_per_class = conf_matrix.sum(axis=1)
correct_per_class = conf_matrix.diagonal()
misclass_rate = 1 - correct_per_class / samples_per_class

# Report each class index with its rate formatted as a percentage.
print("Misclassification rate for each class:")
for i in range(len(misclass_rate)):
    rate = misclass_rate[i]
    print(f"Class {i}: {rate:.2%}")

# CNN Variation - 3 - Hyper parameter tuning - batch_size = 64 to 32, epochs = 10 to 20

In [None]:
#CNN - final - 45% - 49.42-49.42-54 with early stopping and changed loss function to KLDivergence loss function  and adamax optimizer from adam variation 2
#Variation 3 - hyper parameter tuning - batch_size = 64 to 32, epochs = 10 to 20 - 53% accuracy - 1.65% test loss

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalMaxPooling1D, Conv1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt



# Preprocessing: pull the raw texts and their labels out of the prepared frame
X = go_emotion_simplified_updated['text'].values
y = go_emotion_simplified_updated['label'].values

# Replace every distinct label with an integer id
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Hold out 20% for testing, then 20% of the remainder for validation
# (0.8 * 0.8 = 64% train, 16% validation, 20% test overall)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, test_size=0.2)

# Build the vocabulary from the training texts only, then turn every split
# into integer sequences padded/truncated to a fixed length
maxlen = 100
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train, X_test, X_val = [
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (X_train, X_test, X_val)
]

# One-hot encode the integer labels of every split
num_classes = len(np.unique(y))
y_train, y_test, y_val = [
    to_categorical(label_ids, num_classes)
    for label_ids in (y_train, y_test, y_val)
]


# Model: embedding -> 1D convolution -> global max pooling -> dense head,
# with dropout before and after the hidden dense layer
input_shape = (maxlen,)
input_layer = Input(shape=input_shape)
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
conv1d_layer_1 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
maxpooling1d_layer_1 = GlobalMaxPooling1D()(conv1d_layer_1)
dropout_layer_1 = Dropout(rate=0.2)(maxpooling1d_layer_1)
dense_layer_1 = Dense(units=64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(rate=0.2)(dense_layer_1)
output_layer = Dense(units=num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer='adamax',
    loss='kullback_leibler_divergence',
    metrics=['accuracy'],
)

# Early stopping watches the validation loss with a patience of 3 epochs
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

# Training settings that distinguish this variation
batch_size = 32  # batch size changed
epochs = 20  # epoch count changed
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
)



# Score the trained model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)

print('Test loss:', loss)
print('Test accuracy:', accuracy)



# Predicted and true class ids for the test split
# (y_test goes from one-hot rows back to integer ids here)
y_pred = np.argmax(model.predict(X_test), axis=1)
y_test = np.argmax(y_test, axis=1)

# Per-class metrics and the raw confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Learning curves recorded during training
# (note: `loss` is rebound from the scalar test loss to the per-epoch training loss)
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(len(acc))

# Two side-by-side panels: accuracy on the left, loss on the right
plt.figure(figsize=(16, 8))
panels = (
    (acc, val_acc, 'Training Accuracy', 'Validation Accuracy', 'lower right', 'Training and Validation Accuracy'),
    (loss, val_loss, 'Training Loss', 'Validation Loss', 'upper right', 'Training and Validation Loss'),
)
for panel_index, (train_curve, val_curve, train_label, val_label, legend_loc, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.legend(loc=legend_loc)
    plt.title(panel_title)
plt.show()










In [None]:
import numpy as np

# Hard-coded 14x14 confusion matrix, one row and one column per class.
# NOTE(review): presumably pasted from the confusion_matrix output of the
# variation-3 CNN cell above, with rows as true classes - confirm.
conf_matrix = np.array([
    [423, 5, 16, 11, 74, 15, 22, 18, 296, 3, 17, 9, 7, 13],
    [6, 194, 10, 37, 8, 4, 25, 14, 107, 0, 13, 3, 2, 27],
    [33, 18, 124, 70, 28, 14, 24, 30, 338, 0, 18, 4, 1, 16],
    [17, 23, 28, 529, 7, 8, 47, 38, 176, 5, 49, 5, 1, 16],
    [79, 14, 31, 32, 193, 10, 31, 14, 381, 5, 5, 16, 7, 20],
    [37, 5, 13, 10, 28, 136, 12, 5, 197, 1, 4, 6, 4, 20],
    [13, 15, 7, 34, 4, 2, 682, 17, 64, 1, 14, 2, 0, 13],
    [15, 19, 9, 31, 13, 3, 22, 331, 110, 0, 1, 8, 2, 8],
    [190, 31, 97, 138, 125, 37, 110, 69, 2255, 5, 10, 23, 8, 77],
    [11, 2, 8, 16, 7, 5, 6, 4, 85, 17, 9, 4, 2, 8],
    [2, 11, 2, 33, 5, 0, 27, 4, 20, 2, 420, 2, 1, 5],
    [13, 7, 3, 9, 31, 3, 9, 5, 89, 0, 4, 135, 33, 5],
    [6, 1, 3, 1, 18, 0, 3, 2, 18, 0, 3, 17, 59, 1],
    [28, 12, 13, 39, 19, 5, 25, 10, 304, 4, 14, 8, 5, 194],
])

# Fraction of each row that sits on the diagonal, subtracted from one.
row_totals = conf_matrix.sum(axis=1)
diagonal_counts = np.diagonal(conf_matrix)
misclass_rate = 1 - diagonal_counts / row_totals

# One line per class index, rate shown as a percentage.
print("Misclassification rate for each class:")
for i in range(misclass_rate.size):
    rate = misclass_rate[i]
    print(f"Class {i}: {rate:.2%}")

# CNN - Variation 4 - Without One hot Encoding

In [None]:
#CNN - final - 45% - 49.42-49.42-54 with early stopping and changed loss function to KLDivergence loss function  and adamax optimizer from adam variation 2
#Variation 2 - hyper parameter tuning - batch_size = 64 to 32, epochs = 10 to 20 - 53% accuracy - 1.65% test loss
#Variation - 3 Text featurisation - without one hot encoding: targets stay as integer class ids.
#  The result previously noted here (accuracy 3.18%, loss 33%) was produced by passing those integer
#  ids to 'kullback_leibler_divergence', which expects a probability distribution per sample.
#  The cell now uses 'sparse_categorical_crossentropy'; re-run to obtain fresh numbers.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalMaxPooling1D, Conv1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt


# Preprocessing
X = go_emotion_simplified_updated['text'].values
y = go_emotion_simplified_updated['label'].values

# Convert the labels to integer class ids (kept as-is: no one-hot encoding in this variation)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Number of output units. Defined here so the cell does not depend on a
# `num_classes` left behind by a previously executed cell.
num_classes = len(label_encoder.classes_)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Tokenize the text data (vocabulary is fitted on the training texts only)
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_val = tokenizer.texts_to_sequences(X_val)

# Pad the sequences to a fixed length
maxlen = 100
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)
X_val = pad_sequences(X_val, maxlen=maxlen)

# Define the model architecture
input_shape = (maxlen,)
input_layer = Input(shape=input_shape)
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
conv1d_layer_1 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
maxpooling1d_layer_1 = GlobalMaxPooling1D()(conv1d_layer_1)
dropout_layer_1 = Dropout(rate=0.2)(maxpooling1d_layer_1)
dense_layer_1 = Dense(units=64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(rate=0.2)(dense_layer_1)
output_layer = Dense(units=num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)
# Integer targets need the sparse loss. For a one-hot target the KL divergence
# used in the other variations reduces to the cross-entropy (a one-hot
# distribution has zero entropy), so the training objective stays comparable.
model.compile(optimizer='adamax', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping on the validation loss
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)


# Train the model
batch_size = 64
epochs = 10
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),callbacks=[early_stop] )


# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)

print('Test loss:', loss)
print('Test accuracy:', accuracy)



# Make predictions on the test data
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_test = y_test.astype(int)  # already integer class ids; only the dtype is normalised

# Calculate the evaluation metrics
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Plot training and validation accuracy and loss
# (note: `loss` is rebound from the scalar test loss to the per-epoch training loss)
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(len(acc))

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()










# CNN Variation - 5 - Change train validation test split into 70/30

In [None]:
#CNN - final - 45% - 49.42%-49.42%-54% with early stopping and changed loss function to KLDivergence loss function  and adamax optimizer from adam variation 2
#Variation - 4 Change train validation test split into 70/30 - 53% accuracy and 1.64% loss

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalMaxPooling1D, Conv1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt



# Preprocessing: raw texts and labels from the prepared frame
X = go_emotion_simplified_updated['text'].values
y = go_emotion_simplified_updated['label'].values

# Integer-encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Hold out 30% for testing, then 30% of the remainder for validation
# (0.7 * 0.7 = 49% train, 21% validation, 30% test overall)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42, test_size=0.3)

# Vocabulary comes from the training texts only; each split is then
# converted to integer sequences and padded/truncated to `maxlen`
maxlen = 100
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train, X_test, X_val = [
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (X_train, X_test, X_val)
]

# One-hot encode the integer labels of every split
num_classes = len(np.unique(y))
y_train, y_test, y_val = [
    to_categorical(label_ids, num_classes)
    for label_ids in (y_train, y_test, y_val)
]

# Model: embedding -> 1D convolution -> global max pooling -> dense head,
# with dropout before and after the hidden dense layer
input_shape = (maxlen,)
input_layer = Input(shape=input_shape)
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
conv1d_layer_1 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
maxpooling1d_layer_1 = GlobalMaxPooling1D()(conv1d_layer_1)
dropout_layer_1 = Dropout(rate=0.2)(maxpooling1d_layer_1)
dense_layer_1 = Dense(units=64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(rate=0.2)(dense_layer_1)
output_layer = Dense(units=num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(
    optimizer='adamax',
    loss='kullback_leibler_divergence',
    metrics=['accuracy'],
)

# Early stopping watches the validation loss with a patience of 3 epochs
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)


# Training run
batch_size = 64
epochs = 10
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[early_stop],
)


# Score the trained model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)

print('Test loss:', loss)
print('Test accuracy:', accuracy)



# Predicted and true class ids for the test split
# (y_test goes from one-hot rows back to integer ids here)
y_pred = np.argmax(model.predict(X_test), axis=1)
y_test = np.argmax(y_test, axis=1)

# Per-class metrics and the raw confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Learning curves recorded during training
# (note: `loss` is rebound from the scalar test loss to the per-epoch training loss)
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(len(acc))

# Two side-by-side panels: accuracy on the left, loss on the right
plt.figure(figsize=(16, 8))
panels = (
    (acc, val_acc, 'Training Accuracy', 'Validation Accuracy', 'lower right', 'Training and Validation Accuracy'),
    (loss, val_loss, 'Training Loss', 'Validation Loss', 'upper right', 'Training and Validation Loss'),
)
for panel_index, (train_curve, val_curve, train_label, val_label, legend_loc, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_index)
    plt.plot(epochs_range, train_curve, label=train_label)
    plt.plot(epochs_range, val_curve, label=val_label)
    plt.legend(loc=legend_loc)
    plt.title(panel_title)
plt.show()










# Some Hyper Parameter Tuning for accuracy

In [None]:
#CNN - final - 45% - 49.42-49.42-54 with early stopping and changed loss function to KLDivergence loss function  and adamax optimizer from adam variation 2
#batch size - 128 and train val test split -0.1
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalMaxPooling1D, Conv1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, f1_score
import matplotlib.pyplot as plt



# Preprocessing
X = go_emotion_simplified_updated['text'].values
y = go_emotion_simplified_updated['label'].values

# Convert the labels to numerical values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into train and test sets (10% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Split the remainder into training and validation sets
# (0.9 * 0.9 = 81% train, 9% validation, 10% test overall)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Tokenize the text data (vocabulary is fitted on the training texts only)
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_val = tokenizer.texts_to_sequences(X_val)

# Pad the sequences to a fixed length
maxlen = 100
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)
X_val = pad_sequences(X_val, maxlen=maxlen)

# Convert the emotion labels to one-hot encoded vectors
num_classes = len(np.unique(y))
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)
y_val = to_categorical(y_val, num_classes)

# Define the model architecture
input_shape = (maxlen,)
input_layer = Input(shape=input_shape)
embedding_layer = Embedding(input_dim=10000, output_dim=100)(input_layer)
conv1d_layer_1 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
maxpooling1d_layer_1 = GlobalMaxPooling1D()(conv1d_layer_1)
dropout_layer_1 = Dropout(rate=0.2)(maxpooling1d_layer_1)
dense_layer_1 = Dense(units=64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(rate=0.2)(dense_layer_1)
output_layer = Dense(units=num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adamax', loss='kullback_leibler_divergence', metrics=['accuracy'])

# Set up early stopping on the validation loss
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)


# Train the model
batch_size = 128
epochs = 10
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),callbacks=[early_stop] )


# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)

print('Test loss:', loss)
print('Test accuracy:', accuracy)



# Make predictions on the test data
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_test = np.argmax(y_test, axis=1)  # one-hot rows back to integer class ids

# Calculate the evaluation metrics
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


# Macro-averaged summary scores. zero_division=0 scores a class that is never
# predicted as 0 instead of a perfect 1, which keeps these numbers in line with
# the classification report printed above.
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

# Show the scores; previously they were computed and then discarded
print('Macro precision:', precision)
print('Macro F1:', f1)

# Plot training and validation accuracy and loss
# (note: `loss` is rebound from the scalar test loss to the per-epoch training loss)
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(len(acc))

plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()










### LSTM - Variation 1

In [None]:
#LSTM - accuracy 42%
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Accuracy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix



# Filter the data to include only the relevant columns and labels
go_emotion_simplified_updated = go_emotion_simplified_updated[['text', 'label']]


# Split the data into train and test sets
train_data, test_data = train_test_split(go_emotion_simplified_updated, test_size=0.2, random_state=42)

# Encode the labels using LabelEncoder (fitted on the training labels)
le = LabelEncoder()
le.fit(train_data['label'])

# Integer ids and one-hot targets, built once and reused by fit/evaluate.
# Passing num_classes explicitly keeps the target width equal to the output
# layer even if the highest class id is missing from a split.
y_true = le.transform(test_data['label'])
train_labels = to_categorical(le.transform(train_data['label']), num_classes=len(le.classes_))
test_labels = to_categorical(y_true, num_classes=len(le.classes_))

# Tokenize the text data and pad the sequences to a fixed length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

max_sequence_length = 100

train_sequences = tokenizer.texts_to_sequences(train_data['text'])
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)

test_sequences = tokenizer.texts_to_sequences(test_data['text'])
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)




# Create the model: embedding -> bidirectional LSTM -> dense head
embedding_size = 100

model = Sequential()
# +1 on the vocabulary size because the tokenizer's word indices start at 1 (0 is the padding value)
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=embedding_size, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(units=64)))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=len(le.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
epochs = 10
batch_size = 32

# NOTE: the test split doubles as validation data here, so the "validation"
# curves plotted below are computed on the same samples as the final test score.
history = model.fit(train_sequences, train_labels, epochs=epochs, batch_size=batch_size, validation_data=(test_sequences, test_labels))

# Evaluate the model
test_loss, test_acc = model.evaluate(test_sequences, test_labels)
print('Test accuracy:', test_acc)
print("Test loss:", test_loss)


# Make predictions on the test data
y_pred = model.predict(test_sequences).argmax(axis=-1)
#y_pred = model.predict_classes(test_sequences)

# Print the classification report and confusion matrix
print(classification_report(y_true, y_pred, target_names=le.classes_))
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') would round them to two significant digits / scientific notation.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Plot the training and validation accuracy and loss
plt.figure(figsize=(8, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.figure(figsize=(8, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


## LSTM - Variation 2 - One Hot Encoding Used

In [None]:
#LSTM - accuracy 42% - one hot encoding used - variation 1 LSTM in datapreprocessing - 41% after variation
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Accuracy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from keras.utils import to_categorical

# Filter the data to include only the relevant columns and labels
go_emotion_simplified_updated = go_emotion_simplified_updated[['text', 'label']]

# Split the data into train and test sets
train_data, test_data = train_test_split(go_emotion_simplified_updated, test_size=0.2, random_state=42)

# Encode the labels using LabelEncoder (fitted on the training labels)
le = LabelEncoder()
le.fit(train_data['label'])

# Tokenize the text data and pad the sequences to a fixed length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

max_sequence_length = 100

train_sequences = tokenizer.texts_to_sequences(train_data['text'])
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)

test_sequences = tokenizer.texts_to_sequences(test_data['text'])
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Integer class ids for both splits
y_true = le.transform(test_data['label'])
train_labels = le.transform(train_data['label'])
test_labels = y_true

# Convert the labels to one-hot encoded vectors
num_classes = len(set(train_data['label']))
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)



# Create the model: embedding -> bidirectional LSTM -> dense head
embedding_size = 100

model = Sequential()
# +1 on the vocabulary size because the tokenizer's word indices start at 1 (0 is the padding value)
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=embedding_size, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(units=64)))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=len(le.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
epochs = 10
batch_size = 32

# Train on the one-hot targets prepared above (they were previously computed
# but ignored, with fit/evaluate re-encoding the labels inline).
# NOTE: the test split doubles as validation data here, so the "validation"
# curves plotted below are computed on the same samples as the final test score.
history = model.fit(train_sequences, train_labels, epochs=epochs, batch_size=batch_size, validation_data=(test_sequences, test_labels))



# Evaluate the model
test_loss, test_acc = model.evaluate(test_sequences, test_labels)
print('Test accuracy:', test_acc)

print("Test loss:", test_loss)


# Make predictions on the test data
y_pred = model.predict(test_sequences).argmax(axis=-1)
#y_pred = model.predict_classes(test_sequences)

# Print the classification report and confusion matrix
print(classification_report(y_true, y_pred, target_names=le.classes_))
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') would round them to two significant digits / scientific notation.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Plot the training and validation accuracy and loss
plt.figure(figsize=(8, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.figure(figsize=(8, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


# LSTM - Variation -3 - 70/30 Train test split from 80/20 - 42% accuracy, 3.28% loss

In [None]:
#LSTM - accuracy 43% - change label encoding to one hot encoding - variation 2 LSTM in datapreprocessing - 43% after variation
#Variation 3 - 70/30 Train test split from 80/20 - 42% accuracy, 3.28% loss
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Accuracy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from keras.utils import to_categorical

# Filter the data to include only the relevant columns and labels
go_emotion_simplified_updated = go_emotion_simplified_updated[['text', 'label']]

# Split the data into train and test sets (70/30 in this variation)
train_data, test_data = train_test_split(go_emotion_simplified_updated, test_size=0.3, random_state=42)

# Encode the labels using LabelEncoder (fitted on the training labels)
le = LabelEncoder()
le.fit(train_data['label'])

# Tokenize the text data and pad the sequences to a fixed length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

max_sequence_length = 100

train_sequences = tokenizer.texts_to_sequences(train_data['text'])
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length)

test_sequences = tokenizer.texts_to_sequences(test_data['text'])
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Integer class ids for both splits
y_true = le.transform(test_data['label'])
train_labels = le.transform(train_data['label'])
test_labels = y_true

# Convert the labels to one-hot encoded vectors
num_classes = len(set(train_data['label']))
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)



# Create the model: embedding -> bidirectional LSTM -> dense head
embedding_size = 100

model = Sequential()
# +1 on the vocabulary size because the tokenizer's word indices start at 1 (0 is the padding value)
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=embedding_size, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(units=64)))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=len(le.classes_), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
epochs = 10
batch_size = 32

# Train on the one-hot targets prepared above (they were previously computed
# but ignored, with fit/evaluate re-encoding the labels inline).
# NOTE: the test split doubles as validation data here, so the "validation"
# curves plotted below are computed on the same samples as the final test score.
history = model.fit(train_sequences, train_labels, epochs=epochs, batch_size=batch_size, validation_data=(test_sequences, test_labels))



# Evaluate the model
test_loss, test_acc = model.evaluate(test_sequences, test_labels)
print('Test accuracy:', test_acc)

print("Test loss:", test_loss)


# Make predictions on the test data
y_pred = model.predict(test_sequences).argmax(axis=-1)
#y_pred = model.predict_classes(test_sequences)

# Print the classification report and confusion matrix
print(classification_report(y_true, y_pred, target_names=le.classes_))
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 6))
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') would round them to two significant digits / scientific notation.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Plot the training and validation accuracy and loss
plt.figure(figsize=(8, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.figure(figsize=(8, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


## LSTM Variation - 4 - Optimizer: Adamax, loss function: KL divergence

In [None]:
#LSTM - accuracy 43% - change label encoding to one hot encoding - variation 1 LSTM in datapreprocessing - 43% after variation
#Optimizer - Adamax and loss function - KL divergence

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Accuracy
import matplotlib.pyplot as plt


import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from keras.utils import to_categorical

# Keep only the two columns this cell consumes: the raw text and its label.
go_emotion_simplified_updated = go_emotion_simplified_updated.loc[:, ['text', 'label']]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    go_emotion_simplified_updated,
    test_size=0.2,
    random_state=42,
)

# Both the label encoder and the tokenizer vocabulary are fitted on the
# training split only.
le = LabelEncoder().fit(train_data['label'])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Each text becomes a sequence of word indices, padded to this fixed length.
max_sequence_length = 100
train_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_data['text']),
    maxlen=max_sequence_length,
)
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_data['text']),
    maxlen=max_sequence_length,
)

# Integer-encode the labels, then expand them into one-hot rows that are
# num_classes wide.
num_classes = len(set(train_data['label']))
train_labels = to_categorical(le.transform(train_data['label']), num_classes=num_classes)
test_labels = to_categorical(le.transform(test_data['label']), num_classes=num_classes)





# Create the model: embedding -> bidirectional LSTM -> dense head with one
# softmax output per class.
embedding_size = 100

model = Sequential([
    # +1 because the tokenizer's word indices start at 1; index 0 is the padding value.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_size, input_length=max_sequence_length),
    Bidirectional(LSTM(units=64)),
    Dense(units=32, activation='relu'),
    # Output width is tied to num_classes so it always matches the one-hot targets.
    Dense(units=num_classes, activation='softmax'),
])

model.compile(optimizer='adamax', loss='kullback_leibler_divergence', metrics=['accuracy'])

# Train the model
epochs = 10
batch_size = 32

# Reuse the one-hot targets built above with an explicit num_classes.
# Re-encoding here with to_categorical(...) and no num_classes infers the width
# from the largest id present, which can come out narrower than the softmax
# layer when the highest class id is missing from the test split.
# NOTE: the test split doubles as validation data, so the validation curves
# are computed on the same rows as the final evaluation below.
history = model.fit(
    train_sequences,
    train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(test_sequences, test_labels),
)

# Evaluate the model on the held-out split
test_loss, test_acc = model.evaluate(test_sequences, test_labels)
print('Test accuracy:', test_acc)

print("Test loss:", test_loss)


from sklearn.metrics import precision_recall_fscore_support

# Make predictions on the test data: integer-encoded ground truth versus the
# arg-max of the model's per-class output scores.
y_true = le.transform(test_data['label'])
y_pred = model.predict(test_sequences).argmax(axis=-1)

# LabelEncoder assigns ids 0..len(le.classes_)-1. Passing that id list
# explicitly keeps every metric array, report row and confusion-matrix axis
# aligned with le.classes_, even when a class never appears in y_true or y_pred.
class_ids = list(range(len(le.classes_)))
class_names = [str(name) for name in le.classes_]

# zero_division=0 scores an undefined metric (e.g. the precision of a class
# that is never predicted) as 0 rather than a perfect 1, so the per-class
# printout agrees with the classification report below.
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=class_ids, zero_division=0
)

# Print the precision, recall, and F-score for each class
for i in class_ids:
    print(f"Class {i}: precision={precision[i]}, recall={recall[i]}, F-score={fscore[i]}")

# Print the classification report and plot the confusion matrix
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(8, 6))
# fmt='d' renders the cell counts as plain integers.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Plot the training and validation accuracy and loss, one figure per metric
for metric_key, metric_title in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(history.history[metric_key], label=f'Training {metric_title}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_title}')
    ax.set_title(f'Training and Validation {metric_title}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_title)
    ax.legend()
    plt.show()


In [None]:
# LSTM (accuracy 43%) - variation 1 in the LSTM data preprocessing: label encoding changed to one-hot encoding - accuracy 43% after the variation
# Optimizer: Adamax; loss function: KL divergence
# Test size: 0.1

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Accuracy
import matplotlib.pyplot as plt


import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from keras.utils import to_categorical

# Keep only the two columns this cell consumes: the raw text and its label.
go_emotion_simplified_updated = go_emotion_simplified_updated[['text', 'label']]

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(go_emotion_simplified_updated, test_size=0.1, random_state=42)

# Both the label encoder and the tokenizer vocabulary are fitted on the
# training split only.
le = LabelEncoder()
le.fit(train_data['label'])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Each text becomes a sequence of word indices, padded to this fixed length.
max_sequence_length = 100

train_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_data['text']),
    maxlen=max_sequence_length,
)
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_data['text']),
    maxlen=max_sequence_length,
)

# Integer-encode the labels, then expand them into one-hot rows that are
# num_classes wide.
num_classes = len(set(train_data['label']))
train_labels = to_categorical(le.transform(train_data['label']), num_classes=num_classes)
test_labels = to_categorical(le.transform(test_data['label']), num_classes=num_classes)





# Create the model: embedding -> bidirectional LSTM -> dense head with one
# softmax output per class.
embedding_size = 100

model = Sequential([
    # +1 because the tokenizer's word indices start at 1; index 0 is the padding value.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_size, input_length=max_sequence_length),
    Bidirectional(LSTM(units=64)),
    Dense(units=32, activation='relu'),
    # Output width is tied to num_classes so it always matches the one-hot targets.
    Dense(units=num_classes, activation='softmax'),
])

model.compile(optimizer='adamax', loss='kullback_leibler_divergence', metrics=['accuracy'])

# Train the model
epochs = 10
batch_size = 32

# Reuse the one-hot targets built above with an explicit num_classes.
# Re-encoding here with to_categorical(...) and no num_classes infers the width
# from the largest id present, which can come out narrower than the softmax
# layer when the highest class id is missing from the test split.
# NOTE: the test split doubles as validation data, so the validation curves
# are computed on the same rows as the final evaluation below.
history = model.fit(
    train_sequences,
    train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(test_sequences, test_labels),
)

# Evaluate the model on the held-out split
test_loss, test_acc = model.evaluate(test_sequences, test_labels)
print('Test accuracy:', test_acc)

print("Test loss:", test_loss)


from sklearn.metrics import precision_recall_fscore_support

# Make predictions on the test data: integer-encoded ground truth versus the
# arg-max of the model's per-class output scores.
y_true = le.transform(test_data['label'])
y_pred = model.predict(test_sequences).argmax(axis=-1)

# LabelEncoder assigns ids 0..len(le.classes_)-1. Passing that id list
# explicitly keeps every metric array, report row and confusion-matrix axis
# aligned with le.classes_, even when a class never appears in y_true or y_pred.
class_ids = list(range(len(le.classes_)))
class_names = [str(name) for name in le.classes_]

# zero_division=0 scores an undefined metric (e.g. the precision of a class
# that is never predicted) as 0 rather than a perfect 1, so the per-class
# printout agrees with the classification report below.
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=class_ids, zero_division=0
)

# Print the precision, recall, and F-score for each class
for i in class_ids:
    print(f"Class {i}: precision={precision[i]}, recall={recall[i]}, F-score={fscore[i]}")

# Print the classification report and plot the confusion matrix
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(8, 6))
# fmt='d' renders the cell counts as plain integers.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Plot the training and validation accuracy and loss, one figure per metric
for metric_key, metric_title in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(history.history[metric_key], label=f'Training {metric_title}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_title}')
    ax.set_title(f'Training and Validation {metric_title}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_title)
    ax.legend()
    plt.show()


In [None]:
# LSTM (accuracy 43%) - variation 1 in the LSTM data preprocessing: label encoding changed to one-hot encoding - accuracy 43% after the variation
# Optimizer: Adamax; loss function: KL divergence
# Test size: 0.1
# Batch size: 128

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Accuracy
import matplotlib.pyplot as plt


import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from keras.utils import to_categorical

# Keep only the two columns this cell consumes: the raw text and its label.
go_emotion_simplified_updated = go_emotion_simplified_updated.loc[:, ['text', 'label']]

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    go_emotion_simplified_updated,
    test_size=0.1,
    random_state=42,
)

# Both the label encoder and the tokenizer vocabulary are fitted on the
# training split only.
le = LabelEncoder().fit(train_data['label'])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Each text becomes a sequence of word indices, padded to this fixed length.
max_sequence_length = 100
train_sequences = pad_sequences(
    tokenizer.texts_to_sequences(train_data['text']),
    maxlen=max_sequence_length,
)
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_data['text']),
    maxlen=max_sequence_length,
)

# Integer-encode the labels, then expand them into one-hot rows that are
# num_classes wide.
num_classes = len(set(train_data['label']))
train_labels = to_categorical(le.transform(train_data['label']), num_classes=num_classes)
test_labels = to_categorical(le.transform(test_data['label']), num_classes=num_classes)





# Create the model: embedding -> bidirectional LSTM -> dense head with one
# softmax output per class.
embedding_size = 100

model = Sequential([
    # +1 because the tokenizer's word indices start at 1; index 0 is the padding value.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_size, input_length=max_sequence_length),
    Bidirectional(LSTM(units=64)),
    Dense(units=32, activation='relu'),
    # Output width is tied to num_classes so it always matches the one-hot targets.
    Dense(units=num_classes, activation='softmax'),
])

model.compile(optimizer='adamax', loss='kullback_leibler_divergence', metrics=['accuracy'])

# Train the model (this variation uses a larger batch size)
epochs = 10
batch_size = 128

# Reuse the one-hot targets built above with an explicit num_classes.
# Re-encoding here with to_categorical(...) and no num_classes infers the width
# from the largest id present, which can come out narrower than the softmax
# layer when the highest class id is missing from the test split.
# NOTE: the test split doubles as validation data, so the validation curves
# are computed on the same rows as the final evaluation below.
history = model.fit(
    train_sequences,
    train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(test_sequences, test_labels),
)

# Evaluate the model on the held-out split
test_loss, test_acc = model.evaluate(test_sequences, test_labels)
print('Test accuracy:', test_acc)

print("Test loss:", test_loss)


from sklearn.metrics import precision_recall_fscore_support

# Make predictions on the test data: integer-encoded ground truth versus the
# arg-max of the model's per-class output scores.
y_true = le.transform(test_data['label'])
y_pred = model.predict(test_sequences).argmax(axis=-1)

# LabelEncoder assigns ids 0..len(le.classes_)-1. Passing that id list
# explicitly keeps every metric array, report row and confusion-matrix axis
# aligned with le.classes_, even when a class never appears in y_true or y_pred.
class_ids = list(range(len(le.classes_)))
class_names = [str(name) for name in le.classes_]

# zero_division=0 scores an undefined metric (e.g. the precision of a class
# that is never predicted) as 0 rather than a perfect 1, so the per-class
# printout agrees with the classification report below.
precision, recall, fscore, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=class_ids, zero_division=0
)

# Print the precision, recall, and F-score for each class
for i in class_ids:
    print(f"Class {i}: precision={precision[i]}, recall={recall[i]}, F-score={fscore[i]}")

# Print the classification report and plot the confusion matrix
print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(8, 6))
# fmt='d' renders the cell counts as plain integers.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Plot the training and validation accuracy and loss, one figure per metric
for metric_key, metric_title in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(history.history[metric_key], label=f'Training {metric_title}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_title}')
    ax.set_title(f'Training and Validation {metric_title}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_title)
    ax.legend()
    plt.show()


## **SVM**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score


# Split the data into train and test sets (80/20, fixed seed for a repeatable split).
# NOTE(review): where go_emotion_simplified_all is built at the top of the
# notebook it only gains 'labels' and 'labels_count' columns -- confirm that a
# single-label 'label' column is added to it before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    go_emotion_simplified_all['text'],
    go_emotion_simplified_all['label'],
    test_size=0.2,
    random_state=42,
)

# Vectorize the text data using the CountVectorizer.
# The stop-word collection previously contained a literal `...` (Ellipsis)
# placeholder, which is not a word; only real strings are listed now.
# CountVectorizer is handed a list, so the set is converted.
stop_words = {'a', 'an', 'the'}  # Set of stop words
stop_words_list = sorted(stop_words)  # Convert set to a (deterministically ordered) list
vectorizer = CountVectorizer(stop_words=stop_words_list)
# The vocabulary is learned from the training texts only and reused for the test texts.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train the SVM model
svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

