# 1. Data Familiarisation

In [None]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import necessary libraries
import pandas as pd
import numpy as np

In [None]:
# read the dataset
data=pd.read_csv('/content/drive/MyDrive/tweet_emotions.csv')

In [None]:
data.head() # print the first 5 rows of the dataset

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [None]:
# we dont need the tweet_id oclumn, so we can remove that column
data=data.drop('tweet_id',axis=1)

In [None]:
# check for the shape of the dataset
data.shape

(40000, 2)

The dataset got 40000 rows and 2 columns

In [None]:
# lets check the missing values
data.isna().sum()

sentiment    0
content      0
dtype: int64

There is no missing values in the dataset

In [None]:
# check the info about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  40000 non-null  object
 1   content    40000 non-null  object
dtypes: object(2)
memory usage: 625.1+ KB


In [None]:
# lets check the values in sentiment column
data['sentiment'].value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

There are 13 different emotions means classes. so its a multi class classification problem.

# 2. Data Preprocessing

In [None]:
# import necessary libarraies
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [None]:
# down load punkt, stop words, wordnet
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# we can clean or pre process the text data
# this including removing html tags, punctuations and stop words from the content column
# also need to convert the text to lower case, do lemmetisation to convert the words into its root form


# create a function to remove punctuation from the string
# check the charcter by character then join then back

def remove_punctuations(text):
  punctuations=string.punctuation
  return ''.join([char for char in text if char not in punctuations])


In [None]:
# create a function to tokenise the text
# tokenise is the process of converting text into smaller units

def tokenise_text(text):
  return nltk.word_tokenize(text)

In [None]:
# create a function to remove the stop words
def remove_stopwords(list_of_words):
  return ''.join([word for word in list_of_words if word not in stopwords.words('english')])

In [None]:
# create a function to do the lemmatising
# lemmatising is the process of convertin the words into its base form

def lemmatise_text(list_of_words):
  lemmatizer=WordNetLemmatizer()
  return [lemmatizer.lemmatize(word) for word in list_of_words]

In [None]:
# combine all the functions to a single function

def preprocess(df_col):
  corpus = []
  for item in df_col:
    text=item.lower() # convert it into lower case
    text = re.sub(r'<.*?>', '', text) # remove html tags, substitute with null
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation and special characters  @#;substitute with null
    new_item = remove_punctuations(text)
    new_items=tokenise_text(new_item)
    new_item=remove_stopwords(new_items)
    new_items=lemmatise_text(new_items)
    corpus.append(' '.join(str(x) for x in new_items))
  return corpus

In [None]:
# lets preprocess the content

corpus = preprocess(data['content'])

In [87]:
corpus

# 3. Feature Extraction

In [None]:
# ML algorithm cannot process text kind of data, so we need to convert it into number representation
# so we are using count vectoriser for vectorisation of the text

from sklearn.feature_extraction.text import CountVectorizer
# create the object of CountVectorizer
cv=CountVectorizer()
# fit the data
X=cv.fit_transform(corpus) # train X data

# 4. Model Building and Evaluation


## 4.1 Using traditional machine learning models

In [None]:
# first conevrt teh target column into numeric column
# we will label encode the target column

from sklearn.preprocessing import LabelEncoder
# create encoder obj
enc=LabelEncoder()
# fit and transform the target column
y=enc.fit_transform(data['sentiment'])

print(y)

[ 2 10 10 ...  7  5  7]


In [None]:
# print the class label with calss name for target column
print(enc.inverse_transform([0,1,2,3,4,5,6,7,8,9,10,11,12]))

['anger' 'boredom' 'empty' 'enthusiasm' 'fun' 'happiness' 'hate' 'love'
 'neutral' 'relief' 'sadness' 'surprise' 'worry']


In [None]:
# we are using Random Forst classfier as traditional model

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# split the dataset into train & test set
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Train the classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)

# Evaluate on the train set
y_pred = clf.predict(x_test)

# lets print the train accuracy
accuracy_score(y_test,y_pred)


0.329625

## 4.2 Using Deep Learning model-LSTM

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# read the dataset
data=pd.read_csv('/content/drive/MyDrive/tweet_emotions.csv')

In [None]:
# down load punkt, stop words, wordnet
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# we are uisng LSTM model as Deep learning model
# LSTM stands for long short term memory
# this model is bale to memorise the long sequences

# function to pre process the text
def preprocess_text(text):
  text=text.lower() # convert it into lower case
  text = re.sub(r'<.*?>', '', text) # remove html tags, substitute with null
  text = re.sub(r'[^\w\s]', '', text) # Remove punctuation and special characters  @#;substitute with null
  words=word_tokenize(text)
  words=[i for i in words if i not in stopwords.words('english')]# remove stopwords
  lemmatise_obj=WordNetLemmatizer()
  words=[lemmatise_obj.lemmatize(i) for i in words]# lemmatise words, convert it to its stem form
  text= " ".join(words)
  text = re.sub(r'\s+', ' ', text).strip()#remove extra spaces
  return text

In [None]:
# prepare the cleaned text
data['cleaned_text']=data['content'].apply(lambda x: preprocess_text(x))

In [None]:
# Max number of words to tokenize
max_words = 1000
# instantiate tokenizer, OOv stands out of vocabulary
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
# fit the text data
tokenizer.fit_on_texts(data['cleaned_text'])
word_index = tokenizer.word_index # get teh word index

print(word_index)



In [None]:
len(word_index)

47504

In [None]:
# convert the text into sequence of indices

sequences=tokenizer.texts_to_sequences(data['cleaned_text'])
print(sequences)

[[1, 19, 1, 59, 1, 686, 474, 974, 364], [1, 208, 82, 301, 1, 165], [1, 1, 139], [25, 551, 61, 100], [1, 25, 1, 156, 1, 372, 17], [1, 1, 57, 7, 706, 707, 1, 57, 6, 61], [71, 2, 269, 145, 61, 25, 202, 1, 160, 33, 25, 1], [858], [1, 1, 9, 38], [1, 2, 58, 238, 139], [16, 605, 496], [1, 1], [197, 790, 281, 141, 5, 80, 1], [1, 12, 129, 1, 614, 12, 1, 1, 51, 51, 230, 129, 1], [8, 394], [1, 1, 216], [1, 1], [801, 90, 230, 453], [1, 445, 1, 1, 1, 267, 1, 19], [1, 112, 262, 1, 213, 1, 27, 190, 128, 61], [1, 34, 59, 43, 5, 65, 72, 71, 859, 1], [764, 2, 422, 1, 29, 1, 1, 1, 1, 1, 160, 1, 1, 337], [1, 1, 236, 1, 1, 1, 1, 373, 687, 1, 1], [619, 130, 10, 19, 138, 1, 55, 274], [120, 27, 2, 487, 14, 5, 834, 1, 14, 82, 1], [70, 30, 208, 818, 323, 1, 104, 624, 1, 594, 649, 388, 466, 1, 39, 145], [1, 2, 58, 77, 1, 1, 5, 1, 19, 65, 1], [1, 1, 35, 37, 343, 1, 819], [1, 1, 319, 719, 120, 197, 79, 83, 15, 71], [79, 1, 79, 79, 79], [895, 150, 1, 359, 802, 26], [1, 16, 71], [166, 1, 649], [39, 1, 1, 25, 7], [1,

In [None]:
# one problem with text data is its varying length
# so we need to make the input shape same
# for that we are using padding

# for that first we need to get the max length of the sequences

max_len=max([len(seq) for seq in sequences])
print(max_len)


25


In [None]:
# lets convert the sequence into padded sequences
# we are using post padding
# means zero will added at the end
padded_sequences=pad_sequences(sequences,maxlen=max_len,padding='post')
print(padded_sequences)

[[  1  19   1 ...   0   0   0]
 [  1 208  82 ...   0   0   0]
 [  1   1 139 ...   0   0   0]
 ...
 [ 18  23   3 ...   0   0   0]
 [  1   1 284 ...   0   0   0]
 [  1   1 739 ...   0   0   0]]


In [None]:
data.columns

Index(['tweet_id', 'sentiment', 'content', 'cleaned_text'], dtype='object')

In [None]:
# lets encode the target column
# before using to_categorical, we need to encode it numeric value

encoder_obj=LabelEncoder()
# convert to numerica value
y_numeric=encoder_obj.fit_transform(data['sentiment'])
# convert to binary columns
y = to_categorical(y_numeric)

In [None]:
# split the data into train & test set
# 20 % of the data will be used for text set
x_train,x_test,y_train,y_test=train_test_split(padded_sequences,y,test_size=0.2,random_state=42)

In [None]:
# calculate the data vocabulary size
# word index start with 1, thats y we are adding 1
vocab_size=len(tokenizer.word_index)+1

print(vocab_size)

47505


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D,Dropout # ID dropout layer
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.optimizers import Adam

In [70]:
# create LSTM model
model = Sequential()
# embedding layer gives the vectors of dimension 100
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len))
# adding dropout layer
model.add(SpatialDropout1D(0.2))
# add LSTM layer
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# add dense layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))  # adding a dropout layer
model.add(Dense(64, activation='relu'))  # adding another dense layer
# output layer
# in this problem, we have 13 classes, which is why we have 13 nodes at the output layer
# we are using the activation function 'softmax' since it's a multiclass classification problem
model.add(Dense(13, activation='softmax'))

model.build(input_shape=(None, max_len))
# compile the model
# loss function is categorical_crossentropy since it is a multi-class classification problem
# using Adam optimizer with a learning rate of 0.001
# Accuracy metric is used for evaluating the model's performance
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [71]:
model.summary()

Note: There are 4,852,929 trainable parameters.

In [72]:
from tensorflow.keras.callbacks import EarlyStopping

# epochs means how many times training will happen
# batch size means no inputs process at teh same time
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True) # Stop training when validation loss stops improving
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test), verbose=2, callbacks=[early_stopping])

Epoch 1/10
1000/1000 - 46s - 46ms/step - accuracy: 0.2131 - loss: 2.1695 - val_accuracy: 0.2175 - val_loss: 2.1471
Epoch 2/10
1000/1000 - 82s - 82ms/step - accuracy: 0.2546 - loss: 2.0724 - val_accuracy: 0.3039 - val_loss: 2.0182
Epoch 3/10
1000/1000 - 84s - 84ms/step - accuracy: 0.3155 - loss: 1.9916 - val_accuracy: 0.3033 - val_loss: 1.9954
Epoch 4/10
1000/1000 - 37s - 37ms/step - accuracy: 0.3374 - loss: 1.9514 - val_accuracy: 0.3288 - val_loss: 1.9522
Epoch 5/10
1000/1000 - 41s - 41ms/step - accuracy: 0.3450 - loss: 1.9304 - val_accuracy: 0.3330 - val_loss: 1.9418
Epoch 6/10
1000/1000 - 43s - 43ms/step - accuracy: 0.3541 - loss: 1.9083 - val_accuracy: 0.3377 - val_loss: 1.9326
Epoch 7/10
1000/1000 - 36s - 36ms/step - accuracy: 0.3598 - loss: 1.8922 - val_accuracy: 0.3392 - val_loss: 1.9270
Epoch 8/10
1000/1000 - 44s - 44ms/step - accuracy: 0.3646 - loss: 1.8744 - val_accuracy: 0.3411 - val_loss: 1.9189
Epoch 9/10
1000/1000 - 39s - 39ms/step - accuracy: 0.3727 - loss: 1.8581 - val_a

In [73]:
# after 10 epochs also model performace is low. so lets train the model for 10 more epochs starting from 11th epoch
history = model.fit(x_train, y_train,initial_epoch=11,epochs=20, batch_size=32, validation_data=(x_test, y_test), verbose=2)


Epoch 12/20
1000/1000 - 44s - 44ms/step - accuracy: 0.3680 - loss: 1.8588 - val_accuracy: 0.3438 - val_loss: 1.9218
Epoch 13/20
1000/1000 - 80s - 80ms/step - accuracy: 0.3751 - loss: 1.8467 - val_accuracy: 0.3425 - val_loss: 1.9250
Epoch 14/20
1000/1000 - 83s - 83ms/step - accuracy: 0.3796 - loss: 1.8356 - val_accuracy: 0.3409 - val_loss: 1.9361
Epoch 15/20
1000/1000 - 80s - 80ms/step - accuracy: 0.3817 - loss: 1.8236 - val_accuracy: 0.3435 - val_loss: 1.9247
Epoch 16/20
1000/1000 - 38s - 38ms/step - accuracy: 0.3862 - loss: 1.8093 - val_accuracy: 0.3404 - val_loss: 1.9349
Epoch 17/20
1000/1000 - 40s - 40ms/step - accuracy: 0.3886 - loss: 1.8020 - val_accuracy: 0.3385 - val_loss: 1.9565
Epoch 18/20
1000/1000 - 53s - 53ms/step - accuracy: 0.3956 - loss: 1.7909 - val_accuracy: 0.3394 - val_loss: 1.9473
Epoch 19/20
1000/1000 - 37s - 37ms/step - accuracy: 0.3997 - loss: 1.7811 - val_accuracy: 0.3375 - val_loss: 1.9592
Epoch 20/20
1000/1000 - 43s - 43ms/step - accuracy: 0.4050 - loss: 1.769

In [74]:
# testing on un seen data
x_sample='I love you'
x_sample=preprocess_text(x_sample)
x_sample=tokenizer.texts_to_sequences([x_sample])
x_sample=pad_sequences(x_sample,maxlen=max_len,padding='post')
y_pred = (model.predict(x_sample) > 0.5).astype('int32')

# Get the index of the predicted class, to get the class no
predicted_index = y_pred.argmax(axis=-1)[0]
# Inverse transform to get the class name
predicted_class = encoder_obj.inverse_transform([predicted_index])[0]
print(f"Predicted class: {predicted_class}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step
Predicted class: love


In [76]:
# testing on un seen data
x_sample='I want to kill you'
x_sample=preprocess_text(x_sample)
x_sample=tokenizer.texts_to_sequences([x_sample])
x_sample=pad_sequences(x_sample,maxlen=max_len,padding='post')
y_pred = (model.predict(x_sample) > 0.5).astype('int32')
# Get the index of the predicted class, to get the class no
predicted_index = y_pred.argmax(axis=-1)[0]
# Inverse transform to get the class name
predicted_class = encoder_obj.inverse_transform([predicted_index])[0]
print(f"Predicted class: {predicted_class}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Predicted class: anger


## 4.3 Using GRU model

In [84]:
# Create a GRU model
from tensorflow.keras.layers import GRU

model6=Sequential()
model6.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len))
model6.add(SpatialDropout1D(0.2))
model6.add(GRU(100, dropout=0.2, recurrent_dropout=0.2))
model6.add(Dense(128, activation='relu'))
model6.add(Dropout(0.5))  # adding a dropout layer
model6.add(Dense(64, activation='relu'))  # adding another dense layer
# output layer
# in this problem, we have 13 classes, which is why we have 13 nodes at the output layer
# we are using the activation function 'softmax' since it's a multiclass classification problem
model6.add(Dense(13, activation='softmax'))

model6.build(input_shape=(None, max_len))
# compile the model
# loss function is categorical_crossentropy since it is a multi-class classification problem
# using Adam optimizer with a learning rate of 0.001
# Accuracy metric is used for evaluating the model's performance
model6.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

model6.summary()



Note: A total of 4,833,129 trainable parameters.

In [85]:
history_6=model6.fit(x_train, y_train,epochs=20, batch_size=32, validation_data=(x_test, y_test), verbose=2)

Epoch 1/20
1000/1000 - 50s - 50ms/step - accuracy: 0.2144 - loss: 2.1689 - val_accuracy: 0.2083 - val_loss: 2.1521
Epoch 2/20
1000/1000 - 74s - 74ms/step - accuracy: 0.2153 - loss: 2.1523 - val_accuracy: 0.2175 - val_loss: 2.1548
Epoch 3/20
1000/1000 - 44s - 44ms/step - accuracy: 0.2202 - loss: 2.1315 - val_accuracy: 0.2175 - val_loss: 2.1518
Epoch 4/20
1000/1000 - 79s - 79ms/step - accuracy: 0.2807 - loss: 2.0347 - val_accuracy: 0.2175 - val_loss: 2.1772
Epoch 5/20
1000/1000 - 41s - 41ms/step - accuracy: 0.3131 - loss: 1.9769 - val_accuracy: 0.2175 - val_loss: 2.1632
Epoch 6/20
1000/1000 - 81s - 81ms/step - accuracy: 0.3290 - loss: 1.9495 - val_accuracy: 0.2176 - val_loss: 2.1675
Epoch 7/20
1000/1000 - 42s - 42ms/step - accuracy: 0.3377 - loss: 1.9322 - val_accuracy: 0.2189 - val_loss: 2.1762
Epoch 8/20
1000/1000 - 39s - 39ms/step - accuracy: 0.3439 - loss: 1.9201 - val_accuracy: 0.2176 - val_loss: 2.1710
Epoch 9/20
1000/1000 - 41s - 41ms/step - accuracy: 0.3497 - loss: 1.9089 - val_a

In [87]:
# testing on un seen data
x_sample='I want to kill you'
x_sample=preprocess_text(x_sample)
x_sample=tokenizer.texts_to_sequences([x_sample])
x_sample=pad_sequences(x_sample,maxlen=max_len,padding='post')
y_pred = (model6.predict(x_sample) > 0.5).astype('int32')
# Get the index of the predicted class, to get the class no
predicted_index = y_pred.argmax(axis=-1)[0]
# Inverse transform to get the class name
predicted_class = encoder_obj.inverse_transform([predicted_index])[0]
print(f"Predicted class: {predicted_class}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Predicted class: anger


# 5. comaparison of traditional & Deep learning model

The Deep learning model accuracy is more compared to traditional model.

1. Traditional model accuracy= 0.32 </br>

2. Deep learning model LSTM accuracy=0.40</br>

3. Deep learning model GRU accuracy=0.37</br>

LSTM got better accuracy compared to the rest. GRU overfits the data, because training accuracy very high compared to test accuracy.

Traing process is slow for deep learning compared to traditional model.