**Importing necessary libraries**

In [25]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from keras.layers import Dense, LSTM, Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer





# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#Reading the dataset
df=pd.read_csv('/content/tweet_emotions.csv')

In [3]:
#Top 5 rows of the dataset
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
#Checking information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [5]:
#Checking for null values
df.isnull().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

**Preprocessing**

In [6]:
#To lower case
df['processed_content'] = df['content'].str.lower()

In [7]:
#Removing URLs
df['processed_content'] = df['processed_content'].apply(lambda text: re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE))


In [8]:
#Removing special characters
df['processed_content'] = df['processed_content'].apply(lambda text: re.sub(r'[^A-Za-z0-9\s]', '', text))
df['processed_content'] = df['processed_content'].apply(lambda text: re.sub(r'\d', '', text))


In [9]:
#Removing numbers
df['processed_content'] = df['processed_content'].apply(lambda text: re.sub(r'\d', '', text))


In [10]:
# Tokenization
df['processed_content'] = df['processed_content'].apply(lambda text: word_tokenize(text))

In [11]:
# Removing stopwords
stop_words = set(stopwords.words('english'))
df['processed_content'] = df['processed_content'].apply(lambda tokens: [word for word in tokens if word not in stop_words])


In [12]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
df['processed_content'] = df['processed_content'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

In [13]:
# Processed text or content
df['processed_content'] = df['processed_content'].apply(lambda tokens: ' '.join(tokens))


In [14]:
df.head()

Unnamed: 0,tweet_id,sentiment,content,processed_content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhhwaitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,want hang friend soon
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,dannycastillo want trade someone houston ticke...


In [19]:
#Split data into training and test sets
X = df['processed_content']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
#TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

**Modelling**

**SVM Model**

In [24]:

from sklearn.svm import SVC
#Initializing SVM Model
svm_model = SVC(kernel='linear', random_state=42)

#Training the model
svm_model.fit(X_train_tfidf, y_train)

#Predicting
y_pred_svm = svm_model.predict(X_test_tfidf)

#Evaluating performance
print("SVM Model:")
print(classification_report(y_test, y_pred_svm))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))


SVM Model:


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.11      0.01      0.03       338
   happiness       0.32      0.39      0.35      1028
        hate       0.45      0.20      0.28       268
        love       0.50      0.39      0.44       762
     neutral       0.34      0.56      0.42      1740
      relief       0.32      0.02      0.04       352
     sadness       0.37      0.24      0.29      1046
    surprise       0.37      0.04      0.08       425
       worry       0.34      0.47      0.39      1666

    accuracy                           0.35      8000
   macro avg       0.24      0.18      0.18      8000
weighted avg       0.33      0.35      0.31      8000

Accuracy: 0.348875


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**LSTM Model**

In [130]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=100))
model.add(LSTM(units=128))
model.add(Dense(units=64, activation='relu'))

model.add(Dense(units=len(label_dict), activation='softmax'))


model.summary()

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_20 (Embedding)    (None, None, 100)         4738700   
                                                                 
 lstm_30 (LSTM)              (None, 128)               117248    
                                                                 
 dense_36 (Dense)            (None, 64)                8256      
                                                                 
 dense_37 (Dense)            (None, 13)                845       
                                                                 
Total params: 4865049 (18.56 MB)
Trainable params: 4865049 (18.56 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [131]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [135]:


model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_test, y_test), verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x79b43b032590>

**RESULT**
The accuracy of the SVM Model is 0.34 and average accuracy of LSTM is 0.27. The accuracy of both the models are less.