# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

# Loading the dataset

In [2]:
# Read the tweet/emotion CSV from the working directory and preview the first rows.
csv_path = 'tweet_emotions.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
data.shape

(40000, 3)

# Data Preprocessing

In [4]:
# First looking for null values

data.isna().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [6]:
# Clean the tweets with gensim's simple_preprocess, which turns each raw tweet
# string into a list of tokens (see the preview in the next cell).

import gensim


def _tokenize_once(text):
    """Tokenize a raw tweet string; pass anything that is not a string through.

    The result overwrites data['content'], so running this cell a second time
    would otherwise hand token lists back to simple_preprocess, which expects
    text. The isinstance guard makes the cell safe to re-run.
    """
    if isinstance(text, str):
        return gensim.utils.simple_preprocess(text)
    return text


data['content'] = data['content'].apply(_tokenize_once)

In [7]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,"[tiffanylue, know, was, listenin, to, bad, hab..."
1,1956967666,sadness,"[layin, bed, with, headache, ughhhh, waitin, o..."
2,1956967696,sadness,"[funeral, ceremony, gloomy, friday]"
3,1956967789,enthusiasm,"[wants, to, hang, out, with, friends, soon]"
4,1956968416,neutral,"[dannycastillo, we, want, to, trade, with, som..."


In [8]:
data.tweet_id.nunique()

40000

In [9]:
data.sentiment.unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [10]:
data.sentiment.value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [11]:
# encoding the sentiment column

def custom_encoder(df):
    """Collapse the fine-grained emotion labels into three sentiment codes.

    Mapping:
        2 <- happiness, love, surprise, fun, relief, enthusiasm
        1 <- neutral, empty
        0 <- worry, sadness, hate, boredom, anger

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Object holding the emotion labels as strings.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        A new object of the same type with every known label replaced by its
        code. Values that are not in the mapping are left unchanged. The
        input is never modified.
    """
    label_groups = {
        2: ['happiness', 'love', 'surprise', 'fun', 'relief', 'enthusiasm'],
        1: ['neutral', 'empty'],
        0: ['worry', 'sadness', 'hate', 'boredom', 'anger'],
    }
    label_to_code = {
        label: code
        for code, labels in label_groups.items()
        for label in labels
    }
    # A single non-inplace replace avoids mutating the caller's object (the
    # argument is typically a column pulled out of a DataFrame).
    # infer_objects() yields an integer dtype once every label has been mapped.
    return df.replace(label_to_code).infer_objects()

In [12]:
# Collapse the emotion labels into the three sentiment codes (0, 1, 2).
data['sentiment'] = custom_encoder(data['sentiment'])

# custom_encoder leaves labels it does not know unchanged, which would leave
# strings mixed with ints in the target column. Fail here, next to the cause,
# rather than later at model fitting.
assert data['sentiment'].isin([0, 1, 2]).all(), "unmapped sentiment labels remain"

In [13]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,1,"[tiffanylue, know, was, listenin, to, bad, hab..."
1,1956967666,0,"[layin, bed, with, headache, ughhhh, waitin, o..."
2,1956967696,0,"[funeral, ceremony, gloomy, friday]"
3,1956967789,2,"[wants, to, hang, out, with, friends, soon]"
4,1956968416,1,"[dannycastillo, we, want, to, trade, with, som..."


In [17]:
data.sentiment.value_counts()

sentiment
2    15299
0    15236
1     9465
Name: count, dtype: int64

In [15]:
# Label encoding: 0 is negative sentiment,
# 2 is positive sentiment, and
# 1 is neither positive nor negative (neutral / empty).

# Feature Extraction

In [18]:
# Term Frequency-Inverse Document Frequency (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# `content` holds a list of tokens per tweet after preprocessing. Join each
# list back into a space-separated document instead of vectorizing the Python
# repr of the list ("['a', 'b']"), which only works because the default token
# pattern happens to skip the brackets, quotes and commas. Rows that are still
# plain strings are passed through unchanged.
documents = data.content.map(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

tfidf = TfidfVectorizer()
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so the vocabulary and IDF weights see the test tweets.
# Fit on the training rows only for a leak-free accuracy estimate.
x = tfidf.fit_transform(documents)

y = data.sentiment

In [19]:
x

<40000x45779 sparse matrix of type '<class 'numpy.float64'>'
	with 473100 stored elements in Compressed Sparse Row format>

In [20]:
y

0        1
1        0
2        0
3        2
4        1
        ..
39995    1
39996    2
39997    2
39998    2
39999    2
Name: sentiment, Length: 40000, dtype: int64

In [21]:
x.shape

(40000, 45779)

In [22]:
y.shape

(40000,)

In [23]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Model building

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Fit a 100-tree random forest on the TF-IDF training matrix (seeded for reproducibility).
rfc = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rfc.fit(x_train, y_train)

# Evaluation

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
# Predict the held-out rows and report the fraction classified correctly.
y_pred = rfc.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy = ", accuracy)

Accuracy =  0.58


# Another model

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Fit a Keras tokenizer (num_words=5000) on the preprocessed tweets, then
# encode every tweet as a sequence of integer word indices.
tweet_tokens = data['content']
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet_tokens)
sequences = tokenizer.texts_to_sequences(tweet_tokens)

In [29]:
# Bring every sequence to a fixed length of `maxlen`. The padding and
# truncating sides are spelled out; 'pre' is the library default for both.
maxlen = 100
X_seq = pad_sequences(
    sequences,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)

In [30]:
# Convert labels to categorical (one-hot) format, one column per sentiment code.
# Request float32 explicitly: pandas >= 2.0 makes get_dummies return bool
# columns by default, and a numeric target is the conventional input for
# categorical_crossentropy.
y_cat = pd.get_dummies(data['sentiment'], dtype='float32').to_numpy()

In [31]:
# Hold out 20% of the padded sequences and their one-hot labels, using the
# same test_size and seed as the TF-IDF split above.
X_train_seq, X_test_seq, y_train_cat, y_test_cat = train_test_split(
    X_seq,
    y_cat,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Build the LSTM model
# Size the embedding table from the fitted tokenizer instead of repeating the
# literal 5000, so changing Tokenizer(num_words=...) cannot leave the
# embedding too small for the indices it receives. If num_words is unset,
# fall back to the full vocabulary (+1 because index 0 is the padding value).
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=64, activation='relu'),
    Dropout(0.2),
    # One softmax unit per one-hot label column.
    Dense(units=y_cat.shape[1], activation='softmax'),
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 5 epochs with mini-batches of 64.
# NOTE(review): the held-out test split doubles as validation data here, so
# the val_* metrics are not independent of the final evaluation cell.
model.fit(
    X_train_seq,
    y_train_cat,
    epochs=5,
    batch_size=64,
    validation_data=(X_test_seq, y_test_cat),
)

Epoch 1/5
500/500 ━━━━━━━━━━━━━━━━━━━━ 60s 114ms/step - accuracy: 0.4975 - loss: 1.0017 - val_accuracy: 0.5901 - val_loss: 0.9052
Epoch 2/5
500/500 ━━━━━━━━━━━━━━━━━━━━ 59s 118ms/step - accuracy: 0.6344 - loss: 0.8372 - val_accuracy: 0.5979 - val_loss: 0.8952
Epoch 3/5
500/500 ━━━━━━━━━━━━━━━━━━━━ 59s 118ms/step - accuracy: 0.6685 - loss: 0.7836 - val_accuracy: 0.5895 - val_loss: 0.9085
Epoch 4/5
500/500 ━━━━━━━━━━━━━━━━━━━━ 60s 119ms/step - accuracy: 0.6904 - loss: 0.7302 - val_accuracy: 0.5865 - val_loss: 0.9472
Epoch 5/5
480/500 ━━━━━━━━━━━━━━━━━━━━ 2s 109ms/step - accuracy: 0.7100 - loss: 0.6779

In [None]:
# Score the trained LSTM on the held-out split. With metrics=['accuracy'] at
# compile time, evaluate returns the loss followed by the accuracy.
# NOTE: this rebinds `accuracy`, which the random-forest evaluation cell also assigned.
loss, accuracy = model.evaluate(x=X_test_seq, y=y_test_cat)
print("LSTM Model Accuracy:", accuracy)