In [2]:
#IMPORTING THE LIBRARIES
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import regularizers
#for deep learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM, Dense,Dropout,Bidirectional
#for traditional ml
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report

In [3]:
#LOADING THE DATA

In [4]:
data=pd.read_csv('/content/tweet_emotions.csv')

In [5]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [6]:
#PREPROCESSING

In [7]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [9]:
#removing the tweet id column
data=data.drop('tweet_id',axis=1)

In [10]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [11]:
data.describe()

Unnamed: 0,sentiment,content
count,40000,40000
unique,13,39827
top,neutral,I just received a mothers day card from my lov...
freq,8638,14


In [12]:
#initializing the tools
lemmatizer = WordNetLemmatizer()

In [13]:
#finding the uniqure characters in code
unique_characters = set(''.join(data['content']))
print(unique_characters)

{'\\', 'W', "'", '^', '4', 'P', '}', 'u', '#', 'M', '(', 'X', 'i', 'V', 'c', 'H', 'R', '{', 'r', 'w', 't', '=', '?', ',', '6', '\xa0', 'f', ' ', '½', 'N', '/', '_', 'Â', ';', 'g', 'ï', 'v', '`', 'a', 'F', 'Z', ')', '\t', 'k', '0', '7', 'E', '´', 'J', 'l', 'p', 'O', 'z', '¿', 'S', '@', 'n', 'U', 'T', 'I', '2', 'Y', 'D', 'y', 's', '8', '-', 'G', '$', 'C', 'x', '!', 'b', ']', '9', '+', '&', 'j', 'd', 'h', '.', '*', '|', 'L', ':', 'q', 'B', '3', 'Q', '1', '%', '[', '¡', 'K', '~', 'o', 'e', 'm', '5', 'A'}


In [14]:
#defining a preprocessing function
def preprocess_tweets(content):
  #converting into lowercase
  content = content.lower()
  #removing the special characters
  content = re.sub(r'[@#]\w+', '', content)
  content = re.sub(r'[^a-z\s]', '', content)
  content = re.sub(r'\s+', ' ', content).strip()
  #tokenizing the word
  tokens = word_tokenize(content)
  #stop word removal
  tokens = [word for word in tokens if word not in stopwords.words('english')]
  # Lemmatizing the content
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
  return " ".join(tokens)


In [15]:
data['cleaned_content'] = data['content'].apply(preprocess_tweets)

In [16]:
data.isna().sum()#no missing values

Unnamed: 0,0
sentiment,0
content,0
cleaned_content,0


In [17]:
data.head()

Unnamed: 0,sentiment,content,cleaned_content
0,empty,@tiffanylue i know i was listenin to bad habi...,know listenin bad habit earlier started freaki...
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhhwaitin call
2,sadness,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday
3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon
4,neutral,@dannycastillo We want to trade with someone w...,want trade someone houston ticket one


In [18]:
data['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [19]:
#removing the null value by considering the 'empty' as null value
if 'sentiment' in data.columns:
    mode = data.loc[data['sentiment'] != 'empty', 'sentiment'].mode()[0]
    data['sentiment'].replace('empty', mode, inplace=True)
print(data.head())

    sentiment                                            content  \
0     neutral  @tiffanylue i know  i was listenin to bad habi...   
1     sadness  Layin n bed with a headache  ughhhh...waitin o...   
2     sadness                Funeral ceremony...gloomy friday...   
3  enthusiasm               wants to hang out with friends SOON!   
4     neutral  @dannycastillo We want to trade with someone w...   

                                     cleaned_content  
0  know listenin bad habit earlier started freaki...  
1             layin n bed headache ughhhhwaitin call  
2                      funeral ceremonygloomy friday  
3                              want hang friend soon  
4              want trade someone houston ticket one  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sentiment'].replace('empty', mode, inplace=True)


In [20]:
#encoding the sentiment labels
#using label encoding as there is multiple sentiments
label_encoder = LabelEncoder()
data['sentiment_encoded'] = label_encoder.fit_transform(data['sentiment'])

In [21]:
data.head()

Unnamed: 0,sentiment,content,cleaned_content,sentiment_encoded
0,neutral,@tiffanylue i know i was listenin to bad habi...,know listenin bad habit earlier started freaki...,7
1,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhhwaitin call,9
2,sadness,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday,9
3,enthusiasm,wants to hang out with friends SOON!,want hang friend soon,2
4,neutral,@dannycastillo We want to trade with someone w...,want trade someone houston ticket one,7


#FEATURE EXTRACTION

In [22]:
#feature extraction for traditional ml models
vectorizer = TfidfVectorizer()
x_vectorized=vectorizer.fit_transform(data['cleaned_content']).toarray()

In [23]:
#feature extraction for deep learning models
tokenizer = Tokenizer(num_words = 10000 , oov_token='<OOV>')
tokenizer.fit_on_texts(data['cleaned_content'])
sequences = tokenizer.texts_to_sequences(data['cleaned_content'])
padded_seq = pad_sequences(sequences,maxlen = 50, padding='post')


In [24]:
x= padded_seq
y=tf.keras.utils.to_categorical(data['sentiment_encoded'],num_classes=12)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [25]:
#splitting the dataset
x_train,x_test,y_train,y_test=train_test_split(x, y,test_size=0.2,random_state=42)

#DEFINING THE LSTM MODEL


In [26]:
model = tf.keras.Sequential([
    Embedding(input_dim=10000, output_dim=128),  # Embedding layer without regularization
    Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=regularizers.l2(0.001))),  # L2 regularization on LSTM layer
    Dropout(0.2),
    Bidirectional(LSTM(64, kernel_regularizer=regularizers.l2(0.001))),  # L2 regularization on second LSTM layer
    Dropout(0.2),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),  # L2 regularization on Dense layer
    Dropout(0.2),
    Dense(12, activation='softmax')  # Output layer with no regularization
])

In [27]:
#compiling
model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])


In [31]:
history = model.fit(x_train,y_train,epochs=1,validation_data=(x_test,y_test))

[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 325ms/step - accuracy: 0.2491 - loss: 2.3177 - val_accuracy: 0.3122 - val_loss: 1.9531


In [1]:
#splitting the data for traditional model
X_train,X_test,y_train,y_test=train_test_split(x_vectorized,data['sentiment_encoded'],test_size=0.2,random_state=42)

NameError: name 'train_test_split' is not defined

#TRADITIONAL MODEL

In [None]:
# Random Forest Classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))

In [None]:
# support vector machine
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, svm_predictions))


#EVALUATION OF THE MODELS