# **Begin**

In [None]:
import kagglehub

# Fetch the most recent published version of the dataset through kagglehub
path = kagglehub.dataset_download(
    "praveengovi/emotions-dataset-for-nlp"
)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/praveengovi/emotions-dataset-for-nlp/versions/1


In [None]:
import pandas as pd

# Both splits use the same ';'-separated layout; the column names are supplied
# explicitly through `names`.
train_file, val_file = (
    pd.read_csv(f"{path}/{split}.txt", sep=';', names=['text', 'emotion'])
    for split in ('train', 'val')
)

train_file.head(5)

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [None]:
# Inspect how many training samples each emotion label has (class balance)
train_file['emotion'].value_counts()

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
joy,5362
sadness,4666
anger,2159
fear,1937
love,1304
surprise,572


In [None]:
# Drop the 'love' and 'surprise' classes from the training split.
# `isin` matches whole labels exactly; `str.contains` does substring/regex
# matching and produces NaN for missing labels, which cannot be used as a
# boolean mask.
train_file = train_file[~train_file['emotion'].isin(['love', 'surprise'])]

In [None]:
# Drop the 'love' and 'surprise' classes from the validation split.
# `isin` matches whole labels exactly; `str.contains` does substring/regex
# matching and produces NaN for missing labels, which cannot be used as a
# boolean mask.
val_file = val_file[~val_file['emotion'].isin(['love', 'surprise'])]

In [None]:
# Re-check the label distribution after the 'love' and 'surprise' rows were removed
train_file['emotion'].value_counts()

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
joy,5362
sadness,4666
anger,2159
fear,1937


# **Preprocessing**

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from textblob import TextBlob

# Fetch the NLTK resources used by this notebook, then load the English stop words
for _corpus in ('wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(_corpus)

stop_words = stopwords.words('english')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

class PreProcessor:
    """Configurable text-cleaning and vectorization pipeline.

    `process_text` cleans a single string word by word. `process_dataset`
    cleans every text and then applies at most one vectorization strategy,
    checked in this order: `tokenizing`, `tfIdf`, `countVectorizing`.
    Vectorizers are fitted in ``mode='train'`` and reused in ``mode='val'``.

    Args:
        delete_not_letters: strip every character outside ``[a-zA-Z]`` from
            each word and drop words that become empty.
        lower: lowercase each word.
        delete_stop_word: drop NLTK English stop words.
        lemmatization: lemmatize words with ``WordNetLemmatizer``.
        stemming: stem words with ``PorterStemmer`` (mutually exclusive with
            ``lemmatization``).
        tokenizing: return padded integer sequences (Keras ``Tokenizer``).
        tfIdf: return a TF-IDF matrix (``TfidfVectorizer``).
        countVectorizing: return a count matrix (``CountVectorizer``).
        num_words: ``num_words`` passed to the Keras ``Tokenizer``.
        max_features: ``max_features`` passed to the scikit-learn vectorizers.

    Raises:
        AssertionError: if both ``lemmatization`` and ``stemming`` are set.
    """

    def __init__(self, delete_not_letters=False, lower=False, delete_stop_word=False,
                 lemmatization=False, stemming=False, tokenizing = False, tfIdf = False, countVectorizing = False,
                 num_words=1000, max_features=3000):
        assert not (lemmatization and stemming), 'You must choose either lemmatization or stemming'

        # Cleaning switches
        self.lower = lower
        self.delete_not_letters = delete_not_letters
        self.delete_stop_word = delete_stop_word
        self.lemmatization = lemmatization
        self.stemming = stemming

        # Vectorization switches and their size limits
        self.tokenizing = tokenizing
        self.tfIdf = tfIdf
        self.countVectorizing = countVectorizing
        self.num_words = num_words
        self.max_features = max_features

        # Helpers are only built when the matching switch is on
        self.stop_words = set(stopwords.words('english')) if delete_stop_word else None
        self.lemmatizer = WordNetLemmatizer() if lemmatization else None
        self.stemmer = PorterStemmer() if stemming else None

        # State filled in by process_dataset(mode='train')
        self.tokenizer = None
        self.tf_idf_vectorizer = None
        self.count_vectorizer = None
        self.max_length = 0

    def process_text(self, text):
        """Clean one whitespace-separated string and return the cleaned string.

        Steps per word, in order: lowercase, stop-word removal, non-letter
        stripping, then lemmatization or stemming.
        """
        kept_words = []
        for word in text.split():
            if self.lower:
                word = word.lower()
            # NOTE: the stop-word test runs before non-letter stripping, so a
            # token such as "the," is not recognised as a stop word.
            if self.delete_stop_word and word in self.stop_words:
                continue
            if self.delete_not_letters:
                word = re.sub(r'[^a-zA-Z]', '', word)
                if not word:
                    continue
            if self.lemmatization:
                word = self.lemmatizer.lemmatize(word)
            elif self.stemming:
                word = self.stemmer.stem(word)

            kept_words.append(word)

        return ' '.join(kept_words)

    def _is_fitted(self):
        """Return True when the vectorizer selected by the switches has been fitted."""
        if self.tokenizing:
            return self.tokenizer is not None
        if self.tfIdf:
            return self.tf_idf_vectorizer is not None
        if self.countVectorizing:
            return self.count_vectorizer is not None
        return True  # no vectorizer selected, nothing to fit

    def _to_padded_sequences(self, texts, fitting):
        """Convert cleaned texts to a left-padded int32 tensor of token ids."""
        if fitting:
            self.tokenizer = Tokenizer(num_words = self.num_words)
            self.tokenizer.fit_on_texts(texts)

        sequences = self.tokenizer.texts_to_sequences(texts)

        if fitting:
            # The padded length is fixed by the longest training sequence
            self.max_length = max((len(seq) for seq in sequences), default=0)

        padded = pad_sequences(sequences, padding="pre", maxlen=self.max_length)
        # Build the int32 tensor directly instead of going through float32
        return torch.tensor(padded, dtype=torch.int32)

    def process_dataset(self, dataset, mode = 'train'):
        """Clean every text in `dataset` and optionally vectorize the result.

        Args:
            dataset: iterable of strings.
            mode: 'train' fits the selected vectorizer; 'val' reuses the
                fitted one.

        Returns:
            An int32 tensor (tokenizing), the matrix returned by the
            scikit-learn vectorizer (tfIdf / countVectorizing), or the list of
            cleaned strings when no vectorization is selected.

        Raises:
            AssertionError: on an unknown mode, or when 'val' is requested
                before the selected vectorizer was fitted.
        """
        # Validate first so a bad call fails before the whole dataset is cleaned
        assert mode in ('train', 'val'), 'mode must be a \'train\' or \'val\''
        fitting = mode == 'train'
        if not fitting:
            assert self._is_fitted(), 'you must run with train mode firstly'

        cleaned = [self.process_text(text) for text in dataset]

        if self.tokenizing:
            return self._to_padded_sequences(cleaned, fitting)

        if self.tfIdf:
            if fitting:
                self.tf_idf_vectorizer = TfidfVectorizer(max_features = self.max_features)
                return self.tf_idf_vectorizer.fit_transform(cleaned)
            return self.tf_idf_vectorizer.transform(cleaned)

        if self.countVectorizing:
            if fitting:
                self.count_vectorizer = CountVectorizer(max_features=self.max_features)
                return self.count_vectorizer.fit_transform(cleaned)
            return self.count_vectorizer.transform(cleaned)

        return cleaned

In [None]:
# Inputs are the raw sentences, targets are the emotion labels
X, y = train_file['text'], train_file['emotion']
X_val, y_val = val_file['text'], val_file['emotion']

In [None]:
# Cleaning pipeline followed by tokenization into padded index tensors
pre_processor = PreProcessor(
    delete_not_letters=True,
    lower=True,
    delete_stop_word=True,
    lemmatization=True,
    tokenizing=True,
)

# Fit on the training split, then reuse the fitted tokenizer for validation
X_proccesed = pre_processor.process_dataset(X, mode='train')
X_val_proccesed = pre_processor.process_dataset(X_val, mode='val')

In [None]:
# Peek at the first four padded training sequences
X_proccesed[:4]

tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  51,   1,
         499],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,  30,   2, 326, 386,  44,  61,
         182],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   4, 430, 137,   1, 327,
         295],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,
         850]], dtype=torch.int32)

In [None]:
# NOTE(review): `BERT_based_classificator` is not defined or imported anywhere
# above this cell in the visible notebook, so this cell fails on a fresh kernel
# (Restart & Run All) — confirm where the class comes from.
model = BERT_based_classificator()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map emotion strings to integer class ids; the encoder is fitted on the
# training labels only and reused for the validation labels.
encoder = LabelEncoder()

# Class ids are stored as int64 (the target dtype nn.CrossEntropyLoss expects)
# rather than via torch.Tensor(...), which would turn them into float32.
y_encoded = torch.tensor(encoder.fit_transform(y), dtype=torch.long)

y_val_encoded = torch.tensor(encoder.transform(y_val), dtype=torch.long)

In [None]:
# NOTE(review): relies on `model` from an earlier cell and on a `tokenize`
# method whose definition is not visible in this notebook — confirm it exists.
X_processed = model.tokenize(X)

In [None]:
# NOTE(review): `tokenizer` is not defined or imported in the visible notebook;
# the arguments (return_tensors='pt', padding=True) look like a Hugging Face
# tokenizer call — confirm. This also overwrites the `X_processed` assigned by
# the previous cell.
X_processed = tokenizer(list(X), return_tensors = 'pt', padding = True)

In [None]:
# Peek at the token ids of the first ten samples
X_processed["input_ids"][:10]

In [None]:
# Run the model on the first ten samples only
answers = model(
    input_ids=X_processed["input_ids"][:10],
    attention_mask=X_processed["attention_mask"][:10],
)

In [None]:
# Display the raw model output computed for the first ten samples
answers

BaseModelOutput(last_hidden_state=tensor([[[-0.1168,  0.0986, -0.1296,  ...,  0.0587,  0.3543,  0.4042],
         [ 0.1325,  0.1516, -0.1169,  ..., -0.1119,  0.5562,  0.2908],
         [-0.1053,  0.2862,  0.1958,  ...,  0.0241,  0.0577, -0.3627],
         ...,
         [-0.1576,  0.0898, -0.0468,  ...,  0.0662, -0.0555, -0.3040],
         [-0.1641,  0.0944, -0.0331,  ...,  0.0501, -0.0445, -0.3198],
         [-0.2448,  0.1373,  0.3082,  ..., -0.0214,  0.1674, -0.1861]],

        [[-0.0324, -0.0323, -0.1957,  ..., -0.1747,  0.3546,  0.3028],
         [ 0.2920,  0.2514, -0.3510,  ..., -0.0705,  0.4476,  0.3433],
         [ 0.1124,  0.2768, -0.1371,  ..., -0.3548,  0.6641,  0.0977],
         ...,
         [-0.0134,  0.1927, -0.1303,  ..., -0.0439,  0.1111, -0.1301],
         [-0.0230,  0.1415, -0.1237,  ...,  0.0382,  0.0511, -0.1063],
         [-0.0700,  0.0209, -0.0227,  ...,  0.0192,  0.0657, -0.1047]],

        [[ 0.0397,  0.2022,  0.1423,  ..., -0.1141,  0.3394,  0.3958],
         [-

# **LSTM**


In [None]:
import torch.nn as nn

class MyLSTM(nn.Module):
  """Sequence classifier: embedding -> LSTM -> dropout -> linear layer.

  The defaults reproduce the sizes that were previously hard-coded, so
  ``MyLSTM()`` builds the same architecture as before.

  Args:
    vocab_size: number of rows in the embedding table; every input id must be
      smaller than this value.
    embed_dim: size of each embedding vector (LSTM input size).
    hidden_dim: LSTM hidden size.
    num_classes: number of output logits per sample.
    num_layers: number of stacked LSTM layers.
    dropout: dropout probability applied before the linear layer.
  """

  def __init__(self, vocab_size=1000, embed_dim=128, hidden_dim=128, num_classes=6,
               num_layers=1, dropout=0.3):
    super().__init__()
    self.embed = nn.Embedding(vocab_size, embed_dim)
    self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
    self.dropout = nn.Dropout(p=dropout)
    self.linear = nn.Linear(hidden_dim, num_classes)

  def forward(self, x):
    """Return logits of shape (batch, num_classes) for integer ids `x` of shape (batch, seq_len)."""
    embedded = self.embed(x)

    # Only the per-step outputs are needed; the final (h_n, c_n) state is discarded
    outputs, _ = self.lstm(embedded)

    # Classify from the output at the last time step
    last_step = outputs[:, -1, :]

    return self.linear(self.dropout(last_step))

# **Training LSTM**


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Map emotion strings to integer class ids; the encoder is fitted on the
# training labels only and reused for the validation labels.
label_encoder = LabelEncoder()

# Class ids are stored as int64 (the target dtype nn.CrossEntropyLoss expects)
# rather than via torch.Tensor(...), which would turn them into float32.
y_encoded = torch.tensor(label_encoder.fit_transform(y), dtype=torch.long)

y_val_encoded = torch.tensor(label_encoder.transform(y_val), dtype=torch.long)


In [None]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 32

# Pair every padded token-id row with its encoded label
train_dataset = TensorDataset(X_proccesed, y_encoded)
val_dataset = TensorDataset(X_val_proccesed, y_val_encoded)

# Only the training batches are shuffled
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

In [None]:
# Sanity check: shapes of the training inputs and of their encoded labels
X_proccesed.shape, y_encoded.shape

In [None]:
import torch.optim as optim

# LSTM classifier, cross-entropy loss on its logits, and an Adam optimizer
model = MyLSTM()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
import numpy as np

def getPredict(output):
  """Return, for each row of `output`, the index of its largest value along dim 1."""
  predicted_indices = output.argmax(dim=1)
  return predicted_indices

In [None]:
num_epoch = 10

for epoch in range(num_epoch):
    # ---- Training pass ----
    running_loss = 0.0
    correct_predictions_train = 0
    total_predictions_train = 0

    model.train()
    for data, label in train_dataloader:
        # Embedding lookups need integer ids; CrossEntropyLoss needs int64 targets
        data, label = data.int(), label.long()
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        labels_pred_train = getPredict(output)
        correct_predictions_train += (labels_pred_train == label).sum().item()
        total_predictions_train += label.size(0)

    # Report the mean loss per batch so it is on the same scale as the
    # validation loss below (previously the raw sum over batches was printed).
    train_loss = running_loss / len(train_dataloader)
    train_accuracy = correct_predictions_train / total_predictions_train

    # ---- Validation pass (no gradient tracking, dropout disabled) ----
    model.eval()
    valid_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            inputs, labels = inputs.int(), labels.long()
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            valid_loss += loss.item()

            labels_pred = getPredict(outputs)
            correct_predictions += (labels_pred == labels).sum().item()
            total_predictions += labels.size(0)

    valid_loss /= len(val_dataloader)
    accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch+1}/{num_epoch}\nTraining Loss: {train_loss:.4f}\nTraining Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {valid_loss:.4f}\nValidation Accuracy: {accuracy:.4f}")

# **countVectorizing**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Same cleaning steps as before, but the output is a bag-of-words count matrix
pre_processor = PreProcessor(
    delete_not_letters=True,
    lower=True,
    delete_stop_word=True,
    lemmatization=True,
    countVectorizing=True,
)

# Fit the vectorizer on the training split, then reuse it for validation
X_processed = pre_processor.process_dataset(X, mode='train')
X_val_processed = pre_processor.process_dataset(X_val, mode='val')

In [None]:
# Linear-kernel SVM on the count features. `gamma` is not passed: it is the
# kernel coefficient for the 'rbf', 'poly' and 'sigmoid' kernels only and has
# no effect with kernel="linear".
svm = SVC(kernel="linear", C=.5, random_state=42)

svm.fit(X_processed, y)

y_pred = svm.predict(X_val_processed)

accuracy = accuracy_score(y_val, y_pred)

print(accuracy)

0.9287765651924181


In [None]:
# Logistic-regression baseline with scikit-learn's default settings
lgr = LogisticRegression()
y_pred = lgr.fit(X_processed, y).predict(X_val_processed)

accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(accuracy)

0.9316484778862723


In [None]:
# Random forest baseline. The seed is fixed (same value as the SVC) so the
# reported accuracy is reproducible across kernel restarts.
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_processed, y)

y_pred = rfc.predict(X_val_processed)

accuracy = accuracy_score(y_val, y_pred)

print(accuracy)

0.9287765651924181


In [None]:
from sklearn.ensemble import VotingClassifier

# Hard (majority) voting over the SVM, logistic regression and random forest
estimators = [("svm", svm), ("lgr", lgr), ("rfc", rfc)]
voting_classifier = VotingClassifier(estimators=estimators, voting='hard')

voting_classifier.fit(X_processed, y)
y_pred = voting_classifier.predict(X_val_processed)

accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)

In [None]:
# Validation accuracy of the hard-voting ensemble
accuracy

0.9327972429638139