In [12]:
import pandas as pd
from datasets import load_dataset
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Fetch the NLTK corpora needed later for stop-word filtering and lemmatization.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Pull the training split of the emotions dataset and expose it as a DataFrame.
print("Downloading dataset...")
dataset = load_dataset(
    "boltuix/emotions-dataset",
    split='train',
)
df = dataset.to_pandas()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/daniel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Downloading dataset...


Generating train split: 100%|█| 131306/131306 [00:00<00:00, 2260306.67 examples/


In [21]:
# Quick look at the raw data: sample rows, schema, and class balance.
print("FIRST 5 ROWS:")
print(df.head())

print("\nDATASET INFORMATION:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

print("\nLABEL DISTRIBUTION:")
print(df["Label"].value_counts())

FIRST 5 ROWS:
                                            Sentence      Label
0  Unfortunately later died from eating tainted m...  happiness
1  Last time I saw was loooong ago. Basically bef...    neutral
2  You mean by number of military personnel? Beca...    neutral
3  Need to go middle of the road no NAME is going...    sadness
4           feel melty miserable enough imagine must    sadness

DATASET INFORMATION:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131306 entries, 0 to 131305
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Sentence  131306 non-null  object
 1   Label     131306 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB
None

LABEL DISTRIBUTION:
Label
happiness    31205
sadness      17809
neutral      15733
anger        13341
love         10512
fear          8795
disgust       8407
confusion     8209
surprise      4560
shame         4248
guilt         3470
sarcasm       2534
desire        

In [31]:
# Shared text-cleaning resources, built once and reused for every sentence.
# The stop-word list is held in a set so membership checks are constant time.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_sentence(sentence):
    """Normalize a sentence into a space-separated string of lemmatized tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, the result is split on whitespace, stop words are
    dropped, and each remaining token is lemmatized.

    Relies on the module-level ``stop_words`` collection and ``lemmatizer``
    object (anything exposing ``lemmatize(word)``).

    Args:
        sentence: Raw text to clean.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # The original call passed `re.I | re.A` as the fourth positional argument
    # of re.sub, which is `count`, not `flags`: it capped removals at 258
    # characters and applied no flags at all. No flags are needed here, since
    # the text is already lowercased and the class lists both cases explicitly.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', sentence.lower())

    # split() with no argument breaks on any whitespace run (spaces, tabs,
    # newlines) and never yields empty tokens, unlike split(" ").
    cleaned_tokens = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]

    return " ".join(cleaned_tokens)

# Clean every raw sentence and keep the result next to the original text.
df["Cleaned_Sentence"] = df["Sentence"].apply(preprocess_sentence)

# Show the first row before and after cleaning as a sanity check.
for heading, column in (
    ("ORIGINAL FIRST SENTENCE:", "Sentence"),
    ("\nCLEANED FIRST SENTENCE:", "Cleaned_Sentence"),
):
    print(heading)
    print(df[column].iloc[0])

  sentence = re.sub(r'[^a-zA-Z\s]', '', sentence, re.I|re.A)


ORIGINAL FIRST SENTENCE:
Unfortunately later died from eating tainted meat NAME BBC documentary dynasties followed the marsh pride the lion episode was awesome

CLEANED FIRST SENTENCE:
unfortunately later died eating tainted meat name bbc documentary dynasty followed marsh pride lion episode awesome


In [33]:
# Features are the cleaned sentences; targets are the emotion labels.
x, y = df["Cleaned_Sentence"], df["Label"]

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

print(f"Total samples: {len(x)}")
print(f"Training samples: {len(x_train)}")
print(f"Testing samples: {len(x_test)}")