In [1]:
%load_ext autoreload
%autoreload 2

# Use HuggingFace's datasets library to access the Emotion dataset
from datasets import load_dataset
import numpy as np

In [2]:
cache_dir = "./data_cache"


def load_emotion_split(split, cache_dir=cache_dir):
    """Load one split of the "emotion" configuration of the tweet_eval dataset.

    Replaces three copy-pasted ``load_dataset`` calls that differed only in
    the ``split`` argument.

    Args:
        split: Split name forwarded to ``load_dataset`` (this notebook uses
            "train", "validation" and "test").
        cache_dir: Directory forwarded to ``load_dataset`` as ``cache_dir``.
            The default is bound to the notebook-level ``cache_dir`` at the
            time this function is defined.

    Returns:
        The object returned by ``load_dataset`` for the requested split.
    """
    return load_dataset(
        "tweet_eval",
        name="emotion",
        split=split,
        cache_dir=cache_dir,
    )


train_dataset = load_emotion_split("train")
print(f"Training dataset with {len(train_dataset)} instances loaded")


val_dataset = load_emotion_split("validation")
print(f"Development/validation dataset with {len(val_dataset)} instances loaded")


test_dataset = load_emotion_split("test")
print(f"Test dataset with {len(test_dataset)} instances loaded")

# Pull the input text and target label columns out of each split
train_texts = train_dataset['text']
train_labels = train_dataset['label']

val_texts = val_dataset['text']
val_labels = val_dataset['label']

test_texts = test_dataset['text']
test_labels = test_dataset['label']

Downloading readme:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading data: 100%|███████████████████████| 233k/233k [00:00<00:00, 929kB/s]
Downloading data: 100%|███████████████████████| 105k/105k [00:00<00:00, 764kB/s]
Downloading data: 100%|█████████████████████| 28.6k/28.6k [00:00<00:00, 209kB/s]


Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

Training dataset with 3257 instances loaded
Development/validation dataset with 374 instances loaded
Test dataset with 1421 instances loaded


## Preprocessing

In [3]:
from datasets import load_dataset
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer


# Stopwords and lemmatizer setup
def _require_nltk_data(probe, *packages):
    """Run ``probe()``; if NLTK data is missing, download ``packages`` and retry once.

    NLTK raises ``LookupError`` when a corpus/model it needs is not installed.
    On a fresh environment that would abort the notebook, so the named
    packages are fetched on demand. Nothing is downloaded when the data is
    already present. If the download does not help, the second ``probe()``
    call raises the same ``LookupError`` the original code would have raised.
    """
    try:
        return probe()
    except LookupError:
        for package in packages:
            nltk.download(package, quiet=True)
        return probe()


stop_words = _require_nltk_data(lambda: set(stopwords.words('english')), 'stopwords')
lemmatizer = WordNetLemmatizer()

# Exercise the lemmatizer and tokenizer once so that any missing data they rely on
# is fetched here rather than failing later in the middle of preprocessing.
# NOTE(review): which tokenizer package is required ('punkt' vs 'punkt_tab') and whether
# 'omw-1.4' is needed depends on the installed NLTK version; both are requested on a miss.
_require_nltk_data(lambda: lemmatizer.lemmatize('tests'), 'wordnet', 'omw-1.4')
_require_nltk_data(lambda: word_tokenize('probe text'), 'punkt', 'punkt_tab')

# Function to preprocess texts
_URL_PATTERN = re.compile(r"http\S+|www\S+")  # "https..." is already matched by http\S+
_MENTION_OR_HASH_PATTERN = re.compile(r"\@\w+|\#")
_PUNCTUATION_PATTERN = re.compile(r"[^\w\s]")
_DIGITS_PATTERN = re.compile(r"\d+")
# Extra tokens dropped alongside the stopwords.
# NOTE(review): presumably 'amp' is what remains of the HTML entity "&amp;" once
# punctuation is stripped, and 'u' is shorthand for "you" - confirm.
_NOISE_TOKENS = frozenset({'amp', 'u'})


def preprocess_texts(texts, *, tokenize=None, lemmatize=None, stopword_set=None,
                     noise_tokens=_NOISE_TOKENS):
    """Clean a collection of raw texts into space-joined token strings.

    Each text is lowercased; URLs, @mentions, the '#' character, punctuation
    and digit runs are removed; the remainder is tokenized; stopwords and
    noise tokens are dropped; the surviving tokens are lemmatized and joined
    with single spaces.

    Args:
        texts: Iterable of strings.
        tokenize: Callable mapping a string to a list of tokens. Defaults to
            the notebook-level ``word_tokenize``.
        lemmatize: Callable mapping a token to its lemma. Defaults to
            ``lemmatizer.lemmatize`` of the notebook-level ``lemmatizer``.
        stopword_set: Container of tokens to discard. Defaults to the
            notebook-level ``stop_words``.
        noise_tokens: Additional tokens to discard (default: 'amp' and 'u').

    Returns:
        A list with one processed string per input text, in input order.
        A text with no surviving tokens yields an empty string.

    NOTE(review): punctuation (including apostrophes) is stripped *before*
    stopword filtering, so contractions are compared against the stopword set
    in their stripped form ("i'm" -> "im", "don't" -> "dont"). The frequency
    listing printed by this notebook has 'im' and 'dont' among its top
    tokens, i.e. they pass the filter. Left unchanged here so previously
    reported metrics stay reproducible.
    """
    # Resolve defaults at call time so the notebook-level objects can be
    # (re)defined after this function without re-running this cell.
    if tokenize is None:
        tokenize = word_tokenize
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize
    if stopword_set is None:
        stopword_set = stop_words

    processed_texts = []
    for text in texts:
        text = text.lower()
        text = _URL_PATTERN.sub('', text)
        # Drops whole @mentions but only the '#' sign: the hashtag word itself is kept.
        text = _MENTION_OR_HASH_PATTERN.sub('', text)
        text = _PUNCTUATION_PATTERN.sub('', text)
        text = _DIGITS_PATTERN.sub('', text)
        # Filter first, then lemmatize: the stopword test sees the surface form.
        tokens = [
            lemmatize(word)
            for word in tokenize(text)
            if word not in stopword_set and word not in noise_tokens
        ]
        processed_texts.append(' '.join(tokens))
    return processed_texts

# Run the cleaning pipeline over the raw training tweets
train_texts = train_dataset['text']
processed_train_texts = preprocess_texts(train_texts)

# Each processed text is a space-joined token string, so splitting on
# whitespace recovers its tokens; pool them across the whole training set
all_tokens = [
    token
    for cleaned_text in processed_train_texts
    for token in cleaned_text.split()
]

# Count how often each token occurs
freq_dist = FreqDist(all_tokens)

# Show the 50 highest-count tokens with their counts
most_common_words = freq_dist.most_common(50)
print(most_common_words)

[('im', 300), ('like', 229), ('dont', 207), ('get', 182), ('people', 142), ('one', 122), ('day', 108), ('know', 106), ('time', 104), ('cant', 102), ('make', 102), ('think', 99), ('got', 95), ('go', 95), ('love', 91), ('sad', 91), ('want', 90), ('would', 89), ('life', 88), ('really', 87), ('even', 83), ('feel', 80), ('back', 79), ('going', 74), ('need', 74), ('still', 73), ('good', 71), ('today', 70), ('fear', 70), ('see', 70), ('someone', 68), ('thats', 67), ('angry', 67), ('depression', 66), ('thing', 66), ('never', 65), ('look', 65), ('say', 65), ('fucking', 63), ('way', 63), ('much', 63), ('lost', 62), ('watch', 61), ('year', 61), ('work', 61), ('ive', 60), ('right', 60), ('new', 59), ('anger', 58), ('sadness', 56)]


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Upper bound on the bag-of-words vocabulary size.
# (The earlier comment said 1000, but the value actually used is 3000.)
MAX_FEATURES = 3000

# Initialize the CountVectorizer with the vocabulary capped at MAX_FEATURES terms
vectorizer = CountVectorizer(max_features=MAX_FEATURES)

# Fit the vocabulary on the training texts only, then encode them
X_train = vectorizer.fit_transform(processed_train_texts)
y_train = np.array(train_labels)

# Validation and test texts go through the same preprocessing and are encoded
# with the already-fitted vocabulary (transform only, no re-fitting)
processed_val_texts = preprocess_texts(val_texts)
X_val = vectorizer.transform(processed_val_texts)
y_val = np.array(val_labels)

processed_test_texts = preprocess_texts(test_texts)
X_test = vectorizer.transform(processed_test_texts)
y_test = np.array(test_labels)

## Naive Bayes

In [5]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with its default settings
nb_classifier = MultinomialNB()

# Fit on the training count matrix and its labels
nb_classifier.fit(X=X_train, y=y_train)

In [6]:
from sklearn.metrics import classification_report, accuracy_score

# Label the validation tweets with the fitted classifier
val_predictions = nb_classifier.predict(X_val)

# Print the report comparing predicted labels against the true validation labels
print("Validation Set Evaluation:")
print(classification_report(y_true=y_val, y_pred=val_predictions))

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.69      0.81      0.75       160
           1       0.65      0.53      0.58        97
           2       0.36      0.18      0.24        28
           3       0.58      0.63      0.61        89

    accuracy                           0.64       374
   macro avg       0.57      0.53      0.54       374
weighted avg       0.63      0.64      0.63       374



In [7]:
# Label the test tweets with the fitted classifier
test_predictions = nb_classifier.predict(X_test)

# Print the report comparing predicted labels against the true test labels
print("Test Set Evaluation:")
print(classification_report(y_true=y_test, y_pred=test_predictions))

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.67      0.84      0.75       558
           1       0.73      0.59      0.65       358
           2       0.51      0.26      0.34       123
           3       0.66      0.65      0.65       382

    accuracy                           0.67      1421
   macro avg       0.64      0.58      0.60      1421
weighted avg       0.67      0.67      0.66      1421

