In [68]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
from nltk.classify import NaiveBayesClassifier
from nltk.probability import FreqDist
#from nltk.classify.util import accuracy
import string
import random


from sklearn.model_selection import train_test_split
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import pickle

In [2]:
train_data = pd.read_csv('TRAIN_DATA.csv')
test_data = pd.read_csv('TEST_DATA.csv')
val_data = pd.read_csv('VAL_DATA.csv')

In [3]:
train_data.head()

Unnamed: 0,Sentence,Emotion,Length
0,My favourite food is anything I didn't have to...,6,59
1,"Now if he does off himself, everyone will thin...",6,112
2,WHY THE FUCK IS BAYLESS ISOING,0,30
3,To make her feel threatened,2,27
4,Dirty Southern Wankers,0,22


# Processing data

In [5]:
def preprocess(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha() and token not in stopwords]
    return tokens
    

In [10]:
def text_to_features(text):
    tokens = preprocess(text)
    return {word: True for word in tokens}

# NLTK Naive bayes

In [15]:
train_data_1 = [(text_to_features(text), label) for text, label in zip(train_data['Sentence'], train_data['Emotion'])]
test_data_1 = [(text_to_features(text), label) for text, label in zip(test_data['Sentence'], test_data['Emotion'])]
val_data_1 = [(text_to_features(text), label) for text, label in zip(val_data['Sentence'], val_data['Emotion'])]

In [16]:
nb_classifier = NaiveBayesClassifier.train(train_data_1)

In [17]:
accuracy = nltk.classify.accuracy(nb_classifier,val_data_1)
print("Accuracy:", accuracy)

Accuracy: 0.427965320051651


In [61]:
with open('nltk_naive_bayes_model_42.pkl', 'wb') as model_file:
    pickle.dump(nb_classifier, model_file)

# _____________________________________________________________

In [21]:
# Preprocessing: Convert to TF-IDF features
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(train_data['Sentence'])
X_val_tfidf = tfidf.transform(val_data['Sentence'])

In [23]:
# Converting sparse matrix to dictionary format for NaiveBayesClassifier
train_data_2 = [(dict(zip(tfidf.get_feature_names_out(), X_train_tfidf[i].toarray()[0])), train_data['Emotion'].iloc[i]) for i in range(X_train_tfidf.shape[0])]
val_data_2 = [(dict(zip(tfidf.get_feature_names_out(), X_val_tfidf[i].toarray()[0])), val_data['Emotion'].iloc[i]) for i in range(X_val_tfidf.shape[0])]
#test_data_2 = [(dict(zip(tfidf.get_feature_names_out(), X_val_tfidf[i].toarray()[0])), y_val.iloc[i]) for i in range(X_val_tfidf.shape[0])]


In [24]:
# Train the Naive Bayes model --> multicalss
nb_classifier_2 = NaiveBayesClassifier.train(train_data_2)

In [55]:
import numpy as np

def calculate_accuracy(model, data, labels):
    correct = 0
    total = data.shape[0]
    for i in range(total):
        # Convert the sparse row to dense array and then to a dictionary
        dense_row = data[i].toarray().flatten()
        features = {f'feature_{j}': dense_row[j] for j in range(len(dense_row))}
        
        prediction = model.classify(features)  # Use the dictionary as input
        if prediction == labels[i]:
            correct += 1
    return correct / total

# Use the function and pass the labels as well
val_accuracy = calculate_accuracy(nb_classifier_2, X_train_tfidf, train_data['Emotion'].values)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


Validation Accuracy: 38.95%


In [63]:
with open('nltk_2_naive_bayes_model_40.pkl', 'wb') as model_file:
    pickle.dump(nb_classifier_2, model_file)

# ___________________________________________________________________________

In [66]:
# Sklearn naive bayes

In [70]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, train_data['Emotion'])

In [72]:
y_pred = nb_model.predict(X_val_tfidf)

In [76]:
accuracy = accuracy_score(val_data['Emotion'], y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(val_data['Emotion'], y_pred))

Accuracy: 0.5242575170632725
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.15      0.24       684
           1       0.50      0.01      0.03        74
           2       1.00      0.05      0.09        85
           3       0.56      0.85      0.68      2165
           4       0.78      0.07      0.13       299
           5       0.61      0.06      0.11       522
           6       0.45      0.53      0.49      1592

    accuracy                           0.52      5421
   macro avg       0.64      0.25      0.25      5421
weighted avg       0.55      0.52      0.46      5421



In [78]:
with open('sklearn_naive_bayes_model_52.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)

## NLTK SKlearn Based