In [128]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
from nltk.classify import NaiveBayesClassifier
from nltk.probability import FreqDist
from nltk.classify import accuracy
#from nltk.classify.util import accuracy
import string
import random


from sklearn.model_selection import train_test_split
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB ,GaussianNB ,BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC , LinearSVC 

import pickle

In [3]:
train_data = pd.read_csv('TRAIN_DATA.csv')
test_data = pd.read_csv('TEST_DATA.csv')
val_data = pd.read_csv('VAL_DATA.csv')

In [5]:
train_data.head()

Unnamed: 0,Sentence,Emotion,Length
0,My favourite food is anything I didn't have to...,6,59
1,"Now if he does off himself, everyone will thin...",6,112
2,WHY THE FUCK IS BAYLESS ISOING,0,30
3,To make her feel threatened,2,27
4,Dirty Southern Wankers,0,22


# Processing data

In [7]:
def preprocess(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha() and token not in stopwords]
    return tokens
    

In [9]:
def text_to_features(text):
    tokens = preprocess(text)
    return {word: True for word in tokens}

# NLTK Naive bayes

In [11]:
train_data_1 = [(text_to_features(text), label) for text, label in zip(train_data['Sentence'], train_data['Emotion'])]
test_data_1 = [(text_to_features(text), label) for text, label in zip(test_data['Sentence'], test_data['Emotion'])]
val_data_1 = [(text_to_features(text), label) for text, label in zip(val_data['Sentence'], val_data['Emotion'])]

In [16]:
nb_classifier = NaiveBayesClassifier.train(train_data_1)

In [17]:
accuracy = nltk.classify.accuracy(nb_classifier,val_data_1)
print("Accuracy:", accuracy)

Accuracy: 0.427965320051651


In [61]:
with open('nltk_naive_bayes_model_42.pkl', 'wb') as model_file:
    pickle.dump(nb_classifier, model_file)

# _____________________________________________________________

In [13]:
# Preprocessing: Convert to TF-IDF features
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(train_data['Sentence'])
X_val_tfidf = tfidf.transform(val_data['Sentence'])

In [15]:
# Converting sparse matrix to dictionary format for NaiveBayesClassifier
train_data_2 = [(dict(zip(tfidf.get_feature_names_out(), X_train_tfidf[i].toarray()[0])), train_data['Emotion'].iloc[i]) for i in range(X_train_tfidf.shape[0])]
val_data_2 = [(dict(zip(tfidf.get_feature_names_out(), X_val_tfidf[i].toarray()[0])), val_data['Emotion'].iloc[i]) for i in range(X_val_tfidf.shape[0])]
#test_data_2 = [(dict(zip(tfidf.get_feature_names_out(), X_val_tfidf[i].toarray()[0])), y_val.iloc[i]) for i in range(X_val_tfidf.shape[0])]


In [24]:
# Train the Naive Bayes model --> multicalss
nb_classifier_2 = NaiveBayesClassifier.train(train_data_2)

In [55]:
import numpy as np

def calculate_accuracy(model, data, labels):
    correct = 0
    total = data.shape[0]
    for i in range(total):
        # Convert the sparse row to dense array and then to a dictionary
        dense_row = data[i].toarray().flatten()
        features = {f'feature_{j}': dense_row[j] for j in range(len(dense_row))}
        
        prediction = model.classify(features)  # Use the dictionary as input
        if prediction == labels[i]:
            correct += 1
    return correct / total

# Use the function and pass the labels as well
val_accuracy = calculate_accuracy(nb_classifier_2, X_train_tfidf, train_data['Emotion'].values)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


Validation Accuracy: 38.95%


In [63]:
with open('nltk_2_naive_bayes_model_40.pkl', 'wb') as model_file:
    pickle.dump(nb_classifier_2, model_file)

# ___________________________________________________________________________

In [66]:
# Sklearn naive bayes

In [70]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, train_data['Emotion'])

In [72]:
y_pred = nb_model.predict(X_val_tfidf)

In [76]:
accuracy = accuracy_score(val_data['Emotion'], y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(val_data['Emotion'], y_pred))

Accuracy: 0.5242575170632725
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.15      0.24       684
           1       0.50      0.01      0.03        74
           2       1.00      0.05      0.09        85
           3       0.56      0.85      0.68      2165
           4       0.78      0.07      0.13       299
           5       0.61      0.06      0.11       522
           6       0.45      0.53      0.49      1592

    accuracy                           0.52      5421
   macro avg       0.64      0.25      0.25      5421
weighted avg       0.55      0.52      0.46      5421



In [78]:
with open('sklearn_naive_bayes_model_52.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)

### Gaussian

In [20]:
# Convert sparse matrix to dense array
X_train_tfidf_dense = X_train_tfidf.toarray()

# Fit the model with the dense array
Gnb_model = GaussianNB()
Gnb_model.fit(X_train_tfidf_dense, train_data['Emotion'])

In [24]:
y_pred_gnb = Gnb_model.predict(X_val_tfidf.toarray())

In [26]:
accuracy_gnb = accuracy_score(val_data['Emotion'], y_pred_gnb)
print("Accuracy:", accuracy_gnb)
print("Classification Report:\n", classification_report(val_data['Emotion'], y_pred_gnb))

Accuracy: 0.10865154030621657
Classification Report:
               precision    recall  f1-score   support

           0       0.18      0.17      0.18       684
           1       0.01      0.15      0.02        74
           2       0.03      0.28      0.05        85
           3       0.55      0.06      0.10      2165
           4       0.07      0.38      0.12       299
           5       0.13      0.24      0.17       522
           6       0.35      0.05      0.08      1592

    accuracy                           0.11      5421
   macro avg       0.19      0.19      0.10      5421
weighted avg       0.36      0.11      0.11      5421



# ________________________________________________________________________________

In [32]:
Bnb_model = BernoulliNB()
Bnb_model.fit(X_train_tfidf, train_data['Emotion'])

In [34]:
y_pred_Bnb = Bnb_model.predict(X_val_tfidf)

In [36]:
accuracy_Bnb = accuracy_score(val_data['Emotion'], y_pred_Bnb)
print("Accuracy:", accuracy_Bnb)
print("Classification Report:\n", classification_report(val_data['Emotion'], y_pred_Bnb))

Accuracy: 0.5709278730861465
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.32      0.38       684
           1       0.50      0.05      0.10        74
           2       0.43      0.04      0.07        85
           3       0.68      0.77      0.72      2165
           4       0.57      0.29      0.38       299
           5       0.42      0.22      0.29       522
           6       0.49      0.63      0.55      1592

    accuracy                           0.57      5421
   macro avg       0.51      0.33      0.36      5421
weighted avg       0.56      0.57      0.55      5421



In [40]:
with open('sklearn_BNB_57.pkl', 'wb') as model_file:
    pickle.dump(Bnb_model, model_file)

## NLTK SKlearn Based

In [16]:
train_data_3 = [(dict(enumerate(row)), label) for row, label in zip(X_train_tfidf.toarray(), train_data['Emotion'])]

In [19]:
LRC = SklearnClassifier(LogisticRegression())

In [21]:
LRC.train(train_data_3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<SklearnClassifier(LogisticRegression())>

In [30]:
val_data_3 = [(dict(enumerate(row)), label) for row, label in zip(X_val_tfidf.toarray(), val_data['Emotion'])]

In [32]:
val_predictions = [LRC.classify(dict(enumerate(row))) for row, _ in val_data_3]

In [33]:
val_data_nltk_format = [(features, label) for features, label in val_data_3]

In [44]:
accuracy_score = accuracy(LRC, val_data_3)

print(f"Accuracy: {accuracy_score}")

Accuracy: 0.6157535510053496


# ________________________________________________________________

In [81]:
model_LRC = LogisticRegression()
model_LRC.fit(X_train_tfidf, train_data['Emotion'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [82]:
from sklearn.metrics import accuracy_score as Accsss

In [83]:
predictions = model_LRC.predict(X_val_tfidf)
accuracy = Accsss(val_data['Emotion'], predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6157535510053496


In [93]:
with open('sklearn_m_LRC_61.pkl', 'wb') as model_file:
    pickle.dump(Bnb_model, model_file)

#### SGD -----------------------------------------------

In [47]:
model_SGD = SGDClassifier()
model_SGD.fit(X_train_tfidf, train_data['Emotion'])

In [53]:
predictions = model_SGD.predict(X_val_tfidf)

In [73]:
my_accuracy = Accsss(val_data['Emotion'], predictions)

In [79]:
print("Accuracy:", my_accuracy)
print("Classification Report:\n", classification_report(val_data['Emotion'], predictions))

Accuracy: 0.6150156797638812
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.32      0.40       684
           1       0.64      0.36      0.47        74
           2       0.63      0.47      0.54        85
           3       0.73      0.80      0.76      2165
           4       0.61      0.42      0.50       299
           5       0.57      0.17      0.27       522
           6       0.51      0.69      0.59      1592

    accuracy                           0.62      5421
   macro avg       0.60      0.46      0.50      5421
weighted avg       0.61      0.62      0.59      5421



In [89]:
with open('sklearn_m_SGD_61.pkl', 'wb') as model_file:
    pickle.dump(model_SGD, model_file)

# ______________________________________________________________________________________

### SVC

In [96]:
svc_model = SVC(kernel='linear')
svc_model.fit(X_train_tfidf,train_data['Emotion'])

In [97]:
predictions_svc = svc_model.predict(X_val_tfidf)

In [100]:
my_accuracy_svc = Accsss(val_data['Emotion'], predictions_svc)
print(f"Accuracy: {my_accuracy_svc:.4f}")

Accuracy: 0.6146


In [116]:
print("Classification Report:\n", classification_report(val_data['Emotion'], predictions_svc))

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.34      0.41       684
           1       0.58      0.38      0.46        74
           2       0.62      0.49      0.55        85
           3       0.79      0.75      0.77      2165
           4       0.59      0.38      0.46       299
           5       0.61      0.18      0.28       522
           6       0.49      0.76      0.60      1592

    accuracy                           0.61      5421
   macro avg       0.60      0.47      0.50      5421
weighted avg       0.63      0.61      0.60      5421



In [102]:
with open('sklearn_svc_model_61.pkl', 'wb') as model_file:
    pickle.dump(model_SGD, model_file)

### LinearSVC =================================

In [105]:
linear_svc_model = LinearSVC()
linear_svc_model.fit(X_train_tfidf,train_data['Emotion'])



In [107]:
predictions_Lsvc = linear_svc_model.predict(X_val_tfidf)

In [109]:
my_accuracy_Lsvc = Accsss(val_data['Emotion'], predictions_Lsvc)
print(f"Accuracy: {my_accuracy_Lsvc:.4f}")

Accuracy: 0.6038


In [118]:
print("Classification Report:\n", classification_report(val_data['Emotion'], predictions_Lsvc))

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.37      0.42       684
           1       0.54      0.34      0.42        74
           2       0.66      0.52      0.58        85
           3       0.75      0.76      0.76      2165
           4       0.56      0.41      0.47       299
           5       0.44      0.24      0.31       522
           6       0.50      0.66      0.57      1592

    accuracy                           0.60      5421
   macro avg       0.56      0.47      0.50      5421
weighted avg       0.60      0.60      0.59      5421



In [111]:
with open('sklearn_Lsvc_model_60.pkl', 'wb') as model_file:
    pickle.dump(model_SGD, model_file)