In [None]:
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
###STEP 1: PRE-PROCESSING TEXT
# Load the dataset and discard the 'Unnamed: 0' column (presumably a saved
# index column -- confirm against the CSV). drop() raises a KeyError when the
# column is absent, so an unexpected file layout fails loudly here.
filename = 'text.csv'
df = pd.read_csv(filename).drop(columns='Unnamed: 0')
# TF-IDF feature extractor: lowercases the text, removes English stop words,
# and ignores terms that occur in more than 70% of the documents or in fewer
# than 5 documents.
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.7,
    stop_words='english',
    lowercase=True,
)

In [None]:
###STEP 2: SPLITTING THE DATASET INTO TRAIN(90%) AND TEST(10%) SETS, THEN VECTORIZING
# NOTE(review): this writes a copy of the frame (row index included) to a file
# literally named 'df' in the working directory. Kept unchanged; confirm it is
# still needed.
df.to_csv('df')
Y = df['label']
# Split the raw text BEFORE fitting the vectorizer so that the TF-IDF
# vocabulary and IDF weights are learned from the training documents only.
# Fitting on the full dataset first would leak test-set statistics into the
# features and make the reported test scores optimistic.
# The same random_state on the same number of rows selects the same train/test
# rows as splitting an already-vectorized matrix would.
text_train, text_test, Y_train, Y_test = train_test_split(df['text'], Y, test_size=0.1, shuffle=True,
                                                          random_state=69)  #Disabling the shuffle doesn't affect the model's performance by a large margin
# Learn vocabulary/IDF on the training text, then reuse them for the test text.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
print('Number of training datapoints:', X_train.shape[0])
print('Number of test datapoint:', X_test.shape[0])

In [None]:
###STEP 3: ANALYSIS OF THE FEATURE OF THE TRAINING DATA
# Emotion names in label-id order: position i is the name for label id i
# (assumes the 'label' column holds the integer ids 0..5 -- confirm against the CSV).
labels = ['Sadness', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise']
# value_counts() sorts by descending frequency, NOT by label id, so its raw
# order does not line up with `labels`. Reindexing by label id puts count i
# next to name i; a class missing from the training split gets a count of 0
# instead of shifting every later name onto the wrong wedge.
label_count = Y_train.value_counts().reindex(range(len(labels)), fill_value=0)
fig, axs = plt.subplots()
# Donut chart (wedge width 0.5) of the class distribution, with whole-number
# percentages printed on the wedges.
wedges, texts, autotexts = axs.pie(
    label_count,
    labels=labels,
    startangle=90,
    autopct='%.0f%%',
    wedgeprops=dict(width=0.5)
)
# Legend placed to the right of the chart.
axs.legend(wedges, labels, title=" ", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
plt.tight_layout()
plt.show()

In [None]:
###STEP 4: BUILDING A TEXT CLASSIFICATION MODEL
#Initialization and training of multinomial logistic regression classifier
#It's no bug, the model happens to work best at 100 iterations.
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=100, )
# The iteration cap above is deliberate, so the resulting ConvergenceWarning is
# expected. Silence it only for the duration of this fit: catch_warnings()
# restores the previous warning filters on exit, so a ConvergenceWarning raised
# by any other model later in the session is still shown.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=ConvergenceWarning)
    classifier.fit(X_train, Y_train)
# Show the fitted estimator as the cell output.
classifier

In [None]:
###STEP 5: MODEL PERFORMANCE SUMMARY
# Predict a label for every test-set row.
Y_pred = classifier.predict(X_test)
# Fraction of test rows predicted correctly, printed as a percentage with
# three decimal places.
test_accuracy = accuracy_score(Y_test, Y_pred)
print(f"Model accuracy: {test_accuracy:.3%}")
# Per-class text report comparing the true and predicted test labels.
report_text = classification_report(Y_test, Y_pred)
print(report_text)

In [None]:
# Confusion matrix of the test-set predictions, drawn as a blue heat map.
# Rows/columns follow classifier.classes_; the display names below are applied
# in that same order.
emotion_names = ['Sadness', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise']
cm = confusion_matrix(Y_test, Y_pred, labels=classifier.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=emotion_names)
disp.plot(cmap=plt.cm.Blues)

# Work on the current axes and its figure through explicit handles.
cm_ax = plt.gca()
cm_fig = cm_ax.figure

cm_ax.set_title('Confusion Matrix', fontsize=15, pad=20)
cm_ax.set_xlabel('Predicted labels ', fontsize=11)
cm_ax.set_ylabel('True labels', fontsize=11)

# Put the x-axis label and its ticks along the top edge of the matrix.
cm_ax.xaxis.set_label_position('top')
cm_ax.xaxis.tick_top()

# Bottom margin plus an (empty) caption slot below the plot.
cm_fig.subplots_adjust(bottom=0.2)
cm_fig.text(0.5, 0.05, '', ha='center', fontsize=13)

plt.tight_layout()
plt.show()

In [None]:
#Testing the Model
def predict_label(input):
    """Predict the label of a single piece of text.

    The text is wrapped in a one-element list, converted to features with the
    notebook-level ``vectorizer`` (transform only, no re-fitting), and passed
    to the notebook-level ``classifier``.

    Args:
        input: The text to classify. (The name shadows the ``input`` builtin
            inside this function; it is kept so keyword callers keep working.)

    Returns:
        The value returned by ``classifier.predict`` for the one-row feature
        matrix, i.e. a collection holding a single predicted label rather
        than a bare scalar.
    """
    return classifier.predict(vectorizer.transform([input]))

In [None]:
# Example usage.
sample_input = "tongue-tied I live life but just don't fit in"
result = predict_label(sample_input)

# predict_label() hands back the classifier output for a one-row input, which
# is a one-element array, not a scalar. Matching the array itself against int
# literals only works through the truthiness of a one-element comparison
# result, so take the single label out explicitly before looking it up.
predicted_id = int(result[0])

# Label id -> emotion name.
emotion_by_id = {
    0: 'Sadness',
    1: 'Joy',
    2: 'Love',
    3: 'Anger',
    4: 'Fear',
    5: 'Surprise',
}

print('The predicted emotion: ')
# Report an unexpected label id instead of silently printing nothing.
print(emotion_by_id.get(predicted_id, f'Unknown label: {predicted_id}'))