In [7]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV

In [2]:
# Sample sentence; no other cell visible in this section references it.
sentence = "This is a test sentence"

In [3]:
# Load the corpus; later cells read the 'Text' and 'Emotion' columns from it.
data = pd.read_csv('data.csv')

# NLTK's English stop-word list, passed to the vectorizer below.
stop_words = stopwords.words('english')

# A token is a run of at least four letters (ASCII letters plus acute-accented
# vowels) delimited by word boundaries; anything shorter is not tokenised.
vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    token_pattern=r'\b[a-zA-ZÁÉÍÓÚáéíóú]{4,}\b',
)

# NOTE(review): the vectorizer is fitted on every row of data['Text'], and the
# train/test split in the later cells is taken from this matrix afterwards, so
# the held-out rows contribute to the vocabulary and IDF weights. Consider
# fitting on the training texts only.
tfidf = vectorizer.fit_transform(data['Text'])

In [5]:
# Baseline: a RandomForest with library-default hyper-parameters, evaluated on
# a held-out 20% of the TF-IDF matrix.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf,
    data['Emotion'],
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report rows follow the order in which each label first appears in the data.
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    labels=data['Emotion'].unique(),
)
print(report)

              precision    recall  f1-score   support

     sadness       0.92      0.89      0.90      1277
       anger       0.89      0.86      0.87       617
        love       0.81      0.70      0.75       318
    surprise       0.76      0.72      0.74       168
        fear       0.85      0.83      0.84       531
       happy       0.86      0.93      0.89      1381

    accuracy                           0.87      4292
   macro avg       0.85      0.82      0.83      4292
weighted avg       0.87      0.87      0.87      4292



In [8]:

# Hyper-parameter search for the RandomForest classifier, scored by macro F1.
#
# The split uses the same arguments (test_size, random_state) as the baseline
# cell, so this cell can be run on its own and still evaluate on the same
# held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf, data['Emotion'], test_size=0.2, random_state=42
)

model = RandomForestClassifier(random_state=42)

# 3 * 3 * 3 * 3 = 81 candidate settings; with cv=5 that is 405 fits before the
# final refit, so expect this cell to be slow.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 6, 9],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [3, 6, 9]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)

# predict() on the search object delegates to the best estimator found
# (GridSearchCV refits it on the whole training split by default).
y_pred = grid_search.predict(X_test)

# Macro average matches the 'f1_macro' criterion used during the search, so the
# held-out score is comparable with the cross-validation scores.
f1 = f1_score(y_test, y_pred, average='macro')
print("F1 score: ", f1)

# Per-class breakdown; rows follow the order labels first appear in the data.
report = classification_report(y_test, y_pred, labels=data['Emotion'].unique())
print(report)