In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# DATA LOADING
# Paths are relative to the notebook's working directory.
train_data = pd.read_csv("../../Data/Chi2/train_2000_chi2.csv")
test_data = pd.read_csv("../../Data/test_indexado.csv")

# Replace missing texts with the empty string in BOTH sets. The vectorizer
# used downstream only accepts string documents, so a single NaN in either
# 'Text' column would make fit_transform / transform raise.
train_data['Text'] = train_data['Text'].fillna('')
test_data['Text'] = test_data['Text'].fillna('')

# Emotion label names: every column from the third one onwards.
# NOTE(review): assumes the first two columns of the train CSV are non-label
# columns (one of them being 'Text') -- confirm against the file layout.
emotion_classes = train_data.columns[2:].tolist()

In [2]:
# TF-IDF VECTORIZATION
# Label matrices: one column per entry of emotion_classes, taken from each frame.
y_train = np.asarray(train_data[emotion_classes])
y_test = np.asarray(test_data[emotion_classes])

# Vectorizer settings gathered in one place so they are easy to tune.
tfidf_options = {
    "lowercase": True,
    "stop_words": "english",
    "strip_accents": "unicode",
    "max_features": 5000,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to encode the test texts.
train_texts = train_data['Text'].values
test_texts = test_data['Text'].values
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [3]:
# Train/validation split
# X_train / y_train are rebound below to the 80% training subset. Splitting
# those names directly would make this cell non-idempotent: every re-run would
# split the already-reduced subset again, shrinking the training set and moving
# former validation rows around. Rebuilding the full matrix and labels from the
# fitted vectorizer and the original frame keeps every run identical.
X_full = vectorizer.transform(train_data['Text'].values)
y_full = np.asarray(train_data[emotion_classes])
X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.2, random_state=42)


# Random Forest for multi-label classification: MultiOutputClassifier wraps the
# base forest so that the labels in y_train are handled as separate outputs.
base_model = RandomForestClassifier(random_state=42)
rf_model = MultiOutputClassifier(base_model)
rf_model.fit(X_train, y_train)

# Predictions on the held-out validation split
y_pred = rf_model.predict(X_val)

In [4]:
# EVALUATION (validation split)

print("\nResultados Random Forest")
# NOTE(review): y_val has one column per emotion label, so accuracy_score here
# should be the exact-match (subset) accuracy over all labels -- confirm
# against the scikit-learn documentation.
val_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", val_accuracy)

# Per-label precision / recall / F1; zero_division=0 reports 0 instead of
# warning for labels that never get predicted.
val_report = classification_report(
    y_val,
    y_pred,
    target_names=emotion_classes,
    zero_division=0,
)
print("\nReporte de clasificación:\n", val_report)



Resultados Random Forest
Accuracy: 0.3618981801428242

Reporte de clasificación:
                 precision    recall  f1-score   support

    admiration       0.69      0.50      0.58       863
     amusement       0.74      0.73      0.73       453
         anger       0.53      0.24      0.33       323
     annoyance       0.40      0.08      0.13       483
      approval       0.45      0.09      0.15       577
        caring       0.44      0.11      0.18       212
     confusion       0.43      0.09      0.15       258
     curiosity       0.51      0.09      0.15       460
        desire       0.60      0.23      0.33       128
disappointment       0.34      0.06      0.10       244
   disapproval       0.24      0.05      0.09       383
       disgust       0.62      0.25      0.36       156
 embarrassment       0.71      0.29      0.41        58
    excitement       0.45      0.15      0.22       175
          fear       0.63      0.34      0.44       116
     gratitude      

Resultados Random Forest
Accuracy: 0.3618981801428242

Reporte de clasificación:
                 precision    recall  f1-score   support

    admiration       0.69      0.50      0.58       863
     amusement       0.74      0.73      0.73       453
         anger       0.53      0.24      0.33       323
     annoyance       0.40      0.08      0.13       483
      approval       0.45      0.09      0.15       577
        caring       0.44      0.11      0.18       212
     confusion       0.43      0.09      0.15       258
     curiosity       0.51      0.09      0.15       460
        desire       0.60      0.23      0.33       128
disappointment       0.34      0.06      0.10       244
   disapproval       0.24      0.05      0.09       383
       disgust       0.62      0.25      0.36       156
 embarrassment       0.71      0.29      0.41        58
    excitement       0.45      0.15      0.22       175
          fear       0.63      0.34      0.44       116
     gratitude       0.96      0.84      0.90       544
         grief       0.00      0.00      0.00        14
           joy       0.61      0.33      0.43       314
          love       0.73      0.73      0.73       437
   nervousness       0.36      0.11      0.17        35
       neutral       0.60      0.57      0.58      2812
      optimism       0.63      0.37      0.47       324
         pride       0.25      0.04      0.07        24
   realization       0.38      0.11      0.17       223
        relief       0.00      0.00      0.00        27
       remorse       0.57      0.41      0.48       110
       sadness       0.61      0.27      0.38       276
      surprise       0.63      0.29      0.40       205

     micro avg       0.63      0.39      0.48     10234
     macro avg       0.50      0.26      0.33     10234
  weighted avg       0.58      0.39      0.44     10234
   samples avg       0.44      0.42      0.42     10234

In [None]:
# Hyper-parameter search space. The 'estimator__' prefix routes each setting
# through MultiOutputClassifier to the wrapped RandomForestClassifier.
param_grid = {
    'estimator__n_estimators': [10, 100, 200, 300],
    'estimator__max_depth': [3, 5, 10, 20, None],
    'estimator__max_features': [None, 'sqrt', 'log2'],
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__bootstrap': [True, False]
}

# NOTE(review): this grid has 4 * 5 * 3 * 3 * 2 = 360 combinations; with cv=10
# that is 3600 fits of the multi-output model (plus the final refit). Expect a
# very long run -- consider shrinking the grid or the number of folds.

# Use a dedicated name for the unfitted search template. Rebinding `rf_model`
# here would replace the fitted baseline from the earlier cell with an
# estimator that stays unfitted (GridSearchCV works on clones), so any later
# rf_model.predict(...) would fail.
search_template = MultiOutputClassifier(RandomForestClassifier(random_state=42))
grid_search = GridSearchCV(estimator=search_template, param_grid=param_grid, cv=10, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best configuration, refit by GridSearchCV on all of X_train / y_train.
best_rf = grid_search.best_estimator_

print("\nMejores parámetros encontrados:", grid_search.best_params_)
print("\nMejor score de validación:", grid_search.best_score_)