In [1]:
import pandas as pd
import json
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


# Load the Ekman-labelled train and test splits.
train_data = pd.read_csv("../../Data/BasedOnEkman/train_ekman.csv")
test_data = pd.read_csv("../../Data/BasedOnEkman/test_ekman.csv")

# Replace missing texts with a single space in BOTH splits.
# A NaN left in the test split would break the downstream steps:
# preprocess_text calls text.lower() (a float NaN has no such method) and
# TfidfVectorizer refuses NaN documents.
train_data['Text'] = train_data['Text'].fillna(" ")
test_data['Text'] = test_data['Text'].fillna(" ")

# Load the Ekman mapping from its JSON file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): ekman_mapping is not used in the visible cells — confirm
# whether it is still needed.
with open("../../Data/GoEmotions/ekman_mapping.json", "r", encoding="utf-8") as f:
    ekman_mapping = json.load(f)

# Class names, listed in the order used to assign integer ids below.
ekman_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']
# label name -> integer id (position in ekman_labels)
ekman_to_id = {label: idx for idx, label in enumerate(ekman_labels)}

# --- PREPROCESSING FOR LEMMATIZATION ---
# One shared lemmatizer instance, reused for every text.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Lower-case ``text``, split it into word tokens and lemmatize each one.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    lowered = text.lower()
    tokens = re.findall(r'\b\w+\b', lowered)
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

# Raw texts and target labels, taken column by column from each split.
X_train_raw, y_train = train_data['Text'].values, train_data['Emotion'].values
X_test_raw, y_test = test_data['Text'].values, test_data['Emotion'].values

# Lemmatized version of every text (same order as the raw arrays).
X_train_lem = list(map(preprocess_text, X_train_raw))
X_test_lem = list(map(preprocess_text, X_test_raw))


In [2]:
# --- OUTPUT DIRECTORY FOR THE PLOTS ---
# plot_confusion_matrix saves its PNG files under this (relative) path.
output_dir = "../../Plots/Experiment_Multiclass/Ekman/"


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Plot a confusion matrix as a heatmap and save it as a PNG file.
def plot_confusion_matrix(y_true, y_pred, labels, title):
    """Render the confusion matrix of ``y_pred`` vs ``y_true`` and save it to disk.

    The matrix is computed over the integer class ids ``0 .. len(labels) - 1``;
    ``labels`` only provides the tick names shown on both axes.

    Parameters
    ----------
    y_true : array-like
        True class ids.
    y_pred : array-like
        Predicted class ids.
    labels : sequence of str
        Display name of each class, indexed by class id.
    title : str
        Suffix of the plot title; also used to derive the output file name
        (spaces become underscores, parentheses are dropped, lower-cased).

    Returns
    -------
    None
        The figure is written to ``output_dir`` and then closed.
    """
    import os  # local import so the function does not depend on the import cell

    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(labels))))

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=labels,
            yticklabels=labels,
            cbar=True,
            ax=ax,
        )
        ax.set_title(f'Matriz de Confusión - {title}')
        ax.set_ylabel('Etiqueta Verdadera')
        ax.set_xlabel('Etiqueta Predicha')
        plt.setp(ax.get_xticklabels(), rotation=45)
        plt.setp(ax.get_yticklabels(), rotation=0)
        fig.tight_layout()

        # Build the file name from the title and save the figure.
        filename = title.replace(" ", "_").replace("(", "").replace(")", "").lower() + ".png"
        # Create the target directory on first use; savefig does not create it.
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, filename))
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    
    

In [3]:

# --- GENERIC HELPER TO TRAIN AND COMPARE MODELS ---
def train_and_evaluate(model, X_train, y_train, X_test, y_test, label):
    """Fit ``model``, report its test metrics and save its confusion matrix.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and targets.
    X_test, y_test
        Test features and targets.
    label : str
        Name printed in the report header and passed on as the plot title.

    Returns
    -------
    None
        Results are printed; the confusion matrix is written by
        ``plot_confusion_matrix``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # Header and accuracy, one per line.
    print(f"\n--- {label} ---", f"Accuracy: {accuracy:.4f}", sep="\n")
    print(classification_report(y_test, predictions))

    # Save the confusion matrix for this run.
    plot_confusion_matrix(y_test, predictions, ekman_labels, label)

# --- VECTORIZERS ---
# Raw texts: vocabulary and IDF weights are learned on the training split
# only, then reused to transform the test split.
vectorizer_raw = TfidfVectorizer()
X_train_raw_tfidf = vectorizer_raw.fit_transform(X_train_raw)
X_test_raw_tfidf = vectorizer_raw.transform(X_test_raw)

# Lemmatized texts: separate vectorizer, same fit-on-train / transform-test scheme.
vectorizer_lem = TfidfVectorizer()
X_train_lem_tfidf = vectorizer_lem.fit_transform(X_train_lem)
X_test_lem_tfidf = vectorizer_lem.transform(X_test_lem)

In [4]:

# --- MODELS ---
# Fixed seed so the decision tree gives the same result on every run;
# without it, scikit-learn's random tie-breaking between equally good splits
# can change the reported metrics from one execution to the next.
RANDOM_STATE = 42

# Display name -> unfitted estimator. The same instances are refitted for
# every feature set in the comparison cell.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM (linear)': SVC(kernel='linear', C=1),
    'Decision Tree': DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE)
}

In [None]:
# --- FINAL COMPARISON ---
# Each entry: (banner to print, train features, test features, tag appended
# to the model name). Every model is trained and evaluated on both feature sets.
feature_sets = (
    ("\n=== SIN LEMATIZAR ===", X_train_raw_tfidf, X_test_raw_tfidf, "sin lematizar"),
    ("\n=== CON LEMATIZAR ===", X_train_lem_tfidf, X_test_lem_tfidf, "lematizado"),
)

for banner, train_features, test_features, tag in feature_sets:
    print(banner)
    for model_name, model in models.items():
        train_and_evaluate(model, train_features, y_train, test_features, y_test, f"{model_name} ({tag})")


=== SIN LEMATIZAR ===

--- Logistic Regression (sin lematizar) ---
Accuracy: 0.6371
              precision    recall  f1-score   support

           0       0.58      0.33      0.42       572
           1       0.68      0.37      0.48        76
           2       0.73      0.44      0.55        80
           3       0.77      0.77      0.77      1863
           4       0.69      0.43      0.53       283
           5       0.63      0.22      0.32       488
           6       0.53      0.78      0.63      1606

    accuracy                           0.64      4968
   macro avg       0.66      0.48      0.53      4968
weighted avg       0.65      0.64      0.62      4968


--- SVM (linear) (sin lematizar) ---
Accuracy: 0.6345
              precision    recall  f1-score   support

           0       0.55      0.34      0.42       572
           1       0.53      0.42      0.47        76
           2       0.64      0.68      0.65        80
           3       0.79      0.74      0.77   

=== SIN LEMATIZAR ===

--- Logistic Regression (sin lematizar) ---
Accuracy: 0.6371
              precision    recall  f1-score   support

           0       0.58      0.33      0.42       572
           1       0.68      0.37      0.48        76
           2       0.73      0.44      0.55        80
           3       0.77      0.77      0.77      1863
           4       0.69      0.43      0.53       283
           5       0.63      0.22      0.32       488
           6       0.53      0.78      0.63      1606

    accuracy                           0.64      4968
   macro avg       0.66      0.48      0.53      4968
weighted avg       0.65      0.64      0.62      4968


--- SVM (linear) (sin lematizar) ---
Accuracy: 0.6345
              precision    recall  f1-score   support

           0       0.55      0.34      0.42       572
           1       0.53      0.42      0.47        76
           2       0.64      0.68      0.65        80
           3       0.79      0.74      0.77      1863
           4       0.64      0.46      0.54       283
           5       0.64      0.19      0.30       488
           6       0.53      0.79      0.64      1606

    accuracy                           0.63      4968
   macro avg       0.62      0.52      0.54      4968
weighted avg       0.65      0.63      0.62      4968


--- Decision Tree (sin lematizar) ---
Accuracy: 0.5795
              precision    recall  f1-score   support

           0       0.40      0.31      0.35       572
           1       0.43      0.33      0.37        76
           2       0.51      0.40      0.45        80
           3       0.72      0.73      0.72      1863
           4       0.56      0.40      0.47       283
           5       0.38      0.23      0.29       488
           6       0.53      0.66      0.59      1606

    accuracy                           0.58      4968
   macro avg       0.50      0.44      0.46      4968
weighted avg       0.57      0.58      0.57      4968


=== CON LEMATIZAR ===

--- Logistic Regression (lematizado) ---
Accuracy: 0.6270
              precision    recall  f1-score   support

           0       0.58      0.31      0.40       572
           1       0.69      0.33      0.45        76
           2       0.79      0.41      0.54        80
           3       0.75      0.77      0.76      1863
           4       0.69      0.40      0.51       283
           5       0.64      0.22      0.32       488
           6       0.52      0.77      0.62      1606

    accuracy                           0.63      4968
   macro avg       0.67      0.46      0.51      4968
weighted avg       0.64      0.63      0.61      4968


--- SVM (linear) (lematizado) ---
Accuracy: 0.6262
              precision    recall  f1-score   support

           0       0.54      0.33      0.41       572
           1       0.54      0.41      0.47        76
           2       0.65      0.68      0.66        80
           3       0.77      0.74      0.75      1863
           4       0.64      0.46      0.54       283
           5       0.65      0.20      0.30       488
           6       0.53      0.77      0.62      1606

    accuracy                           0.63      4968
   macro avg       0.62      0.51      0.54      4968
weighted avg       0.64      0.63      0.61      4968


--- Decision Tree (lematizado) ---
Accuracy: 0.5733
              precision    recall  f1-score   support

           0       0.42      0.32      0.36       572
           1       0.44      0.36      0.39        76
           2       0.53      0.46      0.49        80
           3       0.69      0.73      0.71      1863
           4       0.54      0.43      0.48       283
           5       0.36      0.24      0.29       488
           6       0.53      0.62      0.57      1606

    accuracy                           0.57      4968
   macro avg       0.50      0.45      0.47      4968
weighted avg       0.56      0.57      0.56      4968