In [1]:

import os
import re

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# ***** DATA LOADING *****
train_data = pd.read_csv("../../Data/BasedOnSentiments/train_sentiments.csv")
test_data = pd.read_csv("../../Data/BasedOnSentiments/test_sentiments.csv")

# Replace missing texts with a blank string in BOTH splits. A NaN left in
# either frame would reach preprocess_text() as a float (no .lower()) and
# would also be rejected by the TF-IDF vectorizer.
train_data['Text'] = train_data['Text'].fillna(" ")
test_data['Text'] = test_data['Text'].fillna(" ")

# --- PREPROCESSING FOR LEMMATIZATION ---
# NLTK resource downloads, left disabled; enable them if the corpora are not
# installed locally.
#nltk.download('punkt')
#nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Return `text` lower-cased, split into word tokens, lemmatized and re-joined.

    Tokens are the `\\w+` runs found between word boundaries, so punctuation
    is dropped. Each token is passed to the module-level `lemmatizer` and the
    results are joined with single spaces.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return " ".join(map(lemmatizer.lemmatize, tokens))

# Raw texts and their sentiment labels, per split
X_train_raw, y_train = train_data['Text'].values, train_data['Sentiment'].values
X_test_raw, y_test = test_data['Text'].values, test_data['Sentiment'].values

# Lemmatized copies of the same texts (same order as the raw arrays)
X_train_lem = list(map(preprocess_text, X_train_raw))
X_test_lem = list(map(preprocess_text, X_test_raw))

In [2]:
# --- OUTPUT DIRECTORY FOR THE PLOTS ---
# Path prefix (relative to the notebook's working directory) under which
# plot_confusion_matrix() saves its PNG files.
output_dir = "../../Plots/Experiment_Multiclass/Sentiments/"

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Plot a confusion matrix and save it to disk
def plot_confusion_matrix(y_true, y_pred, labels, title):
    """Render a confusion-matrix heatmap and save it as a PNG under `output_dir`.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class codes. The matrix is computed for the codes
        ``0 .. len(labels) - 1``, i.e. classes are assumed to be encoded as
        consecutive integers starting at 0.
    labels : sequence of str
        Tick labels for the classes, in class-code order.
    title : str
        Appended to the plot title and used to derive the file name
        (lower-cased, spaces replaced by underscores, parentheses removed).

    Returns
    -------
    None
        The figure is written to disk and then closed, so nothing is shown
        inline.
    """
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(labels))))

    # Explicit figure/axes instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=labels,
            yticklabels=labels,
            cbar=True,
            ax=ax,
        )
        ax.set_title(f'Matriz de Confusión - {title}')
        ax.set_ylabel('Etiqueta Verdadera')
        ax.set_xlabel('Etiqueta Predicha')
        plt.setp(ax.get_xticklabels(), rotation=45)
        plt.setp(ax.get_yticklabels(), rotation=0)
        fig.tight_layout()

        # Create the target directory on demand; savefig() does not create
        # missing parent directories and would raise FileNotFoundError.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        filename = title.replace(" ", "_").replace("(", "").replace(")", "").lower() + ".png"
        fig.savefig(os.path.join(output_dir, filename))
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

In [3]:
# --- SHARED TRAIN / EVALUATE ROUTINE ---
def train_and_evaluate(model, X_train, y_train, X_test, y_test, label):
    """Fit `model`, report its test-set metrics, and save its confusion matrix.

    The model is fitted in place on (X_train, y_train). Accuracy and the
    classification report for (X_test, y_test) are printed under a header
    containing `label`, which is also forwarded to plot_confusion_matrix()
    as the plot title. Nothing is returned.
    """
    # Display names for the confusion-matrix axes; assumed to match class
    # codes 0-3 in this order -- TODO confirm against the dataset encoding.
    class_names = ["Positive", "Negative", "Ambiguous", "Neutral"]

    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    accuracy = accuracy_score(y_test, predicted)

    print(f"\n--- {label} ---", f"Accuracy: {accuracy:.4f}", sep="\n")
    print(classification_report(y_test, predicted))

    # Save (not display) the confusion matrix for this run
    plot_confusion_matrix(y_test, predicted, class_names, label)

In [4]:
# --- VECTORIZERS ---
# One independent TF-IDF vectorizer per text variant (raw / lemmatized).
vectorizer_raw, vectorizer_lem = TfidfVectorizer(), TfidfVectorizer()

# Each vectorizer is fitted on its training texts first, then reused to
# transform the matching test texts.
X_train_raw_tfidf, X_test_raw_tfidf = (
    vectorizer_raw.fit_transform(X_train_raw),
    vectorizer_raw.transform(X_test_raw),
)

X_train_lem_tfidf, X_test_lem_tfidf = (
    vectorizer_lem.fit_transform(X_train_lem),
    vectorizer_lem.transform(X_test_lem),
)

In [5]:
# --- MODELS ---
# Classifiers to compare, keyed by the display name that ends up in the
# printed report headers and in the saved plot file names.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM (linear)': SVC(kernel='linear', C=1),
    # Seeded so the tree (and therefore its reported metrics) is reproducible
    # across Restart & Run All; without random_state, tie-breaking between
    # equally good splits can vary from run to run.
    'Decision Tree': DecisionTreeClassifier(criterion='entropy', random_state=42)
}

In [6]:
# --- FINAL COMPARISON ---
# Each entry: (banner to print, train matrix, test matrix, tag appended to the model name).
# Every model in `models` is re-fitted for each variant, raw first, then lemmatized.
experiments = [
    ("\n=== SIN LEMATIZAR ===", X_train_raw_tfidf, X_test_raw_tfidf, "sin lematizar"),
    ("\n=== CON LEMATIZAR ===", X_train_lem_tfidf, X_test_lem_tfidf, "lematizado"),
]

for banner, X_tr, X_te, tag in experiments:
    print(banner)
    for model_name, model in models.items():
        train_and_evaluate(model, X_tr, y_train, X_te, y_test, f"{model_name} ({tag})")



=== SIN LEMATIZAR ===

--- Logistic Regression (sin lematizar) ---
Accuracy: 0.6539
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1863
           1       0.68      0.52      0.59      1070
           2       0.64      0.21      0.32       488
           3       0.54      0.75      0.63      1606

    accuracy                           0.65      5027
   macro avg       0.66      0.56      0.58      5027
weighted avg       0.67      0.65      0.64      5027


--- SVM (linear) (sin lematizar) ---
Accuracy: 0.6543
              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1863
           1       0.69      0.53      0.60      1070
           2       0.63      0.19      0.30       488
           3       0.54      0.77      0.63      1606

    accuracy                           0.65      5027
   macro avg       0.66      0.56      0.57      5027
weighted avg       0.67      0.65      0.64  


=== SIN LEMATIZAR ===

--- Logistic Regression (sin lematizar) ---
Accuracy: 0.6539
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      1863
           1       0.68      0.52      0.59      1070
           2       0.64      0.21      0.32       488
           3       0.54      0.75      0.63      1606

    accuracy                           0.65      5027
   macro avg       0.66      0.56      0.58      5027
weighted avg       0.67      0.65      0.64      5027


--- SVM (linear) (sin lematizar) ---
Accuracy: 0.6543
              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1863
           1       0.69      0.53      0.60      1070
           2       0.63      0.19      0.30       488
           3       0.54      0.77      0.63      1606

    accuracy                           0.65      5027
   macro avg       0.66      0.56      0.57      5027
weighted avg       0.67      0.65      0.64      5027


--- Decision Tree (sin lematizar) ---
Accuracy: 0.6025
              precision    recall  f1-score   support

           0       0.73      0.73      0.73      1863
           1       0.58      0.49      0.53      1070
           2       0.36      0.20      0.26       488
           3       0.53      0.65      0.58      1606

    accuracy                           0.60      5027
   macro avg       0.55      0.52      0.53      5027
weighted avg       0.60      0.60      0.59      5027


=== CON LEMATIZAR ===

--- Logistic Regression (lematizado) ---
Accuracy: 0.6467
              precision    recall  f1-score   support

           0       0.76      0.75      0.76      1863
           1       0.68      0.51      0.59      1070
           2       0.68      0.21      0.32       488
           3       0.53      0.74      0.62      1606

    accuracy                           0.65      5027
   macro avg       0.67      0.56      0.57      5027
weighted avg       0.67      0.65      0.64      5027


--- SVM (linear) (lematizado) ---
Accuracy: 0.6447
              precision    recall  f1-score   support

           0       0.76      0.74      0.75      1863
           1       0.68      0.53      0.60      1070
           2       0.63      0.20      0.30       488
           3       0.53      0.74      0.62      1606

    accuracy                           0.64      5027
   macro avg       0.65      0.55      0.57      5027
weighted avg       0.66      0.64      0.63      5027


--- Decision Tree (lematizado) ---
Accuracy: 0.5944
              precision    recall  f1-score   support

           0       0.71      0.73      0.72      1863
           1       0.55      0.50      0.52      1070
           2       0.37      0.23      0.28       488
           3       0.53      0.62      0.57      1606

    accuracy                           0.59      5027
   macro avg       0.54      0.52      0.52      5027
weighted avg       0.59      0.59      0.59      5027
