In [26]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Load the indexed train / validation / test splits (read in that order)
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/train_indexado.csv",
        "../Data/valid_indexado.csv",
        "../Data/test_indexado.csv",
    )
)

# Emotion labels, grouped by the coarse sentiment each one is assigned to
positive = [
    'admiration', 'amusement', 'approval', 'caring', 'desire', 'excitement',
    'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief',
]
negative = [
    'anger', 'annoyance', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'fear', 'grief', 'nervousness', 'remorse', 'sadness',
]
ambiguous = ['confusion', 'curiosity', 'realization', 'surprise']
neutral = ['neutral']

# Flatten the groups into a single emotion -> sentiment lookup
# (groups are visited in the order positive, negative, ambiguous, neutral).
sentiment_map = {
    emotion: sentiment
    for sentiment, emotions in (
        ("positive", positive),
        ("negative", negative),
        ("ambiguous", ambiguous),
        ("neutral", neutral),
    )
    for emotion in emotions
}
sentiment_labels = ["positive", "negative", "ambiguous", "neutral"]

# Two-way mapping between a sentiment name and its position in sentiment_labels
sentiments_to_id = dict(zip(sentiment_labels, range(len(sentiment_labels))))
id_to_sentiments = dict(enumerate(sentiment_labels))

# Determine which sentiment groups are active for one record
def get_sentiment_emotions(row):
    """Return the distinct sentiment groups of the emotions flagged in ``row``.

    Args:
        row: A pandas Series (one DataFrame row) whose labels include emotion
            names; an emotion counts as active when its value equals 1.
            Labels that are not keys of ``sentiment_map`` are ignored.

    Returns:
        list[str]: The sentiment names of the active emotions, without
        duplicates, in the order their columns first appear in ``row``.
        Empty when no mapped emotion is active.
    """
    # A list with an explicit membership check is used instead of a set so the
    # result order is the same on every run (set iteration order of strings
    # can change between interpreter sessions).
    active = []
    for emotion, value in row.items():
        sentiment = sentiment_map.get(emotion)
        if sentiment is None:
            # Not an emotion column (e.g. text or identifier columns).
            continue
        if value == 1 and sentiment not in active:
            active.append(sentiment)
    return active


# Plotting helpers for the sentiment distributions
def _save_bar_chart(labels, values, title, filename, color, figsize, rotation=None):
    """Render an annotated bar chart and save it to ``filename``.

    Args:
        labels: Category names, one per bar.
        values: Bar heights, aligned with ``labels``.
        title: Figure title.
        filename: Output image path; missing parent folders are created.
        color: Matplotlib color applied to every bar.
        figsize: ``(width, height)`` of the figure in inches.
        rotation: Rotation in degrees for the x tick labels, or ``None`` to
            keep matplotlib's default.
    """
    # savefig does not create missing folders, so make sure the target exists.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=figsize)
    try:
        bars = ax.bar(labels, values, color=color)
        ax.set_title(title)
        ax.set_ylabel("Número de registros")
        if rotation is not None:
            ax.tick_params(axis="x", labelrotation=rotation)

        # Write each bar's value just above it.
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, height, str(height),
                    ha='center', va='bottom', fontsize=9)

        fig.tight_layout()
        fig.savefig(filename)
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close(fig)


def plot_sentiments_distribution(sentiment_lists, title, filename, color):
    """Save a bar chart of how often each sentiment name occurs.

    Args:
        sentiment_lists: Iterable of lists of sentiment names (one list per
            record); every occurrence in every list is counted.
        title: Figure title.
        filename: Output image path.
        color: Bar color.
    """
    counts = Counter(
        sentiment for sublist in sentiment_lists for sentiment in sublist
    )
    _save_bar_chart(list(counts.keys()), list(counts.values()),
                    title, filename, color, figsize=(10, 5), rotation=45)


def plot_sentiments_distribution_after(sentiment_ids, title, filename, color):
    """Save a bar chart of how often each sentiment id occurs.

    Args:
        sentiment_ids: Iterable of numeric sentiment ids; each is translated
            to its name through ``id_to_sentiments``.
        title: Figure title.
        filename: Output image path.
        color: Bar color.
    """
    # Ids with no entry in id_to_sentiments get a visible placeholder label
    # instead of raising KeyError, so unexpected ids show up in the chart.
    labels = [id_to_sentiments.get(i, f"unknown ({i})") for i in sentiment_ids]
    counts = Counter(labels)
    _save_bar_chart(list(counts.keys()), list(counts.values()),
                    title, filename, color, figsize=(10, 5), rotation=45)


def plot_distribution(solo_una, conflictivos, title, filename, color):
    """Save a two-bar chart comparing single- vs. multi-sentiment record counts.

    Args:
        solo_una: Value for the "Un solo sentimiento" bar.
        conflictivos: Value for the "Múltiples sentimientos" bar.
        title: Figure title.
        filename: Output image path.
        color: Bar color.
    """
    _save_bar_chart(["Un solo sentimiento", "Múltiples sentimientos"],
                    [solo_una, conflictivos],
                    title, filename, color, figsize=(7, 5))


# Tag every record of each split with the sentiment groups of its active emotions
train_data["Sentiments"] = train_data.apply(get_sentiment_emotions, axis="columns")
valid_data["Sentiments"] = valid_data.apply(get_sentiment_emotions, axis="columns")
test_data["Sentiments"] = test_data.apply(get_sentiment_emotions, axis="columns")

# Save the per-split sentiment distribution before any rows are removed
plot_sentiments_distribution(
    sentiment_lists=train_data["Sentiments"],
    title="Train: Based On Sentiments (Antes)",
    filename="../Plots/Experiment1/Sentiments/train_sentiments_before.png",
    color="skyblue",
)
plot_sentiments_distribution(
    sentiment_lists=valid_data["Sentiments"],
    title="Valid: Based On Sentiments (Antes)",
    filename="../Plots/Experiment1/Sentiments/valid_sentiments_before.png",
    color="orange",
)
plot_sentiments_distribution(
    sentiment_lists=test_data["Sentiments"],
    title="Test: Based On Sentiments (Antes)",
    filename="../Plots/Experiment1/Sentiments/test_sentiments_before.png",
    color="lightgreen",
)




# Locate the records that carry more than one sentiment group
def get_conflicting_indices(df):
    """Return the index labels of rows whose ``Sentiments`` list has 2+ entries.

    Index labels (not enumerate positions) are returned because the result is
    passed to ``DataFrame.drop(index=...)``, which matches on labels. On a
    default RangeIndex labels and positions coincide, so results there are
    unchanged.

    Args:
        df: DataFrame with a ``Sentiments`` column holding one sized
            collection (e.g. a list) per row.

    Returns:
        list: Index labels of the rows with more than one sentiment, in row
        order. Empty when there are none.
    """
    return [
        label
        for label, sentiments in zip(df.index, df["Sentiments"])
        if len(sentiments) > 1
    ]

# Find, for each split, the rows flagged by get_conflicting_indices
conflict_train_idx = get_conflicting_indices(df=train_data)
conflict_valid_idx = get_conflicting_indices(df=valid_data)
conflict_test_idx = get_conflicting_indices(df=test_data)

# Everything that was not flagged counts towards the "single category" total
solo_una_train = len(train_data) - len(conflict_train_idx)
solo_una_valid = len(valid_data) - len(conflict_valid_idx)
solo_una_test = len(test_data) - len(conflict_test_idx)

# Plot, per split, how many records were flagged versus not flagged
plot_distribution(
    solo_una=solo_una_train,
    conflictivos=len(conflict_train_idx),
    title="Train: Un sentimiento vs. Más de uno",
    filename="../Plots/Experiment1/Sentiments/Distribution/distribucion_train.png",
    color="lightcoral",
)
plot_distribution(
    solo_una=solo_una_valid,
    conflictivos=len(conflict_valid_idx),
    title="Valid: Un sentimiento vs. Más de uno",
    filename="../Plots/Experiment1/Sentiments/Distribution/distribucion_valid.png",
    color="lightsalmon",
)
plot_distribution(
    solo_una=solo_una_test,
    conflictivos=len(conflict_test_idx),
    title="Test: Un sentimiento vs. Más de uno",
    filename="../Plots/Experiment1/Sentiments/Distribution/distribucion_test.png",
    color="lightseagreen",
)

# Text summary of the same counts
print("\nCONTEO DE REGISTROS POR CATEGORÍAS DE SENTIMIENTO:")
print("Train - Solo una categoría:", solo_una_train,
      "| Más de una:", len(conflict_train_idx))
print("Valid - Solo una categoría:", solo_una_valid,
      "| Más de una:", len(conflict_valid_idx))
print("Test  - Solo una categoría:", solo_una_test,
      "| Más de una:", len(conflict_test_idx))




CONTEO DE REGISTROS POR CATEGORÍAS DE SENTIMIENTO:
Train - Solo una categoría: 40030 | Más de una: 3380
Valid - Solo una categoría: 5006 | Más de una: 420
Test  - Solo una categoría: 5027 | Más de una: 400


In [27]:
# Remove the flagged rows from each split and renumber the remaining rows from 0
train_clean = (
    train_data
    .drop(labels=conflict_train_idx, axis="index")
    .reset_index(drop=True)
)
valid_clean = (
    valid_data
    .drop(labels=conflict_valid_idx, axis="index")
    .reset_index(drop=True)
)
test_clean = (
    test_data
    .drop(labels=conflict_test_idx, axis="index")
    .reset_index(drop=True)
)


# Numeric id of each sentiment label = its position in sentiment_labels
sentiment_to_id = dict(
    (label, idx) for idx, label in enumerate(sentiment_labels)
)
# Translate a record's sentiment list into a numeric sentiment id
def categorize_sentiments(row):
    """Return the id of the first recognised sentiment in ``row['Sentiments']``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``Sentiments``
            entry holding an iterable of sentiment names.

    Returns:
        int: ``sentiment_to_id`` value of the first name found in that
        mapping; ``len(sentiment_labels)`` when no name is recognised
        (including an empty list).
    """
    known_ids = (
        sentiment_to_id[name]
        for name in row['Sentiments']
        if name in sentiment_to_id
    )
    # Fallback: one past the last real id, used when nothing matched.
    return next(known_ids, len(sentiment_labels))

# Build labelled copies of the cleaned splits; the *_clean frames stay untouched
train_data_clean = train_clean.assign(
    Sentiment=train_clean.apply(categorize_sentiments, axis=1)
)
valid_data_clean = valid_clean.assign(
    Sentiment=valid_clean.apply(categorize_sentiments, axis=1)
)
test_data_clean = test_clean.assign(
    Sentiment=test_clean.apply(categorize_sentiments, axis=1)
)

# Save the per-split sentiment distribution after the cleaning step
plot_sentiments_distribution_after(
    sentiment_ids=train_data_clean['Sentiment'],
    title="Train: Based On Sentiments (Después)",
    filename="../Plots/Experiment1/Sentiments/train_sentiments_after.png",
    color="skyblue",
)
plot_sentiments_distribution_after(
    sentiment_ids=valid_data_clean['Sentiment'],
    title="Valid: Based On Sentiments (Después)",
    filename="../Plots/Experiment1/Sentiments/valid_sentiments_after.png",
    color="orange",
)
plot_sentiments_distribution_after(
    sentiment_ids=test_data_clean['Sentiment'],
    title="Test: Based On Sentiments (Después)",
    filename="../Plots/Experiment1/Sentiments/test_sentiments_after.png",
    color="lightgreen",
)

# Export only the text, its sentiment id and the record ID (no index column)
train_data_clean.to_csv(
    '../Data/BasedOnSentiments/train_sentiments.csv',
    columns=["Text", "Sentiment", "ID"],
    index=False,
)
valid_data_clean.to_csv(
    '../Data/BasedOnSentiments/valid_sentiments.csv',
    columns=["Text", "Sentiment", "ID"],
    index=False,
)
test_data_clean.to_csv(
    '../Data/BasedOnSentiments/test_sentiments.csv',
    columns=["Text", "Sentiment", "ID"],
    index=False,
)

print("\nArchivos guardados correctamente.")


Archivos guardados correctamente.


In [28]:
# Fit TF-IDF on the cleaned training texts without overwriting train_data_clean
from sklearn.feature_extraction.text import TfidfVectorizer

# Raw training sentences as an array
sentences_train = train_data_clean['Text'].values

vectorizer = TfidfVectorizer()
# NOTE(review): second, unfitted vectorizer; it is not used anywhere in this
# cell. Confirm whether a later cell needs it before removing.
temp = TfidfVectorizer()

# Learn the vocabulary / IDF weights from the training sentences only
vectorizer.fit(sentences_train)

# Report the size of the learned vocabulary
feature_names = vectorizer.get_feature_names_out()
n_features = len(feature_names)
print(f"Número de features detectados: {n_features}")




Número de features detectados: 25311
