# Análisis de Sentimientos con NLTK VADER
## Análisis de reseñas de Amazon
Análisis completo de sentimientos usando NLTK con el léxico VADER.

In [None]:
!pip install nltk scikit-learn matplotlib seaborn
import nltk
nltk.download("vader_lexicon")
nltk.download("punkt")

In [None]:
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
# Shared VADER analyzer instance; its polarity_scores() method is applied to
# every cleaned review further down.
# NOTE(review): presumably needs the "vader_lexicon" resource downloaded in the
# setup cell -- confirm the setup cell ran first.
sia = SentimentIntensityAnalyzer()

In [None]:
# Read the tab-separated, header-less file into a frame with the columns
# "review" and "sentiment".
column_names = ["review", "sentiment"]
df = pd.read_csv(
    "amazon_cells_labelled.txt",
    sep="\t",
    header=None,
    names=column_names,
)

# Quick sanity report: row count, label distribution and the first rows.
total_reviews = df.shape[0]
print(f"Datos cargados: {total_reviews} reseñas")
print("\nDistribución de sentimientos:")
print(df["sentiment"].value_counts())
print("\nPrimeras reseñas:")
print(df.head())

In [None]:
def clean_text(text):
    """Normalize a review string before sentiment scoring.

    Steps, in order: lowercase, remove URL-like tokens (starting with
    ``http`` or ``www``), drop every character that is not an ASCII letter,
    digit, whitespace or one of ``. ! ? , -``, then collapse whitespace runs
    to a single space and trim both ends.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or a NaN from
            a missing cell) are treated as empty.

    Returns:
        The cleaned string; ``""`` when nothing usable remains.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Single backslashes inside raw strings so that \S and \s are the regex
    # character classes (non-whitespace / whitespace), not a literal backslash.
    text = re.sub(r"http\S+|www\S+", "", text)
    # The text is already lowercase, so a-z covers all remaining ASCII letters.
    text = re.sub(r"[^a-z0-9\s.!?,-]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
# Store the normalized text in a new column, then keep only the rows whose
# cleaned text still has at least one character.
df["review_cleaned"] = df["review"].map(clean_text)
has_text = df["review_cleaned"].str.len().gt(0)
df = df[has_text]
print(f"Dataset después de limpieza: {len(df)} reseñas")

In [None]:
# Score every cleaned review with VADER and attach the four scores to df.
# polarity_scores() returns a dict with the keys neg / neu / pos / compound.
vader_columns = {
    "neg": "vader_negative",
    "neu": "vader_neutral",
    "pos": "vader_positive",
    "compound": "vader_compound",
}
sentiment_scores = df["review_cleaned"].apply(sia.polarity_scores)
# Reuse df's index: rows may have been filtered out earlier, and a fresh
# 0..n-1 index would make concat(axis=1) misalign scores and add NaN rows.
sentiment_df = pd.DataFrame(sentiment_scores.tolist(), index=df.index)
# Drop score columns left over from a previous run of this cell so that
# re-running it does not create duplicate column names.
df = df.drop(columns=list(vader_columns.values()), errors="ignore")
df = pd.concat([df, sentiment_df], axis=1)
df = df.rename(columns=vader_columns)
print("VADER scores aplicados")
print(df[["review", "sentiment", "vader_compound"]].head(10))

In [None]:
# Binarize the compound score: 1 when it reaches the 0.05 cutoff, 0 for
# everything below it.
meets_cutoff = df["vader_compound"] >= 0.05
df["vader_prediction"] = meets_cutoff.astype("int64")
print("Predicciones VADER creadas")
print("Distribución de predicciones:")
print(df["vader_prediction"].value_counts())

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Ground-truth labels and VADER predictions as arrays.
y_true = df["sentiment"].to_numpy()
y_pred = df["vader_prediction"].to_numpy()

# Accuracy plus precision / recall / F1; the latter three fall back to 0
# instead of warning when a denominator is zero.
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    scorer(y_true, y_pred, zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

banner = "=" * 60
print(banner)
print("MÉTRICAS DE CALIDAD")
print(banner)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

In [None]:
# Confusion matrix: rows are the real labels, columns the predicted ones.
# Pinning labels=[0, 1] keeps the matrix 2x2 (and the cm[1, 1] lookups and the
# two target_names valid) even if one class is missing from the data.
class_labels = [0, 1]
cm = confusion_matrix(y_true, y_pred, labels=class_labels)
print("\nMatriz de Confusión:")
# Empty, left-aligned 20-character field so the header lines up with the rows.
print(f"{'':<20} Predicción Neg  Predicción Pos")
print(f"Real Negativo:      {cm[0, 0]:>6}           {cm[0, 1]:>6}")
print(f"Real Positivo:      {cm[1, 0]:>6}           {cm[1, 1]:>6}")
print("\nReporte de Clasificación:")
print(
    classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=["Negativo", "Positivo"],
        zero_division=0,  # same convention as the individual metric calls
    )
)

In [None]:
# Render the confusion matrix as an annotated heatmap with integer cell labels.
class_names = ["Negativo", "Positivo"]
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_title("Matriz de Confusión")
heatmap_ax.set_ylabel("Real")
heatmap_ax.set_xlabel("Predicción")
plt.show()

In [None]:
# Two side-by-side panels: a histogram of compound scores (left) and a
# box plot of compound scores grouped by the real label (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
hist_ax, box_ax = axes

hist_ax.hist(df["vader_compound"], bins=50, edgecolor="black", alpha=0.7)
# Dashed line marks the 0.05 cutoff used for the positive prediction.
hist_ax.axvline(x=0.05, color="green", linestyle="--", label="Threshold Pos")
hist_ax.set_title("Distribución de Scores")
hist_ax.set_xlabel("Compound Score")
hist_ax.legend()

# Labels are set after boxplot() so they replace whatever it puts on the axes.
df.boxplot(column="vader_compound", by="sentiment", ax=box_ax)
box_ax.set_xlabel("Sentimiento Real")
box_ax.set_title("Compound por Sentimiento")

# Blank out the figure-level title.
fig.suptitle("")
plt.show()

In [None]:
# Influential-word analysis: most frequent tokens (longer than two
# characters) in positive versus negative reviews.
is_positive = df["sentiment"] == 1
is_negative = df["sentiment"] == 0

# Join each class's cleaned reviews into one long string, then tokenize it.
positive_reviews = df.loc[is_positive, "review_cleaned"].str.cat(sep=" ")
negative_reviews = df.loc[is_negative, "review_cleaned"].str.cat(sep=" ")
positive_words = word_tokenize(positive_reviews)
negative_words = word_tokenize(negative_reviews)

positive_freq = Counter(token for token in positive_words if len(token) > 2)
negative_freq = Counter(token for token in negative_words if len(token) > 2)

# Print the ten most common tokens for each class.
report_sections = (
    ("Palabras en RESEÑAS POSITIVAS:", positive_freq),
    ("\nPalabras en RESEÑAS NEGATIVAS:", negative_freq),
)
for heading, counts in report_sections:
    print(heading)
    for word, freq in counts.most_common(10):
        print(f"  {word:<15} - {freq:>4} veces")

In [None]:
# Misclassification analysis: split rows into correct / incorrect predictions
# and break the incorrect ones down into false positives and false negatives.
df["correct"] = df["sentiment"] == df["vader_prediction"]
misclassified = df[~df["correct"]]
correctly_classified = df[df["correct"]]
# False positive: real label 0 but predicted 1. False negative: the reverse.
fp = misclassified[(misclassified["sentiment"] == 0) & (misclassified["vader_prediction"] == 1)]
fn = misclassified[(misclassified["sentiment"] == 1) & (misclassified["vader_prediction"] == 0)]
print("ANÁLISIS DE ERRORES")
print(f"Correctas: {len(correctly_classified)} ({len(correctly_classified) / len(df) * 100:.2f}%)")
print(f"Incorrectas: {len(misclassified)} ({len(misclassified) / len(df) * 100:.2f}%)")
print(f"Falsos Positivos: {len(fp)}")
print(f"Falsos Negativos: {len(fn)}")
# Show up to two examples per error type, truncated to the first 60 characters
# of the original (uncleaned) review text.
print("\nEjemplos Falsos Positivos:")
for review in fp["review"].head(2):
    print(f"  {review[:60]}...")
print("\nEjemplos Falsos Negativos:")
for review in fn["review"].head(2):
    print(f"  {review[:60]}...")