# NLP

In [None]:

#Importar librerias
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import random
from datetime import datetime, timedelta
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer


In [12]:
# Load the dataset. low_memory=False makes pandas parse the file in a single
# pass instead of in chunks, so each column gets one consistently inferred dtype.
df = pd.read_csv(
    filepath_or_buffer='data/df_2.csv',
    low_memory=False,
)

## 1️⃣ Preprocesamiento del texto

In [None]:
# Fetch the NLTK resources: stop-word lists, Punkt tokenizer data and WordNet.
for recurso in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(recurso)
# Kept as the cell's final expression so its return value is still echoed.
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jose-\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jose-\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jose-\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# List the DataFrame's column labels.
df.columns

Index(['date', 'HORA', 'session_id', 'device_id', 'device_type', 'os',
       'date_id', 'quantity', 'abandonment_time', 'Product Name', 'Category',
       'Price', 'Discount', 'Tax', 'Stock Level', 'Customer ID', 'Age',
       'Age Group', 'Location', 'gender', 'Id_compra', 'Supplier ID',
       'Shipping Cost', 'Shipping Method', 'Rate', 'Seasonality', 'Popularity',
       'reviewId', 'content', 'score', 'thumbsUpCount', 'at', 'replyContent',
       'Product ID'],
      dtype='object')

In [17]:
# Register the local NLTK data directory on NLTK's search path.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to nltk.data.path each time.
# NOTE(review): this is a user-specific absolute Windows path; on another
# machine it must be replaced (or dropped if NLTK's default locations suffice).
NLTK_DATA_DIR = "C:/Users/jose-/AppData/Roaming/nltk_data"
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Spanish stop-word set and Snowball stemmer used by the text-cleaning step.
stop_words = set(stopwords.words('spanish'))
stemmer = SnowballStemmer('spanish')

# Text-cleaning helper
def limpiar_texto(texto):
    """Normalize one text string into a space-separated sequence of stems.

    Steps, in order:
      1. lowercase the input;
      2. replace every non-word character (regex ``\\W``) with a space;
      3. tokenize with ``word_tokenize``;
      4. drop tokens found in the module-level ``stop_words`` set;
      5. stem the remaining tokens with the module-level ``stemmer``.

    Args:
        texto: The string to clean (must support ``.lower()``).

    Returns:
        The surviving stems joined by single spaces ('' if none survive).
    """
    normalizado = re.sub(r'\W', ' ', texto.lower())
    raices = []
    for token in word_tokenize(normalizado):
        if token in stop_words:
            continue  # stop words are discarded before stemming
        raices.append(stemmer.stem(token))
    return ' '.join(raices)

# Apply the preprocessing to the 'content' column.
# Missing reviews are replaced with '' first; otherwise astype(str) would turn
# NaN into the literal text 'nan', which would then be stemmed and vectorized
# as if it were a real word.
df['content_clean'] = df['content'].fillna('').astype(str).apply(limpiar_texto)

In [19]:
# Ver las primeras filas del resultado
df[['content', 'content_clean']].head()

Unnamed: 0,content,content_clean
0,Muy buena calidad por el precio. es exactament...,buen calid preci exact describ
1,La velocidad y rendimiento son excelentes. el ...,veloc rendimient excelent material primer calid
2,Perfectos para uso diario. el tamaño no era el...,perfect uso diari tamañ esper
3,Ideal para los amantes del género. se nota que...,ideal amant gener not product bien fabric
4,El diseño es elegante y muy funcional. el enví...,diseñ eleg funcional envi rap


## 2️⃣ Convertir texto en representaciones numéricas

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF matrix from the cleaned text. max_features=500 caps the
# vocabulary at the 500 terms with the highest term frequency across the corpus.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split further down, so the vocabulary and IDF weights also see the
# rows that later become the test set.
vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(raw_documents=df['content_clean'])

## 3️⃣ Entrenar un modelo de clasificación

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [22]:
# Bucket the numeric score into three sentiment labels
# (intended scale, per the original note: 1-6 negative, 7 neutral, 8-10 positive).
# NOTE(review): every value that is neither <= 6 nor == 7 falls through to
# 'positivo' — that includes missing (NaN) scores and non-integer values
# above 6; confirm the column only holds integers.
def _score_a_sentimiento(score):
    """Return the sentiment label for a single score value."""
    if score <= 6:
        return 'negativo'
    if score == 7:
        return 'neutro'
    return 'positivo'

df['sentiment'] = df['score'].apply(_score_a_sentimiento)

In [23]:
# Hold out 20% of the rows for evaluation: X holds the TF-IDF features and
# df['sentiment'] the target labels. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [27]:
# Train the random-forest classifier.
# random_state is fixed (same seed as the train/test split) because the forest
# is built with random bootstrapping and feature sampling; without a seed each
# run produces a different model and different metrics.
modelo = RandomForestClassifier(random_state=42)
modelo.fit(X_train, y_train)

In [28]:
# Predict the held-out rows with the random forest and print the per-class
# precision / recall / F1 report.
y_pred = modelo.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

    negativo       1.00      1.00      1.00    129831
      neutro       0.27      0.00      0.00     20389
    positivo       0.71      1.00      0.83     49780

    accuracy                           0.90    200000
   macro avg       0.66      0.67      0.61    200000
weighted avg       0.85      0.90      0.86    200000



In [None]:
# Summarize the random-forest predictions with accuracy, recall and F1.
from sklearn.metrics import accuracy_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)

# Both metrics use support-weighted averaging over the classes.
# Note: support-weighted recall is mathematically equal to accuracy, so the
# first two numbers displayed below always coincide.
recall, f1 = (
    metrica(y_test, y_pred, average='weighted')
    for metrica in (recall_score, f1_score)
)

accuracy, recall, f1

           

(0.897985, 0.897985, 0.8557870751925052)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the same training split
# (fit() returns the estimator itself, so it can be chained).
model = MultinomialNB().fit(X_train, y_train)

# Evaluate the model. score() returns the mean accuracy on the test split.
# Note: this rebinds `accuracy`, replacing the random-forest value computed
# in the previous cell.
accuracy = model.score(X_test, y_test)
print(f"Precisión del modelo: {accuracy:.2f}")

Precisión del modelo: 0.83


In [32]:
# Confusion matrix (rows = true class, columns = predicted class).
# Note: y_pred still holds the random-forest predictions from the earlier
# cell, not the Naive Bayes model fitted just above.

from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[129831,      0,      0],
       [     0,      8,  20381],
       [     0,     22,  49758]], dtype=int64)