In [2]:
# Librerías

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Carga de corpus

In [3]:
# Read the review corpus CSV from the working directory and preview its first five rows.
df_corpus = pd.read_csv(filepath_or_buffer='df_corpus.csv')
df_corpus.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,categoría,sentimiento,tokens,cleaned_text,review_length,cleaned_reviewText
0,A2VGK9S4DKTMF,B002V9X78U,BWallace,"[1, 1]",Let's be clear; I love how much I hate this pr...,1,"Lame, even for a white elephant gift",1386720000,2013-12-11,beauty,0,"['lets', 'clear', 'love', 'much', 'hate', 'pro...",lets clear love much hate product bought white...,121,let clear love much hate product bought white ...
1,A1N2EONG2Y6NUZ,B0001EL5R2,Shana,"[0, 0]","This made my skin so greasy and shiny, in addi...",1,Hello greasy skin!,1373587200,2013-07-12,beauty,0,"['made', 'skin', 'greasy', 'shiny', 'addition'...",made skin greasy shiny addition helping acne d...,23,made skin greasy shiny addition helping acne d...
2,ANWZD7ZYE50UE,B005MZS03C,Gilberto Prieto,"[0, 5]",i receive the perfume today and they are fake ...,1,FAKE!!!,1379894400,2013-09-23,beauty,0,"['receive', 'perfume', 'today', 'fake', 'also'...",receive perfume today fake also bought gucci g...,40,receive perfume today fake also bought gucci g...
3,A20RM3LL5IW5JO,B003UZ4R24,Oulzo,"[2, 7]",I got excited after seeing the multiple videos...,1,"Do not buy, the mask is full of alcohol!!!!!!!",1389398400,2014-01-11,beauty,0,"['got', 'excited', 'seeing', 'multiple', 'vide...",got excited seeing multiple videos youtube hea...,237,got excited seeing multiple video youtube heav...
4,A3E3GD3TABXKU1,B0017TZD7S,Loren w Christensen,"[2, 5]","Maybe I don't get the point with this. Okay, t...",2,don't get its purpose,1215734400,2008-07-11,beauty,0,"['maybe', 'dont', 'get', 'point', 'okay', 'use...",maybe dont get point okay use sex sell get sup...,165,maybe dont get point okay use sex sell get sup...


# Modelos

## División de muestra en train y test

In [4]:
# Target: the sentiment label column.
y = df_corpus['sentimiento']

# pandas reads empty CSV fields back as NaN, and TfidfVectorizer raises on NaN
# documents. Treat any missing cleaned review as an empty document instead.
texts = df_corpus['cleaned_reviewText'].fillna('')

# Stratified 80/20 split with a fixed seed. The raw-text splits get their own
# names so X_train / X_test only ever refer to the TF-IDF matrices built below.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)

print("Distribución de clases en el conjunto de entrenamiento:")
print(y_train.value_counts(normalize=True))

# TF-IDF over unigrams and bigrams, capped at 15000 features. The vectorizer is
# fitted on the training texts only and then applied unchanged to the test texts.
tfidf_vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1, 2))
X_train = tfidf_vectorizer.fit_transform(X_train_text)
X_test = tfidf_vectorizer.transform(X_test_text)

Distribución de clases en el conjunto de entrenamiento:
sentimiento
1    0.5
0    0.5
Name: proportion, dtype: float64


## Entrenamos con Naive Bayes y Regresión Logística

In [5]:
# Multinomial Naive Bayes: fit on the training matrix, then predict the test matrix.
# (scikit-learn's fit() returns the estimator, so construction and fitting are chained.)
nb_model = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Logistic Regression with a fixed seed and up to 1000 solver iterations,
# trained and evaluated on the same matrices as the Naive Bayes model.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)


## Resultados

In [6]:
def print_metrics(y_true, y_pred, model_name):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Display name inserted into the header line.

    Returns:
        None. A header line is printed, followed by one line per metric with
        the score formatted to four decimals. Each score comes from the
        corresponding scikit-learn metric function called with its defaults.
    """
    print(f"Resultados del modelo {model_name}:")
    # (label, scorer) pairs in the order they are reported.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_true, y_pred):.4f}")

# Report both models against the same held-out labels, Naive Bayes first.
print_metrics(y_true=y_test, y_pred=y_pred_nb, model_name="Naive Bayes")

print_metrics(y_true=y_test, y_pred=y_pred_lr, model_name="Regresión Logística")

Resultados del modelo Naive Bayes:
Accuracy: 0.8346
Precision: 0.8373
Recall: 0.8306
F1 Score: 0.8339
Resultados del modelo Regresión Logística:
Accuracy: 0.8446
Precision: 0.8509
Recall: 0.8356
F1 Score: 0.8432


In [7]:
# Persist the test labels and both models' predictions for the next step,
# where a metrics report will be produced.

# Start from the true labels (keeping their index) and attach each model's
# predictions positionally as extra columns.
predictions_df = y_test.to_frame(name='true_labels').assign(
    nb_predictions=y_pred_nb,
    lr_predictions=y_pred_lr,
)

# The row index is not written to the file.
predictions_df.to_csv('model_predictions.csv', index=False)

# Primeras conclusiones

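- A priori, la Regresión Logística parece dar mejores resultados que Naive Bayes en el conjunto de test (accuracy 0.8446 frente a 0.8346; F1 0.8432 frente a 0.8339). La diferencia es pequeña y procede de una única partición train/test, por lo que conviene confirmarla en el reporte de métricas del siguiente paso.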