In [2]:
import subprocess
import sys

def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

required_packages = [
    "pandas",
    "scikit-learn",
    "nltk",
    "numpy"
]

for package in required_packages:
    try:
        __import__(package)
        print(f'{package} ya está instalado.')
    except ImportError:
        print(f'{package} no está instalado. Procediendo a instalar...')
        install(package)


pandas ya está instalado.
scikit-learn no está instalado. Procediendo a instalar...
nltk ya está instalado.
numpy ya está instalado.


In [3]:
import pandas as pd
import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from utils_processor.processor import Processor
from nltk.corpus import stopwords
import pandas as pd
from nltk.corpus import stopwords
from utils_processor.processor import Processor
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np



In [4]:
processor_ = Processor()

In [5]:
def print_progress_bar(percentage):
    bar_length = 50  # Definir la longitud de la barra de progreso
    filled_length = int(bar_length * percentage // 100)
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    print(f'\rProgress: |{bar}| {percentage}% Complete', end='\r')

def log_category_progress(category, step, total_steps):
    percentage = int((step / total_steps) * 100)
    print(f"\nProcessing {category} ({step}/{total_steps})")
    print_progress_bar(percentage)
    time.sleep(0.5)  # Solo para simular el tiempo de procesamiento

def read_file(file_location) -> pd.DataFrame:
    print(f"Reading file: {file_location}")
    with open(file_location, 'r') as file:
        data = file.readlines()

    rows = []
    for i in range(0, len(data)):
        terms = {}
        # Separar la parte de los términos del label
        line_parts = data[i].split(" #label#:")
        terms_part = line_parts[0].split()  # Parte con los términos
        label = line_parts[1].strip()  # El label (positive o negative)

        # Iterar sobre los términos y extraer el valor
        for j in terms_part:
            if ":" in j:
                term, value = j.split(":")
                terms[term] = int(value)

        # Crear un diccionario para la fila con el formato { 'terms': {términos}, 'label': label }
        rows.append({
            "terms": terms,  # El diccionario con los términos y sus valores
            "label": label   # El label ('positive' o 'negative')
        })

    # Crear un DataFrame con dos columnas: 'terms' y 'label'
    dataFrame = pd.DataFrame(rows)

    return dataFrame

Se realiza una carga de datos para los sentimientos considerados negativos

In [6]:
df_train = read_file("data/MultiDomainSentiment/negative.review")
df_train

Reading file: data/MultiDomainSentiment/negative.review


Unnamed: 0,terms,label
0,"{'avid': 1, 'your': 1, 'horrible_book': 1, 'wa...",negative
1,"{'to_use': 1, 'shallow': 1, 'found': 1, 'he_ca...",negative
2,"{'avid': 1, 'your': 1, 'horrible_book': 1, 'wa...",negative
3,"{'book_seriously': 1, 'we': 1, 'days_couldn't'...",negative
4,"{'mass': 1, 'only': 1, 'he': 2, 'help': 1, '""j...",negative
...,...,...
995,"{'only': 1, 'idiotic_anyone': 1, 'if_i': 1, 'm...",negative
996,"{'your': 1, 'well': 1, 'to_create': 1, 'peter'...",negative
997,"{'favorable_reviews': 1, 'heard': 1, 'straight...",negative
998,"{'helpful': 1, 'this_one': 1, 'substance_and':...",negative


Se exporta como un csv

In [7]:
df_train.to_csv("negative.review.csv")

Se realiza la carga de sentimientos no marcados

In [8]:
df_validation = read_file("data/MultiDomainSentiment/unlabeled.review")
df_validation

Reading file: data/MultiDomainSentiment/unlabeled.review


Unnamed: 0,terms,label
0,"{'is_such': 1, 'feel': 1, 'pages': 1, 'if': 1,...",negative
1,"{'go_mercy': 1, 'forget': 1, 'all_the': 1, 'pi...",negative
2,"{'explanation_of': 1, 'plains': 1, 'bison': 5,...",positive
3,"{'stars': 1, 'bold_kudos': 1, 'every_pedophile...",positive
4,"{'doesn't_say': 1, 'their_class': 1, 'say_much...",negative
...,...,...
4460,"{'mass': 1, 'specifically_references': 1, 'sci...",negative
4461,"{'reviewer's_comments': 1, 'to_pick': 1, 'comm...",negative
4462,"{'x-ers': 1, 'entry-level': 1, 'can_dip': 1, '...",positive
4463,"{'your': 1, 'well': 1, 'around_for': 1, 'you'r...",positive


Se exporta en un csv

In [9]:
df_validation.to_csv("unlabeled.review.csv")

Se combinan los datasets de entrenamiento y prueba

In [10]:
def process_category(category, negative_file, positive_file, testing_file):
    total_steps = 3
    step = 1

    log_category_progress(category, step, total_steps)
    df_negative = read_file(negative_file)

    step += 1
    log_category_progress(category, step, total_steps)
    df_positive = read_file(positive_file)

    step += 1
    log_category_progress(category, step, total_steps)
    df_testing = read_file(testing_file)

    # Combinar los datasets de entrenamiento y testing
    df_train = pd.concat([df_negative, df_positive])
    df_train["category"] = category
    df_testing["category"] = category

    return df_train, df_testing

In [11]:
# Procesar cada categoría

print("Starting processing...")

# Kitchen
df_kitchen, df_kitchen_testing = process_category(
    "kitchen",
    "data/MultiDomainSentiment/processed_acl/kitchen/negative.review",
    "data/MultiDomainSentiment/processed_acl/kitchen/positive.review",
    "data/MultiDomainSentiment/processed_acl/kitchen/unlabeled.review"
)


Starting processing...

Processing kitchen (1/3)
Reading file: data/MultiDomainSentiment/processed_acl/kitchen/negative.review

Processing kitchen (2/3)
Reading file: data/MultiDomainSentiment/processed_acl/kitchen/positive.review

Processing kitchen (3/3)
Reading file: data/MultiDomainSentiment/processed_acl/kitchen/unlabeled.review


In [12]:

# Books
df_books, df_books_testing = process_category(
    "books",
    "data/MultiDomainSentiment/processed_acl/books/negative.review",
    "data/MultiDomainSentiment/processed_acl/books/positive.review",
    "data/MultiDomainSentiment/processed_acl/books/unlabeled.review"
)




Processing books (1/3)
Reading file: data/MultiDomainSentiment/processed_acl/books/negative.review

Processing books (2/3)
Reading file: data/MultiDomainSentiment/processed_acl/books/positive.review

Processing books (3/3)
Reading file: data/MultiDomainSentiment/processed_acl/books/unlabeled.review


In [13]:
# Electronics
df_electronics, df_electronics_testing = process_category(
    "electronics",
    "data/MultiDomainSentiment/processed_acl/electronics/negative.review",
    "data/MultiDomainSentiment/processed_acl/electronics/positive.review",
    "data/MultiDomainSentiment/processed_acl/electronics/unlabeled.review"
)




Processing electronics (1/3)
Reading file: data/MultiDomainSentiment/processed_acl/electronics/negative.review

Processing electronics (2/3)
Reading file: data/MultiDomainSentiment/processed_acl/electronics/positive.review

Processing electronics (3/3)
Reading file: data/MultiDomainSentiment/processed_acl/electronics/unlabeled.review


In [14]:
# DVD
df_dvd, df_dvd_testing = process_category(
    "dvd",
    "data/MultiDomainSentiment/processed_acl/dvd/negative.review",
    "data/MultiDomainSentiment/processed_acl/dvd/positive.review",
    "data/MultiDomainSentiment/processed_acl/dvd/unlabeled.review"
)




Processing dvd (1/3)
Reading file: data/MultiDomainSentiment/processed_acl/dvd/negative.reviewte

Processing dvd (2/3)
Reading file: data/MultiDomainSentiment/processed_acl/dvd/positive.reviewte

Processing dvd (3/3)
Reading file: data/MultiDomainSentiment/processed_acl/dvd/unlabeled.reviewte


Se hace la unión de datasets de entrenamiento

In [15]:
# Unir datasets de entrenamiento
df_training_categories = pd.concat([df_kitchen, df_books, df_electronics, df_dvd],ignore_index=True)

# Unir datasets de testing
df_testing = pd.concat([df_kitchen_testing, df_books_testing, df_electronics_testing, df_dvd_testing],ignore_index=True)

print("\nProcessing completed.")


Processing completed.


In [16]:
df_training_categories.to_csv("training_data_categories.csv")

In [17]:
df_testing.to_csv("testing_data_categories.csv")

# Preprocesamiento
Convertir el diccionario en una cadena para realizar una vectorización

In [18]:
# Example data loading
df_train = pd.read_csv('training_data_categories.csv')

# Function to flatten the dictionary into a single string for vectorization
df_train['terms_str'] = df_train['terms'].apply(lambda x: ' '.join([f"{k} " * v for k, v in eval(x).items()]))
df_train['terms_str'] = df_train['terms_str'].apply(lambda x: processor_.preprocessing_pipeline_sentiments(x))


df_test = pd.read_csv('testing_data_categories.csv')
df_test['terms_str'] = df_test['terms'].apply(lambda x: ' '.join([f"{k} " * v for k, v in eval(x).items()]))
df_test['terms_str'] = df_test['terms_str'].apply(lambda x: processor_.preprocessing_pipeline_sentiments(x))

In [19]:
df_test.drop(columns=["Unnamed: 0"],inplace=True)

In [20]:
df_train

Unnamed: 0.1,Unnamed: 0,terms,label,category,terms_str
0,0,"{'right_after': 1, 'guess': 1, 'dog': 1, 'well...",negative,kitchen,right aft guess dog well well fill to work it ...
1,1,"{'manufacturer_suggested': 1, 'your': 2, 'manu...",negative,kitchen,manufacturer suggest manufacturer of the buy b...
2,2,"{'time_it': 1, 'i_can': 1, 'my_cocker': 1, 'ma...",negative,kitchen,time it i can my cock maybe it low depend can ...
3,3,"{'save_your': 1, 'cheaply': 1, 'i_sent': 1, 's...",negative,kitchen,save your cheapli i sent save loose that right...
4,4,"{'acted': 1, 'sound_interesting': 1, 'they_are...",negative,kitchen,act sound interest they ar bark al a coupl cou...
...,...,...,...,...,...
7995,7995,"{'z': 10, 'only': 1, 'course_of': 1, 'no': 5, ...",positive,dvd,z z z z z z z z z z course of help plenti like...
7996,7996,"{'well': 1, 'i': 1, 'interesting_as': 1, 'raid...",positive,dvd,well interesting a raider liked thi oakland ra...
7997,7997,"{'this_movie': 1, 'is_very': 1, 'enjoys_a': 1,...",positive,dvd,this movi is veri enjoys a yet veri very enjoy...
7998,7998,"{'episodes_ommitted': 1, 'show': 2, ""gareth's""...",positive,dvd,episodes ommit show show gareth america tell w...


# Vectorizers

Se insertan todas las oraciones en una lista

In [21]:
texto_procesado = df_train['terms_str'].tolist()
texto_procesado

['right aft guess dog well well fill to work it seem keep other smal work  num working veri care bark cairn small dog time fair sure whi is suppos emit and bark just look either do such a for mi either away emit sure it eith theyre get to car the spray care if does not a persist getting spray my oth sprays a were not look spray if they are just is not my yappi too stubborn doesnt work looks away dog who barker bark we fil fill it not sur work work work sprayed i longit do also seem also do nt small cairn from spray it also yappi persistent bark not such persist terrier isnt work longit just too num  of time and w from wher right work wel very longit well for supposed to when w terriers ar after but keep mi num get stubborn yappy cairn the tim as mani stubborn to suppos where th well right fairly wel nt nt work fair many spray why when barker terri does work not hav from bark away from mani who i work he just barks again doesnt keep spray spray emit seems to i guess',
 'manufacturer sug

## CountVectorizer

Se realiza la vectorización de términos con el método CountVectorizer

In [22]:
df_books = df_train[df_train['category'] == 'books']
df_electronics = df_train[df_train['category'] == 'electronics']
df_dvd = df_train[df_train['category'] == 'dvd']
df_kitchen = df_train[df_train['category'] == 'kitchen']

El vectorizador que se va a usar

In [23]:
vectorizer = CountVectorizer(max_df=0.9, stop_words=stopwords.words('english'))

In [24]:
def process_category(df, vectorizer):
    # Vectorizar los términos
    X_tf = vectorizer.fit_transform(df['terms_str']).toarray()
    # Obtener las etiquetas
    y = df['label'].values
    return X_tf, y

Se procesa el vectorizador por categoría

In [25]:

# Procesar las categorías
X_tf_books, y_books = process_category(df_books, vectorizer)
X_tf_electronics, y_electronics = process_category(df_electronics, vectorizer)
X_tf_dvd, y_dvd = process_category(df_dvd, vectorizer)
X_tf_kitchen, y_kitchen = process_category(df_kitchen, vectorizer)


Debemos hacer el vectorizer dentro de la funcion porque varia por el numero de features que tiene cada df

In [26]:
def train_and_evaluate_with_validation(df, category_name, tfidf: bool= False):
    # Crear el vectorizador dentro de la función para evitar desajustes entre categorías
    if tfidf:
        vectorizer = TfidfVectorizer(max_df=0.9, stop_words=stopwords.words('english'))
    else:
        vectorizer = CountVectorizer(max_df=0.9, stop_words=stopwords.words('english'))
    
    # Vectorizar los términos
    X_tf = vectorizer.fit_transform(df['terms_str']).toarray()
    
    # Obtener las etiquetas
    y = df['label'].values

    # Primera división: 70% (entrenamiento + validación) y 30% (prueba)
    X_temp, X_test, y_temp, y_test = train_test_split(X_tf, y, test_size=0.3, random_state=0)
    
    # Segunda división: del 70% restante, tomar 60% para entrenamiento y 10% para validación
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1/0.7, random_state=0)
    
    # Entrenar Naive Bayes
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    y_pred_nb = nb.predict(X_test)
    
    print(f"################### Starting with category: {category_name} ###################")
    
    # Evaluar Naive Bayes
    print("\n")
    print(f"Resultados para Naive Bayes - {category_name}")
    evaluate_model(y_test, y_pred_nb, "Naive Bayes")
    
    # Entrenar Regresión Logística
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    y_pred_lr = lr.predict(X_test)
    
    # Evaluar Regresión Logística
    print("\n")
    print(f"Resultados para Regresión Logística - {category_name}")
    evaluate_model(y_test, y_pred_lr, "Logistic Regression")
    
    # Evaluar en el conjunto de validación
    y_pred_val_nb = nb.predict(X_val)
    y_pred_val_lr = lr.predict(X_val)
    
    print("\n")
    print(f"Validación para Naive Bayes - {category_name}")
    evaluate_model(y_val, y_pred_val_nb, "Naive Bayes Validation")
    
    print("\n")
    print(f"Validación para Regresión Logística - {category_name}")
    evaluate_model(y_val, y_pred_val_lr, "Logistic Regression Validation")

    print("\n")
    # Extraer y mostrar las características más importantes en Regresión Logística
    print(f"\nCaracterísticas más importantes para Regresión Logística - {category_name}:")
    top_features, bottom_features = get_top_features_logistic_regression(lr, vectorizer, top_n=10)

    print("\n")
    
    print(f"################### Top 10 Features for {category_name} ###################")
    
    print("Top 10 características más importantes (asociadas con clase positiva):")
    for feature, coef in top_features:
        print(f"{feature}: {coef}")

    print("\n")
    print("\nTop 10 características menos importantes (asociadas con clase negativa):")
    for feature, coef in bottom_features:
        print(f"{feature}: {coef}")
    
    print("\n")

def evaluate_model(y_test, y_pred, model_name):
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label='positive')
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Evaluación de {model_name}:\n Precision: {precision}\n Recall: {recall}\n F1: {f1}\n Accuracy: {accuracy}\n")

def get_top_features_logistic_regression(lr_model, vectorizer, top_n=10):
    """
    Obtiene las características más importantes de un modelo de Regresión Logística.
    
    Args:
    lr_model: El modelo de regresión logística entrenado.
    vectorizer: El vectorizador utilizado (CountVectorizer o TfidfVectorizer).
    top_n: Número de características más importantes a extraer.
    
    Returns:
    Una lista de las características más importantes y sus coeficientes.
    """
    # Obtener los coeficientes del modelo
    coef = lr_model.coef_[0]
    
    # Obtener los nombres de las características desde el vectorizador
    feature_names = vectorizer.get_feature_names_out()
    
    # Asegurarse de que el número de coeficientes y características coincida
    if len(coef) != len(feature_names):
        print(f"Error: Mismatch between coefficients and feature names. "
              f"len(coef): {len(coef)}, len(feature_names): {len(feature_names)}")
        return [], []

    # Ordenar los coeficientes por su valor absoluto para obtener las características más importantes
    sorted_idx = np.argsort(coef)[::-1]
    
    # Obtener las top_n características más importantes
    top_features = [(feature_names[i], coef[i]) for i in sorted_idx[:top_n]]
    bottom_features = [(feature_names[i], coef[i]) for i in sorted_idx[-top_n:]]
    
    return top_features, bottom_features


Se entrenan y evalúan diferentes conjuntos de datos por clase

In [27]:


# Libros
train_and_evaluate_with_validation(df_books,'books')

# Electrónica
train_and_evaluate_with_validation(df_electronics,'electronics')

# DVD
train_and_evaluate_with_validation(df_dvd,'dvd')

# Cocina
train_and_evaluate_with_validation(df_kitchen,'kitchen')


################### Starting with category: books ###################


Resultados para Naive Bayes - books
Evaluación de Naive Bayes:
 Precision: 0.7608695652173914
 Recall: 0.7191780821917808
 F1: 0.7394366197183099
 Accuracy: 0.7533333333333333



Resultados para Regresión Logística - books
Evaluación de Logistic Regression:
 Precision: 0.7832167832167832
 Recall: 0.7671232876712328
 F1: 0.7750865051903114
 Accuracy: 0.7833333333333333



Validación para Naive Bayes - books
Evaluación de Naive Bayes Validation:
 Precision: 0.7816091954022989
 Recall: 0.7311827956989247
 F1: 0.7555555555555555
 Accuracy: 0.7810945273631841



Validación para Regresión Logística - books
Evaluación de Logistic Regression Validation:
 Precision: 0.7684210526315789
 Recall: 0.7849462365591398
 F1: 0.776595744680851
 Accuracy: 0.7910447761194029




Características más importantes para Regresión Logística - books:


################### Top 10 Features for books ###################
Top 10 características m

## Método de Vectorización Matriz Tf-idf

Se realiza la vectorización de términos con el método de la matriz Tf-idf

In [28]:
tfidf = TfidfVectorizer(max_df=0.9, stop_words=stopwords.words('english'))


Se dividen los datos por categoría

In [29]:
X_tf_books, y_books = process_category(df_books, tfidf)
X_tf_electronics, y_electronics = process_category(df_electronics, tfidf)
X_tf_dvd, y_dvd = process_category(df_dvd, tfidf)
X_tf_kitchen, y_kitchen = process_category(df_kitchen, tfidf)

Se realiza entrenamiento, validación y cálculo de métricas para cada categoría

In [30]:

# Libros
train_and_evaluate_with_validation(df_books,'books', tfidf=True)

# Electrónica
train_and_evaluate_with_validation(df_electronics,'electronics',tfidf=True)

# DVD
train_and_evaluate_with_validation(df_dvd,'dvd',tfidf=True)

# Cocina
train_and_evaluate_with_validation(df_kitchen,'kitchen',tfidf=True)


################### Starting with category: books ###################


Resultados para Naive Bayes - books
Evaluación de Naive Bayes:
 Precision: 0.7785234899328859
 Recall: 0.7945205479452054
 F1: 0.7864406779661017
 Accuracy: 0.79



Resultados para Regresión Logística - books
Evaluación de Logistic Regression:
 Precision: 0.7368421052631579
 Recall: 0.815068493150685
 F1: 0.7739837398373983
 Accuracy: 0.7683333333333333



Validación para Naive Bayes - books
Evaluación de Naive Bayes Validation:
 Precision: 0.7684210526315789
 Recall: 0.7849462365591398
 F1: 0.776595744680851
 Accuracy: 0.7910447761194029



Validación para Regresión Logística - books
Evaluación de Logistic Regression Validation:
 Precision: 0.7211538461538461
 Recall: 0.8064516129032258
 F1: 0.7614213197969543
 Accuracy: 0.7661691542288557




Características más importantes para Regresión Logística - books:


################### Top 10 Features for books ###################
Top 10 características más importantes 

# Evaluación con Prueba

Se realiza la evaluación de cada modelo calculado anteriormente con el conjunto de prueba

In [31]:
import numpy as np

def train_and_evaluate_with_test_data(df_train, df_test, category_name, tfidf: bool = False):
    # Crear el vectorizador dentro de la función
    if tfidf:
        vectorizer = TfidfVectorizer(max_df=0.9, stop_words=stopwords.words('english'))
    else:
        vectorizer = CountVectorizer(max_df=0.9, stop_words=stopwords.words('english'))

    # Vectorizar los términos de entrenamiento
    X_train = vectorizer.fit_transform(df_train['terms_str']).toarray()
    y_train = df_train['label'].values

    # Transformar los datos de prueba con el vectorizer entrenado
    X_test = vectorizer.transform(df_test['terms_str']).toarray()
    y_test = df_test['label'].values  

    # Entrenar Naive Bayes
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    y_pred_nb = nb.predict(X_test)
    
    print(f"################### Starting with category: {category_name} ###################")
    
    # Evaluar Naive Bayes
    print("\n")
    print(f"Resultados para Naive Bayes - {category_name}")
    evaluate_model(y_test, y_pred_nb, "Naive Bayes")
    
    # Entrenar Regresión Logística
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    y_pred_lr = lr.predict(X_test)
    
    # Evaluar Regresión Logística
    print("\n")
    print(f"Resultados para Regresión Logística - {category_name}")
    evaluate_model(y_test, y_pred_lr, "Logistic Regression")

    print("\n")
    # Extraer y mostrar las características más importantes en Regresión Logística
    print(f"\nCaracterísticas más importantes para Regresión Logística - {category_name}:")
    top_features, bottom_features = get_top_features_logistic_regression(lr, vectorizer, top_n=10)

    print("\n")
    
    print(f"################### Top 10 Features for {category_name} ###################")
    
    print("Top 10 características más importantes (asociadas con clase positiva):")
    for feature, coef in top_features:
        print(f"{feature}: {coef}")

    print("\n")
    print("\nTop 10 características menos importantes (asociadas con clase negativa):")
    for feature, coef in bottom_features:
        print(f"{feature}: {coef}")
    
    print("\n")


def evaluate_model(y_test, y_pred, model_name):
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', pos_label='positive')
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Evaluación de {model_name}:\n Precision: {precision}\n Recall: {recall}\n F1: {f1}\n Accuracy: {accuracy}\n")


def get_top_features_logistic_regression(lr_model, vectorizer, top_n=10):
    """
    Obtiene las características más importantes de un modelo de Regresión Logística.
    
    Args:
    lr_model: El modelo de regresión logística entrenado.
    vectorizer: El vectorizador utilizado (CountVectorizer o TfidfVectorizer).
    top_n: Número de características más importantes a extraer.
    
    Returns:
    Una lista de las características más importantes y sus coeficientes.
    """
    # Obtener los coeficientes del modelo
    coef = lr_model.coef_[0]
    
    # Obtener los nombres de las características desde el vectorizador
    feature_names = vectorizer.get_feature_names_out()
    
    # Ordenar los coeficientes por su valor absoluto para obtener las características más importantes
    sorted_idx = np.argsort(coef)[::-1]
    
    # Obtener las top_n características más importantes
    top_features = [(feature_names[i], coef[i]) for i in sorted_idx[:top_n]]
    bottom_features = [(feature_names[i], coef[i]) for i in sorted_idx[-top_n:]]
    
    return top_features, bottom_features


### Usando matriz Tf

In [32]:
# Libros
train_and_evaluate_with_test_data(df_books, df_test[df_test['category'] == 'books'], "books")

# Electrónica
train_and_evaluate_with_test_data(df_electronics, df_test[df_test['category'] == 'electronics'], "electronics")
# DVD
train_and_evaluate_with_test_data(df_dvd, df_test[df_test['category'] == 'dvd'], "dvd")
# Cocina
train_and_evaluate_with_test_data(df_kitchen, df_test[df_test['category'] == 'kitchen'], "kitchen")

################### Starting with category: books ###################


Resultados para Naive Bayes - books
Evaluación de Naive Bayes:
 Precision: 0.8090909090909091
 Recall: 0.7469081272084805
 F1: 0.776757005052825
 Accuracy: 0.7823068309070549



Resultados para Regresión Logística - books
Evaluación de Logistic Regression:
 Precision: 0.8053020425901782
 Recall: 0.8184628975265018
 F1: 0.811829134720701
 Accuracy: 0.8076147816349384




Características más importantes para Regresión Logística - books:


################### Top 10 Features for books ###################
Top 10 características más importantes (asociadas con clase positiva):
excel: 0.7839597111473018
easi: 0.6207946159790958
excellent: 0.580636392209854
enjoy: 0.573941294947134
blood: 0.5671711631231606
fast: 0.5393426714701102
profound: 0.5141668415190466
entertain: 0.5072312846963648
superbookd: 0.494831571996692
straight: 0.48673881950537495



Top 10 características menos importantes (asociadas con clase negativa):

### Usando Matriz Tf-idf

In [33]:
# Libros
train_and_evaluate_with_test_data(df_books, df_test[df_test['category'] == 'books'], "books",tfidf=True)

# Electrónica
train_and_evaluate_with_test_data(df_electronics, df_test[df_test['category'] == 'electronics'], "electronics",tfidf=True)
# DVD
train_and_evaluate_with_test_data(df_dvd, df_test[df_test['category'] == 'dvd'], "dvd",tfidf=True)
# Cocina
train_and_evaluate_with_test_data(df_kitchen, df_test[df_test['category'] == 'kitchen'], "kitchen",tfidf=True)

################### Starting with category: books ###################


Resultados para Naive Bayes - books
Evaluación de Naive Bayes:
 Precision: 0.8429915799900941
 Recall: 0.7517667844522968
 F1: 0.7947700210133084
 Accuracy: 0.8031354983202688



Resultados para Regresión Logística - books
Evaluación de Logistic Regression:
 Precision: 0.8281318681318681
 Recall: 0.8321554770318021
 F1: 0.8301387970918704
 Accuracy: 0.8273236282194849




Características más importantes para Regresión Logística - books:


################### Top 10 Features for books ###################
Top 10 características más importantes (asociadas con clase positiva):
great: 2.5932143880566434
excel: 1.9667685038158151
recommend: 1.9201536279015397
best: 1.8958749504638264
love: 1.71274940597773
enjoy: 1.5271521413329312
easi: 1.4707771669817546
favorit: 1.4489303395976219
must: 1.3394091591758708
excellent: 1.1617073735243046



Top 10 características menos importantes (asociadas con clase negativa):
wa: -1.3

# Sin categorías


Se realiza el entrenamiento como si el conjunto de entrenamiento y prueba fueran uno y sin categorías

### Con datos de validación

In [34]:
df_train.drop(columns=["Unnamed: 0"],inplace=True)
df_train

Unnamed: 0,terms,label,category,terms_str
0,"{'right_after': 1, 'guess': 1, 'dog': 1, 'well...",negative,kitchen,right aft guess dog well well fill to work it ...
1,"{'manufacturer_suggested': 1, 'your': 2, 'manu...",negative,kitchen,manufacturer suggest manufacturer of the buy b...
2,"{'time_it': 1, 'i_can': 1, 'my_cocker': 1, 'ma...",negative,kitchen,time it i can my cock maybe it low depend can ...
3,"{'save_your': 1, 'cheaply': 1, 'i_sent': 1, 's...",negative,kitchen,save your cheapli i sent save loose that right...
4,"{'acted': 1, 'sound_interesting': 1, 'they_are...",negative,kitchen,act sound interest they ar bark al a coupl cou...
...,...,...,...,...
7995,"{'z': 10, 'only': 1, 'course_of': 1, 'no': 5, ...",positive,dvd,z z z z z z z z z z course of help plenti like...
7996,"{'well': 1, 'i': 1, 'interesting_as': 1, 'raid...",positive,dvd,well interesting a raider liked thi oakland ra...
7997,"{'this_movie': 1, 'is_very': 1, 'enjoys_a': 1,...",positive,dvd,this movi is veri enjoys a yet veri very enjoy...
7998,"{'episodes_ommitted': 1, 'show': 2, ""gareth's""...",positive,dvd,episodes ommit show show gareth america tell w...


In [35]:
df_train.shape

(8000, 4)

In [36]:
df_test

Unnamed: 0,terms,label,category,terms_str
0,"{'i_forget': 1, 'is_no': 1, 'no_special': 1, '...",negative,kitchen,i forget is no no speci old old messi problem ...
1,"{'lasted_less': 1, 'a_chance': 1, 'chance_to':...",negative,kitchen,lasted less a chanc chance to the motor get bu...
2,"{'cooper_cooler': 1, 'bottles': 1, 'i': 1, 'co...",positive,kitchen,cooper cool bottl cooler 23 min cans num soda...
3,"{'the_idea': 1, 'quick_marinate': 1, 'to_clean...",negative,kitchen,the idea quick marin to cleanup container but ...
4,"{'small_i': 1, 'though_only': 1, 'craft_i': 1,...",negative,kitchen,small i though on craft i full grip my husband...
...,...,...,...,...
19672,"{'order': 1, 'kid': 1, 'up_with""kid\'s': 1, 's...",positive,dvd,order kid up with kid mani num to right so ma...
19673,"{'stage_screaming': 1, 'telling_me': 1, 'to_te...",positive,dvd,stage scream telling m to tel you th ways and ...
19674,"{'kyra': 1, 'to_see': 1, 'witnessed_a': 1, 'st...",positive,dvd,kyra to se witnessed a steps to step women and...
19675,"{'bucks': 1, 'live': 1, 'film': 1, 'income_is'...",positive,dvd,buck live film income i its just just not simp...


In [37]:
df_test.shape

(19677, 4)

### Matriz Tf

Entrenar y evaluar con conjunto de validación

In [38]:
train_and_evaluate_with_validation(df_train,"Without Categories: Categories unified")

################### Starting with category: Without Categories: Categories unified ###################


Resultados para Naive Bayes - Without Categories: Categories unified
Evaluación de Naive Bayes:
 Precision: 0.7881205673758865
 Recall: 0.7540288379983037
 F1: 0.7706978760294755
 Accuracy: 0.7795833333333333



Resultados para Regresión Logística - Without Categories: Categories unified
Evaluación de Logistic Regression:
 Precision: 0.7882256745707277
 Recall: 0.8176420695504665
 F1: 0.8026644462947544
 Accuracy: 0.8025



Validación para Naive Bayes - Without Categories: Categories unified
Evaluación de Naive Bayes Validation:
 Precision: 0.7837837837837838
 Recall: 0.7571801566579635
 F1: 0.7702523240371846
 Accuracy: 0.784019975031211



Validación para Regresión Logística - Without Categories: Categories unified
Evaluación de Logistic Regression Validation:
 Precision: 0.7963917525773195
 Recall: 0.8067885117493473
 F1: 0.8015564202334631
 Accuracy: 0.8089887640449438




Carac

### Matriz Tf-idf

Entrenar y evaluar con conjunto de validación

In [39]:
train_and_evaluate_with_validation(df_train,"Without Categories: Categories unified",True)

################### Starting with category: Without Categories: Categories unified ###################


Resultados para Naive Bayes - Without Categories: Categories unified
Evaluación de Naive Bayes:
 Precision: 0.789093825180433
 Recall: 0.8346055979643766
 F1: 0.8112118713932399
 Accuracy: 0.8091666666666667



Resultados para Regresión Logística - Without Categories: Categories unified
Evaluación de Logistic Regression:
 Precision: 0.8127090301003345
 Recall: 0.8244274809160306
 F1: 0.8185263157894737
 Accuracy: 0.8204166666666667



Validación para Naive Bayes - Without Categories: Categories unified
Evaluación de Naive Bayes Validation:
 Precision: 0.7950617283950617
 Recall: 0.8407310704960835
 F1: 0.817258883248731
 Accuracy: 0.8202247191011236



Validación para Regresión Logística - Without Categories: Categories unified
Evaluación de Logistic Regression Validation:
 Precision: 0.8203125
 Recall: 0.8224543080939948
 F1: 0.8213820078226858
 Accuracy: 0.8289637952559301




Car

## Con Conjunto de Prueba

### Matriz Tf

Entrenar y evaluar con conjunto de prueba

In [40]:
train_and_evaluate_with_test_data(df_train,df_test,"Without Categories: Categories unified")

################### Starting with category: Without Categories: Categories unified ###################


Resultados para Naive Bayes - Without Categories: Categories unified
Evaluación de Naive Bayes:
 Precision: 0.8312050649211289
 Recall: 0.7838494231936854
 F1: 0.8068329774490912
 Accuracy: 0.81150581897647



Resultados para Regresión Logística - Without Categories: Categories unified
Evaluación de Logistic Regression:
 Precision: 0.8273424275614856
 Recall: 0.8408216960129529
 F1: 0.8340276035131744
 Accuracy: 0.8319357625654317




Características más importantes para Regresión Logística - Without Categories: Categories unified:


################### Top 10 Features for Without Categories: Categories unified ###################
Top 10 características más importantes (asociadas con clase positiva):
works: 1.6190865154022773
excellent: 1.1127049147237236
excel: 1.095567090811617
perfect: 0.9987833767272752
fantast: 0.9318937202848815
amaz: 0.9211053605760529
glad: 0.838465628638424

### Matriz tf-idf

Entrenar y evaluar con conjunto de prueba

In [41]:
train_and_evaluate_with_test_data(df_train,df_test,"Without Categories: Categories unified",True)

################### Starting with category: Without Categories: Categories unified ###################


Resultados para Naive Bayes - Without Categories: Categories unified
Evaluación de Naive Bayes:
 Precision: 0.8497035040431267
 Recall: 0.7975106253794778
 F1: 0.8227801847888501
 Accuracy: 0.8274635361081466



Resultados para Regresión Logística - Without Categories: Categories unified
Evaluación de Logistic Regression:
 Precision: 0.8458509993943065
 Recall: 0.8479052823315119
 F1: 0.8468768950879321
 Accuracy: 0.8460131117548407




Características más importantes para Regresión Logística - Without Categories: Categories unified:


################### Top 10 Features for Without Categories: Categories unified ###################
Top 10 características más importantes (asociadas con clase positiva):
great: 6.852423318183876
best: 5.129352590617112
excel: 4.551816444789013
love: 4.434083705310246
easi: 4.2812932838986
perfect: 4.218812664750223
enjoy: 3.2999035214096137
excellent: