In [1]:
import pandas as pd
import time

In [2]:
def print_progress_bar(percentage):
    bar_length = 50  # Definir la longitud de la barra de progreso
    filled_length = int(bar_length * percentage // 100)
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    print(f'\rProgress: |{bar}| {percentage}% Complete', end='\r')

def log_category_progress(category, step, total_steps):
    percentage = int((step / total_steps) * 100)
    print(f"\nProcessing {category} ({step}/{total_steps})")
    print_progress_bar(percentage)
    time.sleep(0.5)  # Solo para simular el tiempo de procesamiento

def read_file(file_location) -> pd.DataFrame:
    print(f"Reading file: {file_location}")
    with open(file_location, 'r') as file:
        data = file.readlines()

    rows = []
    for i in range(0, len(data)):
        terms = {}
        # Separar la parte de los términos del label
        line_parts = data[i].split(" #label#:")
        terms_part = line_parts[0].split()  # Parte con los términos
        label = line_parts[1].strip()  # El label (positive o negative)

        # Iterar sobre los términos y extraer el valor
        for j in terms_part:
            if ":" in j:
                term, value = j.split(":")
                terms[term] = int(value)

        # Crear un diccionario para la fila con el formato { 'terms': {términos}, 'label': label }
        rows.append({
            "terms": terms,  # El diccionario con los términos y sus valores
            "label": label   # El label ('positive' o 'negative')
        })

    # Crear un DataFrame con dos columnas: 'terms' y 'label'
    dataFrame = pd.DataFrame(rows)

    return dataFrame

In [3]:
df_train = read_file("data/MultiDomainSentiment/negative.review")
df_train

Reading file: data/MultiDomainSentiment/negative.review


Unnamed: 0,terms,label
0,"{'avid': 1, 'your': 1, 'horrible_book': 1, 'wa...",negative
1,"{'to_use': 1, 'shallow': 1, 'found': 1, 'he_ca...",negative
2,"{'avid': 1, 'your': 1, 'horrible_book': 1, 'wa...",negative
3,"{'book_seriously': 1, 'we': 1, 'days_couldn't'...",negative
4,"{'mass': 1, 'only': 1, 'he': 2, 'help': 1, '""j...",negative
...,...,...
995,"{'only': 1, 'idiotic_anyone': 1, 'if_i': 1, 'm...",negative
996,"{'your': 1, 'well': 1, 'to_create': 1, 'peter'...",negative
997,"{'favorable_reviews': 1, 'heard': 1, 'straight...",negative
998,"{'helpful': 1, 'this_one': 1, 'substance_and':...",negative


In [4]:
df_train.to_csv("negative.review.csv")

In [5]:
df_validation = read_file("data/MultiDomainSentiment/unlabeled.review")
df_validation

Reading file: data/MultiDomainSentiment/unlabeled.review


Unnamed: 0,terms,label
0,"{'is_such': 1, 'feel': 1, 'pages': 1, 'if': 1,...",negative
1,"{'go_mercy': 1, 'forget': 1, 'all_the': 1, 'pi...",negative
2,"{'explanation_of': 1, 'plains': 1, 'bison': 5,...",positive
3,"{'stars': 1, 'bold_kudos': 1, 'every_pedophile...",positive
4,"{'doesn't_say': 1, 'their_class': 1, 'say_much...",negative
...,...,...
4460,"{'mass': 1, 'specifically_references': 1, 'sci...",negative
4461,"{'reviewer's_comments': 1, 'to_pick': 1, 'comm...",negative
4462,"{'x-ers': 1, 'entry-level': 1, 'can_dip': 1, '...",positive
4463,"{'your': 1, 'well': 1, 'around_for': 1, 'you'r...",positive


In [6]:
df_validation.to_csv("unlabeled.review.csv")

In [15]:
def process_category(category, negative_file, positive_file, testing_file):
    total_steps = 3
    step = 1

    log_category_progress(category, step, total_steps)
    df_negative = read_file(negative_file)

    step += 1
    log_category_progress(category, step, total_steps)
    df_positive = read_file(positive_file)

    step += 1
    log_category_progress(category, step, total_steps)
    df_testing = read_file(testing_file)

    # Combinar los datasets de entrenamiento y testing
    df_train = pd.concat([df_negative, df_positive])
    df_train["category"] = category
    df_testing["category"] = category

    return df_train, df_testing

In [16]:
# Procesar cada categoría

print("Starting processing...")

# Kitchen
df_kitchen, df_kitchen_testing = process_category(
    "kitchen",
    "data/MultiDomainSentiment/processed_acl/kitchen/negative.review",
    "data/MultiDomainSentiment/processed_acl/kitchen/positive.review",
    "data/MultiDomainSentiment/processed_acl/kitchen/unlabeled.review"
)


Starting processing...

Processing kitchen (1/3)
Reading file: data/MultiDomainSentiment/processed_acl/kitchen/negative.review

Processing kitchen (2/3)
Reading file: data/MultiDomainSentiment/processed_acl/kitchen/positive.review

Processing kitchen (3/3)
Reading file: data/MultiDomainSentiment/processed_acl/kitchen/unlabeled.review


In [17]:

# Books
df_books, df_books_testing = process_category(
    "books",
    "data/MultiDomainSentiment/processed_acl/books/negative.review",
    "data/MultiDomainSentiment/processed_acl/books/positive.review",
    "data/MultiDomainSentiment/processed_acl/books/unlabeled.review"
)




Processing books (1/3)
Reading file: data/MultiDomainSentiment/processed_acl/books/negative.review

Processing books (2/3)
Reading file: data/MultiDomainSentiment/processed_acl/books/positive.review

Processing books (3/3)
Reading file: data/MultiDomainSentiment/processed_acl/books/unlabeled.review


In [18]:
# Electronics
df_electronics, df_electronics_testing = process_category(
    "electronics",
    "data/MultiDomainSentiment/processed_acl/electronics/negative.review",
    "data/MultiDomainSentiment/processed_acl/electronics/positive.review",
    "data/MultiDomainSentiment/processed_acl/electronics/unlabeled.review"
)




Processing electronics (1/3)
Reading file: data/MultiDomainSentiment/processed_acl/electronics/negative.review

Processing electronics (2/3)
Reading file: data/MultiDomainSentiment/processed_acl/electronics/positive.review

Processing electronics (3/3)
Reading file: data/MultiDomainSentiment/processed_acl/electronics/unlabeled.review


In [19]:
# DVD
df_dvd, df_dvd_testing = process_category(
    "dvd",
    "data/MultiDomainSentiment/processed_acl/dvd/negative.review",
    "data/MultiDomainSentiment/processed_acl/dvd/positive.review",
    "data/MultiDomainSentiment/processed_acl/dvd/unlabeled.review"
)




Processing dvd (1/3)
Reading file: data/MultiDomainSentiment/processed_acl/dvd/negative.reviewte

Processing dvd (2/3)
Reading file: data/MultiDomainSentiment/processed_acl/dvd/positive.reviewte

Processing dvd (3/3)
Reading file: data/MultiDomainSentiment/processed_acl/dvd/unlabeled.reviewte


In [27]:
# Unir datasets de entrenamiento
df_training_categories = pd.concat([df_kitchen, df_books, df_electronics, df_dvd],ignore_index=True)

# Unir datasets de testing
df_testing = pd.concat([df_kitchen_testing, df_books_testing, df_electronics_testing, df_dvd_testing],ignore_index=True)

print("\nProcessing completed.")


Processing completed.


In [38]:
df_training_categories.to_csv("training_data_categories.csv")

In [39]:
df_testing.to_csv("testing_data_categories.csv")