## Clean Review Data

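Cleans the generated review CSV files in place: strips HTML tags from the review text, replaces missing helpful-vote counts with 0, and converts the `isMarkedHelpful` and `productCompleted` flags to 1/0.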

In [26]:
import os
import pandas as pd
from bs4 import BeautifulSoup

In [27]:
# Directory holding the generated review CSV files.
review_data_path = "reviews"

# Paths of every ".csv" file directly inside the review directory.
review_data_files = [
    os.path.join(review_data_path, name)
    for name in os.listdir(review_data_path)
    if name.endswith(".csv")
]

In [28]:
# Make sure every review file carries a header row: a file that parses to
# zero rows is rewritten so that it contains just the column names.
headers = [
    "id",
    "reviewedAt",
    "rating",
    "isMarkedHelpful",
    "reviewText",
    "productCompleted",
    "mostHelpfulVoteCount",
    "users",
    "user_id",
]
for file_path in review_data_files:
    df = pd.read_csv(file_path, header=None, names=headers)
    if not df.empty:
        # The file already has content; leave it untouched.
        continue
    df.to_csv(file_path, index=False, header=headers)


In [29]:
def clean_data(df):
    """Normalise the review columns of ``df``, mutating it in place.

    The review text, helpful-vote count and the two flag columns are each
    passed value by value through their cleaner function; the vote count is
    additionally cast to ``int``. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        The review data to clean. It must contain every expected column.

    Raises
    ------
    ValueError
        If an expected column is missing. The message names the first
        missing column, in the order the columns are listed below.
    """
    required = (
        "id",
        "reviewedAt",
        "rating",
        "isMarkedHelpful",
        "reviewText",
        "productCompleted",
        "mostHelpfulVoteCount",
        "users",
        "user_id",
    )
    absent = [col for col in required if col not in df.columns]
    if absent:
        col = absent[0]
        raise ValueError(f"Column {col} is not in the DataFrame.")

    # (column, per-value cleaner, dtype to cast to afterwards or None)
    steps = (
        ("reviewText", clean_review_text, None),
        ("mostHelpfulVoteCount", clean_most_helpful_votes, int),
        ("isMarkedHelpful", clean_is_marked_helpful, None),
        ("productCompleted", clean_product_completed, None),
    )
    for column, cleaner, dtype in steps:
        df[column] = df[column].apply(cleaner)
        if dtype is not None:
            df[column] = df[column].astype(dtype)

def clean_most_helpful_votes(vote):
    """Return ``vote`` unchanged, or 0 when it is a missing value.

    Parameters
    ----------
    vote : scalar
        A single helpful-vote count, possibly null (``None``/NaN).

    Returns
    -------
    scalar
        0 if ``pd.isnull(vote)`` is true, otherwise ``vote`` itself.
    """
    if pd.isnull(vote):
        return 0
    return vote


def clean_review_text(text):
    """Strip HTML tags from a single review text.

    Parameters
    ----------
    text : str or bytes or any
        The raw review text. Non-text values (for example the float NaN
        that pandas uses for an empty CSV cell) are accepted as well.

    Returns
    -------
    str or any
        The text content with HTML tags removed when ``text`` is a ``str``
        or ``bytes`` value; otherwise ``text`` itself, unchanged.
    """
    # A missing review reaches this function as NaN (a float), which
    # BeautifulSoup cannot parse; pass such values through instead of
    # aborting the cleaning of the whole file.
    if not isinstance(text, (str, bytes)):
        return text
    # Remove HTML tags, keeping only the text content.
    return BeautifulSoup(text, "html.parser").get_text()


def clean_is_marked_helpful(text):
    """Convert an "is marked helpful" flag to 1 or 0.

    ``pandas.read_csv`` parses a column of True/False values as booleans,
    so the flag usually arrives as ``True``/``False`` rather than the
    string ``"True"``; after a previous cleaning run it arrives as 1/0.
    All of these spellings are accepted so that cleaning a file twice
    gives the same result as cleaning it once.

    Parameters
    ----------
    text : str or bool or number or None
        The raw flag value.

    Returns
    -------
    int
        1 for ``True``, ``1``, or the string "true" in any letter case
        (surrounding whitespace ignored); 0 for everything else,
        including missing values.
    """
    if isinstance(text, str):
        return 1 if text.strip().lower() == "true" else 0
    # Check for nulls first: NaN/None mean "not marked", and comparing
    # pd.NA with == does not yield a plain boolean.
    if pd.isnull(text):
        return 0
    # True, numpy booleans and a previously cleaned 1 all compare equal to 1.
    return 1 if text == 1 else 0

def clean_product_completed(text):
    """Convert a "product completed" flag to 1 or 0.

    ``pandas.read_csv`` parses a column of True/False values as booleans,
    so the flag usually arrives as ``True``/``False`` rather than the
    string ``"True"``; after a previous cleaning run it arrives as 1/0.
    All of these spellings are accepted so that cleaning a file twice
    gives the same result as cleaning it once.

    Parameters
    ----------
    text : str or bool or number or None
        The raw flag value.

    Returns
    -------
    int
        1 for ``True``, ``1``, or the string "true" in any letter case
        (surrounding whitespace ignored); 0 for everything else,
        including missing values.
    """
    if isinstance(text, str):
        return 1 if text.strip().lower() == "true" else 0
    # Check for nulls first: NaN/None mean "not completed", and comparing
    # pd.NA with == does not yield a plain boolean.
    if pd.isnull(text):
        return 0
    # True, numpy booleans and a previously cleaned 1 all compare equal to 1.
    return 1 if text == 1 else 0

In [30]:
# Clean every review file and overwrite it with the cleaned rows.
for file_path in review_data_files:
    df = pd.read_csv(file_path)
    # clean_data modifies df in place, so there is no return value to capture.
    clean_data(df)
    df.to_csv(file_path, index=False)