In [None]:
import pandas as pd
import random
import numpy as np

In [None]:
def make_data_dirty(df, missing_value_ratio=0.05, outlier_ratio=0.01, seed=None):
    """
    Return a 'dirty' copy of a DataFrame: random cells are replaced with
    missing values (NaN) and outliers are written into selected numeric columns.

    The input DataFrame is never modified. A short summary of what was
    changed is printed to stdout.

    Args:
        df (pd.DataFrame): The DataFrame to be made dirty.
        missing_value_ratio (float): The proportion of all cells to be changed
            to NaN (0.0 - 1.0).
        outlier_ratio (float): The proportion of rows, per eligible column,
            to be changed to outliers (0.0 - 1.0).
        seed (int | None): Optional seed. When given, a private random
            generator is used so the result is reproducible and the global
            ``random`` state is left untouched. When None (default), the
            module-level ``random`` generator is used, as before.

    Returns:
        pd.DataFrame: The dirty DataFrame. Columns that receive a NaN may be
        upcast by pandas (e.g. integer columns become float).

    Raises:
        ValueError: Raised by ``random.sample`` when a ratio is negative or
            asks for more cells/rows than exist.
    """
    # With a seed, use an isolated generator; without one, keep drawing from
    # the module-level generator so an earlier random.seed() is still honoured.
    rng = random.Random(seed) if seed is not None else random

    df_copy = df.copy()
    num_rows, num_cols = df_copy.shape

    # 1. Add missing values (NaN) randomly
    num_missing = int(missing_value_ratio * num_rows * num_cols)
    cells_to_null = rng.sample(range(num_rows * num_cols), num_missing)

    if cells_to_null:
        # Flat cell number k maps to (k // num_cols, k % num_cols), which is
        # exactly the row-major reshape below. One vectorised mask() replaces
        # the per-cell writes and lets pandas pick a NaN-capable dtype for
        # each affected column.
        null_mask = np.zeros(num_rows * num_cols, dtype=bool)
        null_mask[cells_to_null] = True
        df_copy = df_copy.mask(null_mask.reshape(num_rows, num_cols))

    print(f"Number of missing values (NaN) added: {num_missing}")

    # 2. Add outliers to some numeric columns (manually selected)
    numeric_cols = set(df_copy.select_dtypes(include=np.number).columns)

    # Manual selection of numeric columns to modify for outliers
    potential_outlier_cols = [
        "price",
        "freight_value",
        "payment_value",
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
        "geolocation_lat",
        "geolocation_lng",
        "payment_installments",
        "review_score",
        "order_item_id",
        "customer_zip_code_prefix",
        "seller_zip_code_prefix",
    ]
    # Only columns that exist in this frame and are still numeric are touched.
    cols_to_modify_outlier = [
        col for col in potential_outlier_cols if col in numeric_cols
    ]

    # The same number of rows is modified in every eligible column.
    num_outliers = int(outlier_ratio * len(df_copy))

    for col in cols_to_modify_outlier:
        indices_to_modify = rng.sample(df_copy.index.tolist(), num_outliers)

        # Take the reference extremes ONCE, before any outlier is written.
        # Re-reading max()/min() after each write made every new outlier
        # 5-10x larger than the previous one (eventually inf, and an
        # OverflowError in int()), and cost a full column scan per outlier.
        col_max = df_copy[col].max()
        col_min = df_copy[col].min()
        is_integer_col = pd.api.types.is_integer_dtype(df_copy[col].dtype)

        # NOTE(review): for a column whose max (or min) is negative, these
        # multiplications move the value in the opposite direction of the
        # "over"/"under" label - confirm whether that matters for the
        # latitude/longitude columns.
        for idx in indices_to_modify:
            if rng.random() < 0.5:
                # Over value: 5x to 10x the column maximum.
                outlier_value = col_max * (1 + rng.random()) * 5
            else:
                # Under value: 0% to 10% of the column minimum.
                outlier_value = col_min * (rng.random() * 0.1)

            # Truncate for integer columns so the column dtype is preserved.
            if is_integer_col:
                outlier_value = int(outlier_value)
            df_copy.loc[idx, col] = outlier_value

        print(f"Number of outliers added in column '{col}': {num_outliers}")

    return df_copy

In [None]:
# Directory holding the raw input CSVs; change it here to load another copy of the data.
DATA_DIR = "../data"

# Load each raw table into its own DataFrame.
customer_df = pd.read_csv(f"{DATA_DIR}/customers_dataset.csv")
geolocation_df = pd.read_csv(f"{DATA_DIR}/geolocation_dataset.csv")
order_item_df = pd.read_csv(f"{DATA_DIR}/order_items_dataset.csv")
order_payments_df = pd.read_csv(f"{DATA_DIR}/order_payments_dataset.csv")
order_reviews_df = pd.read_csv(f"{DATA_DIR}/order_reviews_dataset.csv")
orders_df = pd.read_csv(f"{DATA_DIR}/orders_dataset.csv")
product_category_name_df = pd.read_csv(f"{DATA_DIR}/product_category.csv")
products_df = pd.read_csv(f"{DATA_DIR}/products_dataset.csv")
seller_df = pd.read_csv(f"{DATA_DIR}/sellers_dataset.csv")

In [None]:
# Seed the shared `random` generator so that Restart & Run All produces the
# same dirty datasets on every run; make_data_dirty draws from this generator.
random.seed(42)

# Dirty every raw table. The call order matters for reproducibility because
# all calls consume the same random stream.
customer_df_dirty = make_data_dirty(customer_df)
geolocation_df_dirty = make_data_dirty(geolocation_df)
order_item_df_dirty = make_data_dirty(order_item_df)
order_payment_df_dirty = make_data_dirty(order_payments_df)
order_review_df_dirty = make_data_dirty(order_reviews_df)
orders_df_dirty = make_data_dirty(orders_df)
product_category_name_translation_df_dirty = make_data_dirty(product_category_name_df)
products_df_dirty = make_data_dirty(products_df)
seller_df_dirty = make_data_dirty(seller_df)

In [None]:
# Map each output file stem to the dirtied DataFrame written under that name.
dirty_dfs = {
    "customer_data_dirty": customer_df_dirty,
    "geolocation_dirty": geolocation_df_dirty,
    # Fixed: this key used to point at order_payment_df_dirty, so the
    # order-items file was written with a second copy of the payments data.
    "order_item_dirty": order_item_df_dirty,
    "order_payment_dirty": order_payment_df_dirty,
    "order_review_dirty": order_review_df_dirty,
    "orders_dataset_dirty": orders_df_dirty,
    "product_category_name_translation_dirty": product_category_name_translation_df_dirty,
    "products_dirty": products_df_dirty,
    "seller_dirty": seller_df_dirty,
}

# Save every dirtied DataFrame to its own CSV file (row index omitted).
# NOTE(review): to_csv does not create missing directories, so ../dataset
# must already exist before this cell runs.
for name, df in dirty_dfs.items():
    filepath = f"../dataset/{name}.csv"
    df.to_csv(filepath, index=False)
    print(f"DataFrame '{name}' berhasil disimpan ke '{filepath}'")

print("\nSemua data yang udah dikotori udah disimpan!")