# Python script for making data dirty and removing rows randomly

In [4]:
import pandas as pd
import random
import numpy as np
import os

## Function for making data dirty

In [5]:
def make_data_dirty(df, missing_value_ratio=0.06, outlier_ratio=0.05):
    """
    Return a 'dirty' copy of a DataFrame with random NaNs and numeric outliers.

    Missing values are injected into randomly chosen cells of every column
    that is not a known ID column. Afterwards, outliers are written into a
    fixed list of numeric columns (when present). The input is never modified.

    Rows are addressed by position, so the function works with any index
    (non-default, non-unique, ...), and the index of the result is the same
    as the index of the input.

    Args:
        df (pd.DataFrame): The DataFrame to be made dirty.
        missing_value_ratio (float): The proportion of non-ID cells to be
            changed to NaN (0.0 - 1.0).
        outlier_ratio (float): The proportion of rows, per eligible numeric
            column, to be overwritten with outliers (0.0 - 1.0).

    Returns:
        pd.DataFrame: The dirty DataFrame.

    Raises:
        ValueError: If either ratio is outside the range 0.0 - 1.0.
    """
    for label, ratio in (
        ("missing_value_ratio", missing_value_ratio),
        ("outlier_ratio", outlier_ratio),
    ):
        if not 0.0 <= ratio <= 1.0:
            raise ValueError(f"{label} must be between 0.0 and 1.0, got {ratio!r}")

    df_copy = df.copy()
    num_rows = len(df_copy)

    # ID columns that must never be changed to NaN
    id_columns = {
        "customer_id",
        "customer_unique_id",
        "order_id",
        "product_id",
        "seller_id",
        "review_id",
    }

    # 1. Add missing values (NaN) randomly, excluding ID columns
    non_id_columns = [col for col in df_copy.columns if col not in id_columns]
    num_non_id_cols = len(non_id_columns)
    if num_non_id_cols > 0:
        num_cells = num_rows * num_non_id_cols
        num_missing = int(missing_value_ratio * num_cells)
        # Each sampled number identifies one cell of the (rows x non-ID
        # columns) grid in row-major order.
        cells_to_null = random.sample(range(num_cells), num_missing)

        if cells_to_null:
            null_mask = np.zeros(num_cells, dtype=bool)
            null_mask[cells_to_null] = True
            null_mask = null_mask.reshape(num_rows, num_non_id_cols)

            # Mask one whole column at a time: this is positional (no
            # dependence on index labels) and lets pandas pick a dtype that
            # can hold NaN instead of upcasting on every single cell write.
            for col_pos, col_name in enumerate(non_id_columns):
                col_mask = null_mask[:, col_pos]
                if col_mask.any():
                    df_copy[col_name] = df_copy[col_name].mask(col_mask)

        print(
            f"Number of missing values (NaN) added (excluding ID columns): {num_missing}"
        )
    else:
        print("No non-ID columns found to add missing values.")

    # 2. Add outliers to some numeric columns (manually selected)
    numeric_cols = set(df_copy.select_dtypes(include=np.number).columns)

    # Manual selection of numeric columns to modify for outliers
    potential_outlier_cols = [
        "price",
        "freight_value",
        "payment_value",
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
        "geolocation_lat",
        "geolocation_lng",
        "payment_installments",
        "review_score",
        "customer_zip_code_prefix",
        "seller_zip_code_prefix",
    ]
    cols_to_modify_outlier = [
        col for col in potential_outlier_cols if col in numeric_cols
    ]
    num_outliers = int(outlier_ratio * num_rows)

    for col in cols_to_modify_outlier:
        column = df_copy[col]
        # Take the reference bounds ONCE, before writing anything. Reading
        # max()/min() again after each write would make every new outlier a
        # multiple of the previous one and quickly run off to inf / 0.
        col_max = column.max()
        col_min = column.min()
        if pd.isna(col_max) or pd.isna(col_min):
            print(f"Column '{col}' has no non-missing values; no outliers added.")
            continue

        is_integer = pd.api.types.is_integer_dtype(column.dtype)
        positions_to_modify = random.sample(range(num_rows), num_outliers)

        outlier_values = []
        for _ in positions_to_modify:
            if random.random() < 0.5:
                # Over value: 5x to 10x the column maximum
                value = col_max * (1 + random.random()) * 5
            else:
                # Under value: at most 10% of the column minimum
                value = col_min * (random.random() * 0.1)
            # Keep integer columns integer by truncating the generated value
            outlier_values.append(int(value) if is_integer else value)

        if positions_to_modify:
            col_pos = df_copy.columns.get_loc(col)
            df_copy.iloc[positions_to_modify, col_pos] = outlier_values

        print(f"Number of outliers added in column '{col}': {num_outliers}")

    return df_copy

## Function for removing random rows of data

In [6]:
def remove_rows_randomly(df, num_rows_to_remove=None, percentage_to_remove=None):
    """
    Removes a specified number or percentage of rows randomly from a DataFrame.

    Rows are chosen and dropped by position, so the function works for any
    index (string labels, non-contiguous integers, duplicate labels, ...).
    When rows are removed, the result gets a fresh 0..n-1 index. The input
    DataFrame is never modified.

    If both arguments are given, num_rows_to_remove takes precedence.

    Args:
        df (pd.DataFrame): The DataFrame from which rows will be removed.
        num_rows_to_remove (int, optional): The exact number of rows to remove.
            Must be a positive integer smaller than the number of rows
            (booleans are rejected). Defaults to None.
        percentage_to_remove (float, optional): The fraction of rows to remove,
            strictly between 0.0 and 1.0. Defaults to None.

    Returns:
        pd.DataFrame: A new DataFrame with the specified rows removed.
                      Returns an unchanged copy of the original DataFrame if
                      neither num_rows_to_remove nor percentage_to_remove is
                      provided, if the input values are invalid, or if the
                      requested fraction rounds down to zero rows.
    """
    total_rows = len(df)

    if num_rows_to_remove is not None:
        # bool is a subclass of int; True must not be read as "remove 1 row".
        is_count = isinstance(
            num_rows_to_remove, (int, np.integer)
        ) and not isinstance(num_rows_to_remove, bool)
        if not (is_count and 0 < num_rows_to_remove < total_rows):
            print(
                "Invalid value for num_rows_to_remove. Please provide a positive integer less than the total number of rows."
            )
            return df.copy()
        num_to_drop = int(num_rows_to_remove)
    elif percentage_to_remove is not None:
        is_fraction = isinstance(percentage_to_remove, (float, np.floating))
        if not (is_fraction and 0.0 < percentage_to_remove < 1.0):
            print(
                "Invalid value for percentage_to_remove. Please provide a float between 0.0 and 1.0."
            )
            return df.copy()
        num_to_drop = int(percentage_to_remove * total_rows)
    else:
        print("Please provide either num_rows_to_remove or percentage_to_remove.")
        return df.copy()

    if num_to_drop == 0:
        # Fraction too small to remove even one row: nothing to do.
        return df.copy()

    # The sample holds row POSITIONS, so select positionally with a boolean
    # mask instead of passing the numbers to drop(), which expects labels.
    positions_to_drop = random.sample(range(total_rows), num_to_drop)
    keep_mask = np.ones(total_rows, dtype=bool)
    keep_mask[positions_to_drop] = False

    result = df.iloc[keep_mask].reset_index(drop=True)
    print(f"Removed {num_to_drop} rows from the DataFrame.")
    return result

## Load & save new dataset

In [7]:
# Read every source CSV, in this order, and bind each to its own DataFrame name.
(
    customer_df,
    geolocation_df,
    order_item_df,
    order_payments_df,
    order_reviews_df,
    orders_df,
    product_category_name_df,
    products_df,
    seller_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/customers_dataset.csv",
        "../data/geolocation_dataset.csv",
        "../data/order_items_dataset.csv",
        "../data/order_payments_dataset.csv",
        "../data/order_reviews_dataset.csv",
        "../data/orders_dataset.csv",
        "../data/product_category.csv",
        "../data/products_dataset.csv",
        "../data/sellers_dataset.csv",
    )
)

In [8]:
# Randomly drop a fraction of the rows of each loaded table
# (20% of customers, 70% of geolocation, 40% of everything else).
(
    customer_df_remove_lines,
    geolocation_df_remove_lines,
    order_item_df_remove_lines,
    order_payment_df_remove_lines,
    order_review_df_remove_lines,
    orders_df_remove_lines,
    products_df_remove_lines,
    seller_df_remove_lines,
) = (
    remove_rows_randomly(source_df, percentage_to_remove=fraction)
    for source_df, fraction in (
        (customer_df, 0.2),
        (geolocation_df, 0.7),
        (order_item_df, 0.4),
        (order_payments_df, 0.4),
        (order_reviews_df, 0.4),
        (orders_df, 0.4),
        (products_df, 0.4),
        (seller_df, 0.4),
    )
)

Removed 19888 rows from the DataFrame.
Removed 700114 rows from the DataFrame.
Removed 45060 rows from the DataFrame.
Removed 41554 rows from the DataFrame.
Removed 39689 rows from the DataFrame.
Removed 39776 rows from the DataFrame.
Removed 13180 rows from the DataFrame.
Removed 1238 rows from the DataFrame.


In [9]:
# Inject NaNs and outliers (default ratios) into each table. The product
# category table is dirtied as loaded; all others use their row-reduced version.
(
    customer_df_dirty,
    geolocation_df_dirty,
    order_item_df_dirty,
    order_payment_df_dirty,
    order_review_df_dirty,
    orders_df_dirty,
    product_category_name_translation_df_dirty,
    products_df_dirty,
    seller_df_dirty,
) = (
    make_data_dirty(reduced_df)
    for reduced_df in (
        customer_df_remove_lines,
        geolocation_df_remove_lines,
        order_item_df_remove_lines,
        order_payment_df_remove_lines,
        order_review_df_remove_lines,
        orders_df_remove_lines,
        product_category_name_df,
        products_df_remove_lines,
        seller_df_remove_lines,
    )
)

Number of missing values (NaN) added (excluding ID columns): 14319
Number of outliers added in column 'customer_zip_code_prefix': 3977
Number of missing values (NaN) added (excluding ID columns): 90014
Number of outliers added in column 'geolocation_lat': 15002
Number of outliers added in column 'geolocation_lng': 15002
Number of missing values (NaN) added (excluding ID columns): 16221
Number of outliers added in column 'price': 3379
Number of outliers added in column 'freight_value': 3379
Number of missing values (NaN) added (excluding ID columns): 14959
Number of outliers added in column 'payment_value': 3116
Number of outliers added in column 'payment_installments': 3116
Number of missing values (NaN) added (excluding ID columns): 17860
Number of outliers added in column 'review_score': 2976
Number of missing values (NaN) added (excluding ID columns): 21479
Number of missing values (NaN) added (excluding ID columns): 8
Number of missing values (NaN) added (excluding ID columns): 949

In [10]:
# Mapping of output file name (without extension) -> dirtied DataFrame
dirty_dfs = {
    "customer_data_dirty": customer_df_dirty,
    "geolocation_dirty": geolocation_df_dirty,
    "order_item_dirty": order_item_df_dirty,
    "order_payment_dirty": order_payment_df_dirty,
    "order_review_dirty": order_review_df_dirty,
    "orders_dataset_dirty": orders_df_dirty,
    "product_category": product_category_name_translation_df_dirty,
    "products_dirty": products_df_dirty,
    "seller_dirty": seller_df_dirty,
}

target_folder = "dataset"
# Make sure the target folder exists, creating it when it is missing
if os.path.exists(target_folder):
    print(f"Folder '{target_folder}' sudah ada.")
else:
    try:
        os.makedirs(target_folder)
    except OSError as e:
        # The script deliberately keeps going here; every save below will
        # then report its own failure.
        print(f"Gagal membuat folder '{target_folder}'. Error: {e}")
    else:
        print(f"Folder '{target_folder}' belum ada, berhasil dibuat.")

# Write each DataFrame to its own CSV file inside the target folder
print("\nMemulai proses penyimpanan DataFrame...")
for name, df in dirty_dfs.items():
    # os.path.join builds a path that is valid on every operating system
    filepath = os.path.join(target_folder, f"{name}.csv")

    try:
        df.to_csv(filepath, index=False)
    except Exception as e:
        # Report save errors (e.g. permission problems) and move on to the next file
        print(f"Gagal menyimpan DataFrame '{name}' ke '{filepath}'. Error: {e}")
    else:
        print(f"DataFrame '{name}' berhasil disimpan ke '{filepath}'")

print("\nProses penyimpanan selesai.")

Folder 'dataset' belum ada, berhasil dibuat.

Memulai proses penyimpanan DataFrame...
DataFrame 'customer_data_dirty' berhasil disimpan ke 'dataset/customer_data_dirty.csv'
DataFrame 'geolocation_dirty' berhasil disimpan ke 'dataset/geolocation_dirty.csv'
DataFrame 'order_item_dirty' berhasil disimpan ke 'dataset/order_item_dirty.csv'
DataFrame 'order_payment_dirty' berhasil disimpan ke 'dataset/order_payment_dirty.csv'
DataFrame 'order_review_dirty' berhasil disimpan ke 'dataset/order_review_dirty.csv'
DataFrame 'orders_dataset_dirty' berhasil disimpan ke 'dataset/orders_dataset_dirty.csv'
DataFrame 'product_category' berhasil disimpan ke 'dataset/product_category.csv'
DataFrame 'products_dirty' berhasil disimpan ke 'dataset/products_dirty.csv'
DataFrame 'seller_dirty' berhasil disimpan ke 'dataset/seller_dirty.csv'

Proses penyimpanan selesai.
