# Setup

In [None]:
import numpy as np
import pandas as pd
import os

# Locations of the three raw tables inside the Kaggle input dataset.
customers_csv = "/kaggle/input/etl-csvs/customers.csv"
products_catalog_csv = "/kaggle/input/etl-csvs/products_catalog.csv"
transactions_log_csv = "/kaggle/input/etl-csvs/transactions_log.csv"

# Read each table with pandas' default parsing; explicit dtype handling
# is done separately, after loading.
customers = pd.read_csv(customers_csv)
products_catalog = pd.read_csv(products_catalog_csv)
transactions_log = pd.read_csv(transactions_log_csv)
print("Loaded dataframes from CSVs")

# Report rows × columns of every table as loaded, so the effect of the
# later joins on row/column counts can be compared against this baseline.
pre_merge_report = f"""
* Before Merging *
customers.csv shape: {customers.shape[0]} × {customers.shape[1]}
products_catalog.csv shape: {products_catalog.shape[0]} × {products_catalog.shape[1]}
transactions_log.csv shape: {transactions_log.shape[0]} × {transactions_log.shape[1]}
"""
print(pre_merge_report)

# Lightweight type casting

In [None]:
def _to_text(column):
    """Return *column* cast to ``str`` while keeping missing values missing.

    On object-dtype pandas a bare ``astype(str)`` converts NaN/None into the
    literal text ``"nan"``/``"None"``; those fake strings then look like real
    IDs/labels to joins, ``isin`` checks and null counts. Masking the result
    with the original null positions keeps genuine values as text and leaves
    nulls as nulls.
    """
    return column.astype(str).where(column.notna())


# The raw transactions file names its date column 'Date'; the remainder of
# the notebook refers to it as 'Timestamp'.
transactions_log.rename(columns={'Date': 'Timestamp'}, inplace=True)

# Identifier and descriptive columns that must be handled as text so join
# keys have the same type in every table.
for table, text_columns in (
    (customers, ['CustomerID', 'Business_Category', 'Business_Size']),
    (
        products_catalog,
        ['SKU', 'Rev_GL_Class', 'Sub_Category', 'Item_Description', 'Brand', 'Attributes'],
    ),
    (transactions_log, ['TransactionID', 'CustomerID', 'SKU']),
):
    for column_name in text_columns:
        table[column_name] = _to_text(table[column_name])

# Date-like columns.
customers['Customer_Since'] = pd.to_datetime(customers['Customer_Since'])
transactions_log['Timestamp'] = pd.to_datetime(transactions_log['Timestamp'])

# Numeric columns: unparseable entries become NaN instead of raising.
products_catalog['Unit_Price'] = pd.to_numeric(products_catalog['Unit_Price'], errors='coerce')
# downcast='integer' asks pandas for the smallest integer dtype that fits.
transactions_log['Quantity'] = pd.to_numeric(transactions_log['Quantity'], downcast='integer', errors='coerce')

print("Lightweight type casting completed")

# Merging

In [None]:
# Build the denormalised table: transactions LEFT JOIN customers (on
# CustomerID) LEFT JOIN products_catalog (on SKU). Left joins keep every
# transaction row even when the lookup table has no matching key.
#
# NOTE: when a key column is absent this cell only prints a message. In that
# case `merged_df` is either not assigned here at all (CustomerID missing) or
# holds just the customer join, unsorted (SKU missing).
customer_key_present = (
    'CustomerID' in transactions_log.columns and 'CustomerID' in customers.columns
)

if not customer_key_present:
    print("Cannot merge transactions and customers: 'CustomerID' missing")
else:
    merged_df = transactions_log.merge(customers, how='left', on='CustomerID')

    sku_key_present = 'SKU' in merged_df.columns and 'SKU' in products_catalog.columns
    if not sku_key_present:
        print("Cannot merge merged_df and products_catalog: 'SKU' missing")
    else:
        merged_df = merged_df.merge(products_catalog, how='left', on='SKU')
        # Order rows by customer, then chronologically, then by product.
        merged_df = merged_df.sort_values(by=['CustomerID', 'Timestamp', 'SKU'])
        print(f"""
* After Merging *
merged dataframe shape: {merged_df.shape[0]} × {merged_df.shape[1]}
""")




# Saving

In [None]:
# Measure the merged frame's real footprint; deep=True also counts the
# Python string objects held in object-dtype columns.
merged_size_bytes = merged_df.memory_usage(deep=True).sum()
mem_MB = merged_size_bytes / 1_048_576  # bytes → MB
print(f"Estimated in-memory size: {mem_MB:.2f} MB")

# Output file stem; the extension is appended according to the chosen format.
output_path = os.path.join("/kaggle", "working", "merged_transactions")

# From roughly 1 GB upwards write Parquet; smaller frames are written as CSV.
if mem_MB >= 1000:
    parquet_file = output_path + ".parquet"
    merged_df.to_parquet(parquet_file, index=False)
    print("Saved dataset as Parquet")
else:
    csv_file = output_path + ".csv"
    merged_df.to_csv(csv_file, index=False)
    print("Saved dataset as CSV")

# Verifying

In [None]:
def _drop_unmatched_rows(transactions, key, reference, reference_name, found_message, max_drop_percent=2):
    """Check that every ``transactions[key]`` value exists in ``reference[key]``.

    The outcome is reported with ``print``. When unmatched rows exist and they
    make up less than ``max_drop_percent`` percent of ``transactions``, they
    are removed; otherwise the data is left untouched and manual review is
    recommended.

    A plain ``if`` is used rather than ``assert``: assertions are stripped
    when Python runs with ``-O``, which would silently skip both the report
    and the clean-up.

    Args:
        transactions: DataFrame whose ``key`` column is being validated.
        key: Name of the column shared by both frames.
        reference: DataFrame holding the valid ``key`` values.
        reference_name: Table name used in the success message.
        found_message: ``str.format`` template with a ``{count}`` field,
            printed when unmatched rows are found.
        max_drop_percent: Percentage threshold below which unmatched rows are
            dropped automatically.

    Returns:
        A ``(transactions, unmatched)`` tuple: the possibly filtered
        transactions and the rows whose key had no match.
    """
    # Compute the membership mask once; it serves both the report and the filter.
    is_known = transactions[key].isin(reference[key])
    unmatched = transactions.loc[~is_known]

    if unmatched.empty:
        print(f"All {key}s in transactions exist in {reference_name} table")
        return transactions, unmatched

    missing_rows = len(unmatched)
    print(found_message.format(count=missing_rows))

    # len(transactions) cannot be zero here: at least one row is unmatched.
    percent_missing = (missing_rows / len(transactions)) * 100

    if percent_missing < max_drop_percent:
        print(f"Removing {missing_rows} rows with missing {key}s ({percent_missing:.2f}%)")
        transactions = transactions[is_known]
    else:
        print(f"More than {max_drop_percent}% of rows ({percent_missing:.2f}%) have missing {key}s. Manual review recommended")

    return transactions, unmatched


# NOTE(review): in the visible cell order this check runs after `merged_df`
# has been built and saved, so rows dropped from `transactions_log` here are
# not reflected in that output unless the merge/save cells are re-run.

# CustomerID check
transactions_log, missing_customers = _drop_unmatched_rows(
    transactions_log,
    'CustomerID',
    customers,
    'customers',
    "❌ Found {count} transactions with missing CustomerIDs",
)

# SKU check (runs on whatever rows remain after the CustomerID check)
transactions_log, missing_skus = _drop_unmatched_rows(
    transactions_log,
    'SKU',
    products_catalog,
    'products_catalog',
    "Found {count} transactions with missing SKUs.",
)
