In [0]:
# # Optional reset: uncomment this cell to recursively delete the existing Products data from Bronze and Silver (only the products folders are removed)
# dbutils.fs.rm("/mnt/realtimedeai/bronze/batch/products/", recurse=True)
# dbutils.fs.rm("/mnt/realtimedeai/silver/batch/products/", recurse=True)

# print("✅ Deleted existing Products data from both Bronze and Silver layers!")


In [0]:
%pip install faker

In [0]:
# ----------------------------------------
# 🚀 Data Simulation: Batch Data Generator
# Author: Savan's Real-Time AI Data Project
# Phase: Data Simulation (Batch)
# ----------------------------------------

# Step 1: Import required libraries
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Step 2: Create the Faker instance that produces fake but realistic values
fake = Faker()

# Step 3: Seed every random source (NumPy, stdlib random, Faker) with the same
# value so tests and re-runs are reproducible
np.random.seed(42)
random.seed(42)
Faker.seed(42)


In [0]:
# ----------------------------------------
# ✅ Step 3: Generate Products Data (Fixed & Clean)
# ----------------------------------------

import pandas as pd
import numpy as np
import random
from faker import Faker

# Fresh Faker instance for this cell
fake = Faker()

# Reset all three random sources to the same seed so the draws made below
# start from a known state whenever this cell is re-run
np.random.seed(42)
random.seed(42)
Faker.seed(42)

# Category → product-name catalog: a product name is only ever paired with
# the category it is listed under
category_product_map = {
    "Electronics": [
        "Laptop", "Smartphone", "Tablet", "Camera", "Smartwatch",
    ],
    "Clothing": [
        "T-shirt", "Jeans", "Jacket", "Sneakers", "Sweater",
    ],
    "Grocery": [
        "Rice", "Pasta", "Olive Oil", "Bread", "Eggs",
    ],
    "Beverages": [
        "Coffee", "Tea", "Juice", "Soda", "Energy Drink",
    ],
    "Household": [
        "Detergent", "Soap", "Toilet Paper", "Towel", "Dishwasher Tablets",
    ],
}

# Size of the generated product catalog (example: 100)
num_products = 100

# Numeric core of the catalog: sequential ids starting at 101, then a random
# price and a random stock level per product. The keyword order matters:
# prices are drawn before stock so the NumPy random stream is consumed in a
# fixed sequence.
products_df = pd.DataFrame(
    {'product_id': range(101, 101 + num_products)}
).assign(
    base_price=np.round(np.random.uniform(2.0, 500.0, num_products), 2),
    stock=np.random.randint(50, 1000, num_products),
)

# ✅ FIX: Assign proper category and product_name based on mapping
def assign_category_and_product():
    """Pick a random category and a random product name from that category.

    Both picks use the stdlib `random` module and read the notebook-level
    `category_product_map`.

    Returns:
        pd.Series: two positional entries, [category, product_name].
    """
    available_categories = list(category_product_map)
    chosen_category = random.choice(available_categories)
    candidate_names = category_product_map[chosen_category]
    return pd.Series([chosen_category, random.choice(candidate_names)])

# Draw one (category, product_name) pair for each catalog row. The row passed
# in by apply(axis=1) is ignored; apply is only used to call the helper per row.
category_name_pairs = products_df.apply(lambda _row: assign_category_and_product(), axis=1)

# The helper returns an unlabeled two-element Series, so the pairs come back
# as positional columns 0 (category) and 1 (product name)
products_df["category"] = category_name_pairs[0]
products_df["product_name"] = category_name_pairs[1]

# ✅ Final Check: preview the completed products table
display(products_df)


In [0]:
# ----------------------------------------
# ✅ Step 4: Generate Stores Data
# ----------------------------------------

# Number of stores to simulate (realistic: 10-50 stores)
num_stores = 20

# Allowed values for the categorical columns (useful later for aggregations)
regions = ['Ontario', 'Quebec', 'British Columbia', 'Alberta']
store_sizes = ['Small', 'Medium', 'Large']

# Build the store catalog: sequential ids first, then the generated attributes.
# Keywords stay in the order store_name, region, city, size so the Faker and
# NumPy random streams are each consumed in the same sequence.
stores_df = pd.DataFrame(
    {'store_id': range(1, 1 + num_stores)}
).assign(
    store_name=[fake.company() for _ in range(num_stores)],
    region=np.random.choice(regions, num_stores),
    city=[fake.city() for _ in range(num_stores)],
    size=np.random.choice(store_sizes, num_stores),
)

# Preview the generated stores
display(stores_df)


In [0]:
# ----------------------------------------
# ✅ Step 5: Generate Promotions Data
# ----------------------------------------

# How many promotions to generate (about 30 is enough for testing)
num_promotions = 30

# Every promotion shares this validity window
start_promo_date = datetime(2025, 4, 1)
end_promo_date = datetime(2025, 4, 30)

# Build the promotions table: sequential promo ids, a product id drawn from the
# products table, a discount tier, and the shared start/end dates as
# 'YYYY-MM-DD' strings (the scalar date is broadcast to every row).
# NOTE(review): np.random.choice samples with replacement by default, so one
# product can receive several promotions over the same window — confirm that
# is intended for downstream joins.
promotions_df = pd.DataFrame(
    {'promo_id': range(201, 201 + num_promotions)}
).assign(
    product_id=np.random.choice(products_df['product_id'], num_promotions),
    discount_percent=np.random.choice([5, 10, 15, 20, 25], num_promotions),
    start_date=start_promo_date.strftime('%Y-%m-%d'),
    end_date=end_promo_date.strftime('%Y-%m-%d'),
)

# Preview the generated promotions
display(promotions_df)


In [0]:
# ----------------------------------------
# ✅ Step 6: Generate Sales Transactions (Batch)
# ----------------------------------------

# Decide number of sales transactions (let's simulate 2000 realistic rows)
num_sales = 2000

def generate_sales_data(n=num_sales):
    """Generate `n` synthetic sales transactions as a pandas DataFrame.

    Each transaction pairs one randomly sampled row of `products_df` with one
    randomly sampled row of `stores_df` (both read from the notebook's global
    scope) and uses the product's `base_price` as the unit price. Promotions
    are not applied here.

    Args:
        n: Number of transactions to generate. Transaction ids run from 50001
            to 50000 + n. Defaults to the value `num_sales` had when this
            function was defined.

    Returns:
        pd.DataFrame with the columns transaction_id, product_id, store_id,
        quantity, price, total_amount and timestamp. The timestamp is a
        'YYYY-MM-DD HH:MM:SS' string produced by Faker for the window from
        10 days ago until now.
    """
    sales_data = []

    for transaction_id in range(50001, 50001 + n):
        product = products_df.sample(1).iloc[0]  # Random product
        store = stores_df.sample(1).iloc[0]      # Random store

        # np.random.randint excludes its upper bound, so 11 is required to get
        # quantities from 1 through 10 inclusive (1, 10 only produced 1..9).
        quantity = np.random.randint(1, 11)

        # No join with promotions here — raw price will be base_price.
        price = product['base_price']
        total_amount = round(price * quantity, 2)
        timestamp = fake.date_time_between(start_date='-10d', end_date='now')

        sales_data.append([
            transaction_id,
            product['product_id'],
            store['store_id'],
            quantity,
            round(price, 2),
            total_amount,
            timestamp.strftime('%Y-%m-%d %H:%M:%S')
        ])

    sales_df = pd.DataFrame(sales_data, columns=[
        'transaction_id', 'product_id', 'store_id',
        'quantity', 'price', 'total_amount', 'timestamp'
    ])

    return sales_df

# Build the clean batch of transactions
sales_transactions_df = generate_sales_data()

# -----------------------------
# Injecting Dirty Data (Very Important)
# -----------------------------

# Three deliberately invalid rows, using the same column layout as the clean data.
# NOTE(review): the None placeholders probably turn product_id / store_id into
# float columns once combined — confirm the expected Bronze schema.
bad_data = pd.DataFrame(
    data=[
        # Missing product_id, negative quantity and total
        (99999, None, 3, -5, 20.0, -100.0, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        # Missing store_id, negative price
        (99998, 102, None, 2, -50.0, 100.0, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        # Duplicate transaction_id (intentional!): 50001 is also the first clean id
        (50001, 101, 1, 3, 15.0, 45.0, datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
    ],
    columns=sales_transactions_df.columns,
)

# Append the dirty rows after the clean ones and renumber the index
sales_transactions_df = pd.concat(
    [sales_transactions_df, bad_data],
    ignore_index=True,
)

# Preview the first rows
sales_transactions_df.head(10)


In [0]:
# Turn each pandas DataFrame into its Spark counterpart
products_sdf = spark.createDataFrame(products_df)
stores_sdf = spark.createDataFrame(stores_df)
promotions_sdf = spark.createDataFrame(promotions_df)
sales_transactions_sdf = spark.createDataFrame(sales_transactions_df)

# Mounted folder that receives the batch datasets
bronze_path = "/mnt/realtimedeai/bronze/batch/"

# Make sure the folder exists before writing (optional safety)
dbutils.fs.mkdirs(bronze_path)

# ✅ Write each dataset as Parquet into its own sub-folder, replacing any
# previous output (Spark handles DBFS paths directly)
for dataset_folder, dataset_sdf in (
    ("products/", products_sdf),
    ("stores/", stores_sdf),
    ("promotions/", promotions_sdf),
    ("sales_transactions/", sales_transactions_sdf),
):
    dataset_sdf.write.mode("overwrite").parquet(bronze_path + dataset_folder)

print("✅ All batch data successfully written to Bronze container in Parquet format (using Spark)!")


In [0]:
# List the contents of the Bronze batch folder and render the listing
display(dbutils.fs.ls("/mnt/realtimedeai/bronze/batch/"))
