In [0]:
# # Delete streaming data in Bronze container
# dbutils.fs.rm("/mnt/realtimedeai/bronze/streaming/", recurse=True)

# # Delete transformed streaming data from Silver container
# dbutils.fs.rm("/mnt/realtimedeai/silver/streaming/", recurse=True)


In [0]:
# Make sure the Bronze streaming landing folder exists before any batch files are written to it.
dbutils.fs.mkdirs("/mnt/realtimedeai/bronze/streaming/")


In [0]:
%pip install faker 

In [0]:
# --------------------------------------
# ✅ Required Libraries
# --------------------------------------
import time
import pandas as pd
from faker import Faker
import random

# --------------------------------------
# ✅ Initialize Faker
# --------------------------------------
fake = Faker()

# --------------------------------------
# ✅ Load Batch Product & Store Data
# --------------------------------------
# Both Bronze batch tables are read with Spark and collected into pandas
# frames so the generator below can sample from plain Python lists.
products_df = (
    spark.read
    .parquet("/mnt/realtimedeai/bronze/batch/products/")
    .toPandas()
)
stores_df = (
    spark.read
    .parquet("/mnt/realtimedeai/bronze/batch/stores/")
    .toPandas()
)

# Lookup structures derived from the batch data:
#   product_id_list   - every product_id row, used for random sampling
#   product_price_map - product_id -> base_price (a later duplicate id overrides an earlier one)
#   store_id_list     - every store_id row, used for random sampling
product_id_list = products_df["product_id"].tolist()
product_price_map = dict(zip(product_id_list, products_df["base_price"].tolist()))
store_id_list = stores_df["store_id"].tolist()

# --------------------------------------
# ✅ Define Correct Streaming Data Generation
# --------------------------------------
def generate_streaming_sales(batch_size=5):
    """Build one simulated batch of sales transactions as a pandas DataFrame.

    Every row pairs a product and a store picked at random from the
    notebook-level ``product_id_list`` / ``store_id_list`` with a random
    quantity between 1 and 5, and prices the sale as
    ``product_price_map[product_id] * quantity`` rounded to 2 decimals.

    Args:
        batch_size: Number of transaction rows to generate. Zero or a
            negative value yields an empty frame that still carries the
            full column set, so every batch shares the same columns.

    Returns:
        pandas.DataFrame with the columns ``transaction_id``, ``product_id``,
        ``store_id``, ``quantity_sold``, ``sale_amount`` and
        ``transaction_time``, in that order.

    Raises:
        IndexError: If ``product_id_list`` or ``store_id_list`` is empty
            (raised by ``random.choice``) and ``batch_size`` is positive.
        KeyError: If a sampled product id is missing from
            ``product_price_map``.
    """
    # Declared once so the column set/order does not depend on having rows.
    columns = [
        "transaction_id",
        "product_id",
        "store_id",
        "quantity_sold",
        "sale_amount",
        "transaction_time",
    ]

    rows = []
    for _ in range(batch_size):
        product_id = random.choice(product_id_list)
        unit_price = product_price_map[product_id]
        store_id = random.choice(store_id_list)
        quantity_sold = random.randint(1, 5)

        rows.append({
            "transaction_id": fake.uuid4(),
            "product_id": product_id,
            "store_id": store_id,
            "quantity_sold": quantity_sold,
            "sale_amount": round(unit_price * quantity_sold, 2),
            # NOTE(review): Faker's iso8601() presumably yields a random
            # timestamp rather than "now" — confirm that is what a
            # real-time sales simulation should emit.
            "transaction_time": fake.iso8601(),
        })

    # Passing `columns` keeps the schema stable even when `rows` is empty;
    # without it pandas would return a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)

# ✅ Preview one batch (for verification)
display(generate_streaming_sales())

# --------------------------------------
# ✅ Define Paths Correctly
# --------------------------------------
# Landing folder for the simulated stream, as addressed through dbutils.
streaming_path = "/mnt/realtimedeai/bronze/streaming/"
# Prefix that turns the path above into one pandas can write to directly
# (presumably the DBFS local-file mount — confirm for this workspace).
local_path_prefix = "/dbfs"

# Ensure the landing directory exists
dbutils.fs.mkdirs(streaming_path)

# --------------------------------------
# ✅ Write Streaming Batches (Simulation)
# --------------------------------------
num_batches = 10               # Adjust if needed
batch_size = 5                 # Adjust if needed

# streaming_path ends with "/", so drop it once here; joining it as-is with
# "/<file name>" produced ".../streaming//sales_stream_batch_N.parquet".
local_streaming_dir = f"{local_path_prefix}{streaming_path.rstrip('/')}"

# NOTE(review): file names depend only on the batch index, so re-running this
# cell overwrites the previous run's files. A file-based streaming reader may
# not pick up overwritten paths again — confirm whether unique names are needed.
for batch_num in range(num_batches):
    df = generate_streaming_sales(batch_size=batch_size)
    file_path = f"{local_streaming_dir}/sales_stream_batch_{batch_num}.parquet"
    df.to_parquet(file_path, index=False)
    print(f"✅ Batch {batch_num + 1}/{num_batches} written to: {file_path}")
    time.sleep(5)  # Simulate streaming delay

print("🚀 Streaming data simulation completed successfully (fully aligned with batch product and store IDs)!")
