In [0]:
%pip install dbldatagen

Python interpreter will be restarted.
Collecting dbldatagen
  Downloading dbldatagen-0.4.0.post1-py3-none-any.whl (122 kB)
Installing collected packages: dbldatagen
Successfully installed dbldatagen-0.4.0.post1
Python interpreter will be restarted.


In [0]:
%pip install faker

Python interpreter will be restarted.
Collecting faker
  Downloading Faker-26.0.0-py3-none-any.whl (1.8 MB)
Installing collected packages: faker
Successfully installed faker-26.0.0
Python interpreter will be restarted.


In [0]:
# Recursively delete the txn_tbl checkpoint directory.
dbutils.fs.rm(
    "dbfs:/FileStore/capstone/txn_tbl/checkpoint_location/",
    recurse=True,
)

Out[1]: True

In [0]:
# Recursively delete the Delta directory the transaction chunks are appended to.
dbutils.fs.rm(
    "dbfs:/FileStore/capstone/txn_tbl/latest/",
    recurse=True,
)

Out[2]: True

In [0]:
%sql
-- Drop the bronze transactions table when it is present.
DROP TABLE IF EXISTS bronze.txn_tbl

In [0]:
# Prerequisite: create the `bronze` schema before running this notebook.
# Also recursively delete the table's warehouse directory before each run.
dbutils.fs.rm(
    "dbfs:/user/hive/warehouse/bronze.db/txn_tbl",
    recurse=True,
)

Out[4]: True

In [0]:
import pandas as pd
import numpy as np
import faker
import random
from datetime import datetime, timedelta
from sklearn.datasets import make_regression
import dbldatagen as dg
from pyspark.sql import SparkSession
from time import sleep
 
class DataGenerator:
    """Generate synthetic customer, branch and transaction data as Spark DataFrames.

    Args:
        num_customers: Number of customer rows produced by ``generate_customers``.
        num_branches: Number of branch rows produced by ``generate_branches``.
        num_transactions: Number of transaction rows produced by
            ``generate_transactions``.
        seed: Optional integer seed. When given, the global ``random`` and
            ``numpy.random`` generators and this instance's Faker are seeded
            so repeated runs produce the same data. ``None`` (the default)
            leaves all generators unseeded.
    """

    # Numeric part of the first transaction id ("T5000").
    TRANSACTION_ID_START = 5000

    def __init__(self, num_customers=1000, num_branches=10, num_transactions=5000, seed=None):
        self.fake = faker.Faker()
        self.num_customers = num_customers
        self.num_branches = num_branches
        self.num_transactions = num_transactions
        self.seed = seed
        if seed is not None:
            # Seed every random source the generators draw from.
            random.seed(seed)
            np.random.seed(seed)
            self.fake.seed_instance(seed)
        self.spark = SparkSession.builder.appName("DataGeneration").getOrCreate()

    def format_phone_number(self):
        """Return a fake phone number rendered as ``(XXX) XXX-XXXX``.

        Only the first ten digits of the Faker phone number are used; any
        further digits are dropped.
        """
        phone_number = self.fake.phone_number()
        digits = ''.join(filter(str.isdigit, phone_number))
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:10]}"

    def random_date(self, start_date, end_date):
        """Return a random ``datetime`` between two dates, both inclusive.

        Args:
            start_date: Lower bound as a ``YYYY-MM-DD`` string.
            end_date: Upper bound as a ``YYYY-MM-DD`` string.

        Returns:
            ``start_date`` (at midnight) plus a whole number of days.
        """
        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
        end_dt = datetime.strptime(end_date, "%Y-%m-%d")
        return start_dt + timedelta(days=random.randint(0, (end_dt - start_dt).days))

    def generate_customers(self):
        """Build the customers dataset.

        Returns:
            Spark DataFrame with columns ``customer_id``, ``name``, ``email``,
            ``phone``, ``address``, ``credit_score``, ``join_date`` and
            ``last_update``; one row per customer.
        """
        names = [self.fake.name() for _ in range(self.num_customers)]
        # NOTE(review): the first and last whitespace-separated tokens are used
        # as-is; if fake.name() yields a title or suffix these end up in the
        # email address - confirm this is acceptable.
        first_names = [name.split()[0] for name in names]
        last_names = [name.split()[-1] for name in names]
        addresses = [self.fake.address().replace('\n', ', ') for _ in range(self.num_customers)]
        email_providers = ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com", "aol.com"]
        emails = [f"{first.lower()}.{last.lower()}@{random.choice(email_providers)}" for first, last in zip(first_names, last_names)]
        phone_numbers = [self.format_phone_number() for _ in range(self.num_customers)]
        # Sorted so that join dates are non-decreasing with the customer id.
        join_dates = sorted([self.random_date("2019-01-01", "2024-07-19").strftime("%Y-%m-%d") for _ in range(self.num_customers)])
        last_updates = [self.random_date("2024-07-20", "2024-07-31").strftime("%Y-%m-%d %H:%M:%S") for _ in range(self.num_customers)]
        credit_scores = [random.randint(500, 850) for _ in range(self.num_customers)]
        customer_ids = [f"C{index+1000:04d}" for index in range(self.num_customers)]

        customers_df = pd.DataFrame({
            "customer_id": customer_ids,
            "name": names,
            "email": emails,
            "phone": phone_numbers,
            "address": addresses,
            "credit_score": credit_scores,
            "join_date": join_dates,
            "last_update": last_updates
        })

        return self.spark.createDataFrame(customers_df)

    def generate_branches(self):
        """Build the branches dataset.

        Returns:
            Spark DataFrame with columns ``branch_id``, ``name``, ``location``
            and ``timezone``; one row per branch. Names, locations and
            timezones are sampled with replacement, so they may repeat.
        """
        branch_ids = [f"B{index:04d}" for index in range(self.num_branches)]
        branch_names = random.choices(["Downtown Branch", "Central Branch", "North Branch", "East Branch", "West Branch"], k=self.num_branches)
        cities = [self.fake.city() for _ in range(100)]
        branch_locations = random.choices(cities, k=self.num_branches)
        branch_timezones = random.choices(["EST", "GMT", "PST", "AEST"], k=self.num_branches)

        branches_df = pd.DataFrame({
            "branch_id": branch_ids,
            "name": branch_names,
            "location": branch_locations,
            "timezone": branch_timezones
        })

        return self.spark.createDataFrame(branches_df)

    def _build_transaction_ids(self):
        """Return ``num_transactions`` sequential ids starting at ``T5000``."""
        first = self.TRANSACTION_ID_START
        return [f"T{index:04d}" for index in range(first, first + self.num_transactions)]

    def _build_status_values(self):
        """Return a shuffled list of exactly ``num_transactions`` statuses.

        About 90% are "completed". The remainder is "pending", computed by
        subtraction so the list length always equals ``num_transactions``.
        """
        completed_count = int(0.9 * self.num_transactions)
        pending_count = self.num_transactions - completed_count
        status_values = ["completed"] * completed_count + ["pending"] * pending_count
        random.shuffle(status_values)
        return status_values

    def generate_transactions(self, customer_ids, branch_ids):
        """Build the transactions dataset.

        Args:
            customer_ids: Sequence of customer ids to sample from (with replacement).
            branch_ids: Sequence of branch ids to sample from (with replacement).

        Returns:
            Spark DataFrame with columns ``transaction_id``, ``customer_id``,
            ``branch_id``, ``channel``, ``transaction_type``, ``amount``,
            ``currency``, ``timestamp`` and ``status``; one row per transaction.
        """
        transaction_ids = self._build_transaction_ids()

        # Amounts: absolute regression targets clipped to [1, 10000].
        X, y = make_regression(n_samples=self.num_transactions, n_features=1, noise=10, random_state=self.seed)
        amounts = np.clip(np.abs(y), 1, 10000)

        # Overwrite 5% of the amounts with outliers drawn from [10000, 100000).
        num_outliers = int(0.05 * len(amounts))
        outliers_indices = np.random.choice(len(amounts), num_outliers, replace=False)
        amounts[outliers_indices] = np.random.uniform(10000, 100000, size=num_outliers)
        amounts = np.round(amounts, 2)

        status_values = self._build_status_values()

        # Timestamps increase monotonically: each transaction follows the
        # previous one by a random gap of 0-719 minutes, starting 2023-01-01.
        random_intervals = np.random.randint(0, 720, size=self.num_transactions)
        cumulative_intervals = np.cumsum(random_intervals)
        start_date = pd.Timestamp("2023-01-01")
        random_timestamps = [start_date + pd.Timedelta(minutes=int(m)) for m in cumulative_intervals]

        transactions_df = pd.DataFrame({
            "transaction_id": transaction_ids,
            "customer_id": random.choices(customer_ids, k=self.num_transactions),
            "branch_id": random.choices(branch_ids, k=self.num_transactions),
            "channel": random.choices(["ATM", "web", "mobile", "branch"], k=self.num_transactions),
            "transaction_type": random.choices(["withdrawal", "deposit", "transfer", "payment"], k=self.num_transactions),
            "amount": amounts,
            "currency": random.choices(["USD", "EUR", "GBP"], k=self.num_transactions),
            "timestamp": random_timestamps,
            "status": status_values
        })

        return self.spark.createDataFrame(transactions_df)
 
# Usage: build the three synthetic datasets with the default sizes.
data_gen = DataGenerator()

customers_spark_df = data_gen.generate_customers()
branches_spark_df = data_gen.generate_branches()

# Collect the generated keys to the driver as plain Python lists so the
# transactions only reference customers and branches that exist.
customer_id_values = customers_spark_df.select("customer_id").rdd.flatMap(lambda row: row).collect()
branch_id_values = branches_spark_df.select("branch_id").rdd.flatMap(lambda row: row).collect()

transactions_spark_df = data_gen.generate_transactions(customer_id_values, branch_id_values)
 




In [0]:
# Overwrite the customer and branch Delta tables with the freshly generated data.
for dimension_df, target_path in (
    (customers_spark_df, "dbfs:/FileStore/capstone/cust_tbl/"),
    (branches_spark_df, "dbfs:/FileStore/capstone/branches_tbl"),
):
    dimension_df.write.format("delta").mode('overwrite').save(target_path)

##### Generating dummy data with anomalies

In [0]:

import time

# Layout of the simulated incremental load.
NUM_CHUNKS = 5
PAUSE_SECONDS = 10
TXN_TABLE_PATH = "dbfs:/FileStore/capstone/txn_tbl/latest/"

# Bring the transactions to the driver in transaction_id order.
transactions_pandas_df = transactions_spark_df.orderBy("transaction_id").toPandas()

# Split row positions instead of the DataFrame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which is deprecated in pandas 2.x.
# Splitting the positions yields exactly the same chunk sizes.
chunk_positions = np.array_split(np.arange(len(transactions_pandas_df)), NUM_CHUNKS)
chunks = [transactions_pandas_df.iloc[positions] for positions in chunk_positions]

for chunk in chunks:
    # Convert the pandas chunk back to a Spark DataFrame and append it to the
    # Delta table, then pause before writing the next one.
    chunk_spark_df = spark.createDataFrame(chunk)
    chunk_spark_df.write.format("delta").mode("append").save(TXN_TABLE_PATH)
    print(f"Wrote {len(chunk)} records. Waiting {PAUSE_SECONDS} seconds...")
    time.sleep(PAUSE_SECONDS)

print("Completed writing txn data")


Wrote 1000 records. Waiting 10 seconds...
Wrote 1000 records. Waiting 10 seconds...
Wrote 1000 records. Waiting 10 seconds...
Wrote 1000 records. Waiting 10 seconds...
Wrote 1000 records. Waiting 10 seconds...
Completed writing txn data
