In [None]:
!pip install faker
!pip install uuid
!pip install random
!pip install datetime
!pip install os

[31mERROR: Could not find a version that satisfies the requirement random (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for random[0m[31m
[0mCollecting datetime
  Using cached DateTime-5.5-py3-none-any.whl.metadata (33 kB)
Collecting zope.interface (from datetime)
  Using cached zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
Downloading DateTime-5.5-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.8/259.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: zope.interface, datetime
Successfully installed datetime-5.5 zope.interface-7.2


In [None]:
import json
import os
from datetime import datetime, timedelta
import random
from faker import Faker
import uuid

# Initialize Faker for realistic data generation
fake = Faker()

# Optional reproducibility switch.
# Leave as None to get a different dataset on every run (the original behaviour);
# set it to an int to make every value drawn through `random` and Faker repeatable.
# Not covered by the seed: identifiers built from uuid.uuid4() and the date
# window, which is anchored on datetime.now().
RANDOM_SEED = None
if RANDOM_SEED is not None:
    random.seed(RANDOM_SEED)
    Faker.seed(RANDOM_SEED)

# --- Configuration Parameters ---
# Directory that receives one JSON file per generated day.
OUTPUT_DIR = "daily_transactions"
# How many daily files to produce.
NUM_DAYS = 20
# Lower and upper bound on the number of transactions written to each daily file.
MIN_TRANSACTIONS_PER_DAY = 100
MAX_TRANSACTIONS_PER_DAY = 200
# Size of the customer pool that is generated up front.
NUM_CUSTOMER_POOL = 500
# Size of the product catalogue that is generated up front.
NUM_PRODUCTS = 30

# --- Pre-defined Lists for Categorical Data (for realism) ---
ACCOUNT_TYPES = [
    'Savings',
    'Checking',
    'Credit Card',
    'Investment',
]
ACCOUNT_CATEGORIES = [
    'Personal',
    'Business',
    'Joint',
    'Student',
]
PAYMENT_SOURCES = [
    'Credit Card',
    'Debit Card',
    'Bank Transfer',
    'PayPal',
    'Crypto',
    'Gift Card',
]
ORDER_STATUSES = [
    'Completed',
    'Pending',
    'Shipped',
    'Delivered',
    'Cancelled',
    'Processing',
]
CURRENCIES = [
    'USD',
    'GBP',
    'EUR',
    'CAD',
    'AUD',
]
# Only these two values are drawn; 'Non-binary' and 'Prefer not to say'
# are deliberately not part of the list.
GENDERS = [
    'Male',
    'Female',
]

# --- Static Product Data for Transactions ---
# The catalogue is built a single time, so every daily file refers to the
# same product IDs, names and prices.
PRODUCT_LIST = [
    {
        'ProductId': f'PROD{product_number:04d}',
        'ProductName': f'Product {product_number} - {fake.word().capitalize()} Widget',
        # Price drawn uniformly from 5.00 to 500.00, rounded to cents.
        'ProductPrice': round(random.uniform(5.0, 500.0), 2),
    }
    for product_number in range(1, NUM_PRODUCTS + 1)
]

# --- Pre-generate Customer and Account Pool ---
# This pool is created once at the start to simulate a stable customer base.
# Daily transactions will randomly pick from this pool, creating realistic incremental data
# where the same customers/accounts appear across different days.
CUSTOMER_POOL = []
_issued_ids = set()  # every CUST_/ACC_ identifier handed out by this cell so far


def _new_unique_id(prefix):
    """Return ``<prefix>_<8 upper-case hex digits>`` that has not been issued yet.

    Only the first 8 hex digits of a UUID4 are kept, so two draws can produce
    the same value once the pool is large. Redraw until the candidate is new
    so that CustomerId and AccountId really are unique within the pool.
    """
    while True:
        candidate = f'{prefix}_{str(uuid.uuid4())[:8].upper()}'
        if candidate not in _issued_ids:
            _issued_ids.add(candidate)
            return candidate


for _ in range(NUM_CUSTOMER_POOL):
    customer_data = {
        'CustomerId': _new_unique_id('CUST'),
        'Firstname': fake.first_name(),
        'Lastname': fake.last_name(),
        'DateOfBirth': fake.date_of_birth(minimum_age=18, maximum_age=80).strftime('%Y-%m-%d'),
        'Postcode': fake.postcode(),
        'Address': fake.address().replace('\n', ', '),  # Ensure address is a single line
        'Email': fake.email(),
        'PhoneNumber': fake.phone_number(),
        'Gender': random.choice(GENDERS),
        # Each customer owns between 1 and 3 accounts.
        'Accounts': [
            {
                'AccountId': _new_unique_id('ACC'),
                'AccountType': random.choice(ACCOUNT_TYPES),
                'AccountCategory': random.choice(ACCOUNT_CATEGORIES),
            }
            for _ in range(random.randint(1, 3))
        ],
    }
    CUSTOMER_POOL.append(customer_data)

# --- Categorical values for the transaction-level columns ---
PAYMENT_STATUSES = ['Paid', 'Failed', 'Refunded']
CUSTOMER_SEGMENTS = ['New', 'Loyal', 'Churned', 'VIP']
DEVICE_TYPES = ['Mobile', 'Desktop', 'Tablet']
REFERRAL_SOURCES = ['Organic Search', 'Social Media', 'Email Campaign', 'Direct', 'Referral Link', 'Paid Ad']


def build_transaction(order_date, sequence_number):
    """Build one flat transaction row dated ``order_date``.

    A customer, one of that customer's accounts and a product are picked at
    random from CUSTOMER_POOL / PRODUCT_LIST; the quantity is 1 to 5.

    Args:
        order_date: datetime whose calendar date is used for the order date,
            payment date, timestamp date and the date part of the IDs.
        sequence_number: 1-based position of the row within its day; it is
            embedded zero-padded to 5 digits in the order/payment/detail IDs.

    Returns:
        dict with the customer, account, order, payment and device columns,
        in the order they are written to the JSON file.
    """
    date_iso = order_date.strftime('%Y-%m-%d')
    # Shared stem of the three IDs: compact date + position within the day.
    id_stem = f"{order_date.strftime('%Y%m%d')}_{sequence_number:05d}"

    # Randomly select a customer and one of their accounts from the pre-generated pool
    customer_record = random.choice(CUSTOMER_POOL)
    account_record = random.choice(customer_record['Accounts'])

    # Randomly select a product and quantity
    product_record = random.choice(PRODUCT_LIST)
    quantity = random.randint(1, 5)

    # Each ID gets its own 4-hex-digit random suffix on top of the shared stem.
    order_id = f"ORD_{id_stem}_{uuid.uuid4().hex[:4].upper()}"
    payment_id = f"PAY_{id_stem}_{uuid.uuid4().hex[:4].upper()}"
    order_details_id = f"DET_{id_stem}_{uuid.uuid4().hex[:4].upper()}"

    return {
        # Customer Details
        'CustomerId': customer_record['CustomerId'],
        'Firstname': customer_record['Firstname'],
        'Lastname': customer_record['Lastname'],
        'DateOfBirth': customer_record['DateOfBirth'],
        'Postcode': customer_record['Postcode'],
        'Address': customer_record['Address'],
        'Email': customer_record['Email'],
        'PhoneNumber': customer_record['PhoneNumber'],
        'Gender': customer_record['Gender'],

        # Account Details
        'AccountId': account_record['AccountId'],
        'AccountType': account_record['AccountType'],
        'AccountCategory': account_record['AccountCategory'],

        # Order Details
        'OrderId': order_id,
        'OrderDate': date_iso,
        'ProductId': product_record['ProductId'],
        'ProductName': product_record['ProductName'],
        'ProductPrice': product_record['ProductPrice'],
        'Quantity': quantity,
        'OrderTotalPrice': round(product_record['ProductPrice'] * quantity, 2),
        'OrderDetailsId': order_details_id,  # Represents a unique line item for this order
        'OrderStatus': random.choice(ORDER_STATUSES),
        'Currency': random.choice(CURRENCIES),

        # Payment Details
        'PaymentId': payment_id,
        'PaymentDate': date_iso,  # Payment happens on the same day as order
        'PaymentSource': random.choice(PAYMENT_SOURCES),

        # Device/Origin Information
        'LaptopIP_Address': fake.ipv4_public(),
        # Date of the file plus a random time of day.
        'TransactionTimestamp': date_iso + ' ' + fake.time(pattern='%H:%M:%S'),

        # Additional related columns
        'ShippingAddress': fake.address().replace('\n', ', '),  # Independent of the customer's Address
        'PaymentStatus': random.choice(PAYMENT_STATUSES),
        'CustomerSegment': random.choice(CUSTOMER_SEGMENTS),
        'DeviceType': random.choice(DEVICE_TYPES),
        'ReferralSource': random.choice(REFERRAL_SOURCES),
        'UserAgent': fake.user_agent(),
    }


# --- Ensure Output Directory Exists ---
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory '{OUTPUT_DIR}' created or already exists.")

print(f"Generating data for {NUM_DAYS} days...")

# --- Main Data Generation Loop ---
# The window is NUM_DAYS consecutive days ending today, so the first file is
# dated NUM_DAYS - 1 days ago.
start_date = datetime.now() - timedelta(days=NUM_DAYS - 1)

for day_offset in range(NUM_DAYS):
    current_date = start_date + timedelta(days=day_offset)
    date_iso = current_date.strftime('%Y-%m-%d')
    num_transactions = random.randint(MIN_TRANSACTIONS_PER_DAY, MAX_TRANSACTIONS_PER_DAY)

    print(f"   Generating {num_transactions} transactions for {date_iso}...")

    daily_transactions = [
        build_transaction(current_date, sequence_number)
        for sequence_number in range(1, num_transactions + 1)
    ]

    # One pretty-printed JSON array per day, named after the date.
    file_path = os.path.join(OUTPUT_DIR, f"{date_iso}.json")
    # Explicit encoding so the file does not depend on the platform default.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(daily_transactions, f, indent=4)

    print(f"   Saved {len(daily_transactions)} transactions to {file_path}")

print(f"\nData generation complete. All files are located in the '{OUTPUT_DIR}' directory.")

Output directory 'daily_transactions' created or already exists.
Generating data for 20 days...
   Generating 189 transactions for 2025-06-03...
   Saved 189 transactions to daily_transactions/2025-06-03.json
   Generating 125 transactions for 2025-06-04...
   Saved 125 transactions to daily_transactions/2025-06-04.json
   Generating 139 transactions for 2025-06-05...
   Saved 139 transactions to daily_transactions/2025-06-05.json
   Generating 186 transactions for 2025-06-06...
   Saved 186 transactions to daily_transactions/2025-06-06.json
   Generating 155 transactions for 2025-06-07...
   Saved 155 transactions to daily_transactions/2025-06-07.json
   Generating 125 transactions for 2025-06-08...
   Saved 125 transactions to daily_transactions/2025-06-08.json
   Generating 147 transactions for 2025-06-09...
   Saved 147 transactions to daily_transactions/2025-06-09.json
   Generating 196 transactions for 2025-06-10...
   Saved 196 transactions to daily_transactions/2025-06-10.json


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# This only works inside Google Colab, where the google.colab package exists.
# NOTE(review): no visible cell reads from or writes to the mount; presumably it
# is meant for copying the generated files to Drive — confirm or remove.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
