In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [2]:
# Parameters
# Everything a reader might want to tune lives in this cell.

# Fixed seed so that Restart Kernel -> Run All regenerates the same dataset.
# The simulation cells draw from NumPy's global random state (np.random.*),
# so seeding it once here, before any draw, is sufficient.
SEED = 42
np.random.seed(SEED)

num_customers = 1000
num_products = 50
num_services = 20
start_date = datetime(2024, 1, 1)
end_date = datetime(2034, 1, 1)
# One billing period per month: freq='MS' yields the first day of every month
# between start_date and end_date, both endpoints included.
date_range = pd.date_range(start_date, end_date, freq='MS')

In [3]:
# Build the identifier pools the simulation draws from.
# IDs are numbered from 1 and zero-padded to at least three digits.
customers = list(f'CUST{i:03d}' for i in range(1, 1 + num_customers))
products = list(f'PROD{i:03d}' for i in range(1, 1 + num_products))
services = list(f'SERV{i:03d}' for i in range(1, 1 + num_services))
# There are half as many parent IDs as customers (integer division).
parent_ids = list(f'PARENT{i:03d}' for i in range(1, 1 + num_customers // 2))

In [4]:
# Accumulator for the generated billing rows: a plain Python list, one entry
# per row, which is converted to a DataFrame once the simulation has run.
data = list()

In [5]:
# Simulate data for each month.
# For every (month, customer) pair a parent ID and a service ID are drawn at
# random, then each product is included with a 50% coin flip.
#
# Start from an empty accumulator inside this cell so that re-running the cell
# regenerates the rows instead of appending a second copy of every row.
data = []
for date in date_range:
    # Prices rise by 5% per calendar year elapsed since the start year. The
    # factor depends only on the month being simulated, so compute it once
    # per date rather than once per row.
    price_multiplier = 1 + (date.year - start_date.year) * 0.05
    for customer in customers:
        # Parent and service are re-drawn for each customer in each month.
        parent_id = np.random.choice(parent_ids)
        service_id = np.random.choice(services)
        for product in products:
            if np.random.rand() > 0.5:  # Randomly decide if the customer has this product
                volume = np.random.randint(1, 20)  # upper bound is exclusive: 1..19
                charge = np.random.randint(50, 200) * price_multiplier  # base price 50..199
                # Row layout must match the column order used when the list
                # is turned into a DataFrame. Charge is stored as a
                # '$'-prefixed string with two decimals.
                data.append([date, parent_id, customer, service_id, product, f'Product {product[-3:]}', volume, f'${charge:.2f}'])

In [6]:
# Turn the accumulated row lists into a DataFrame; the column names are given
# in the same order as the values inside each row.
df = pd.DataFrame(
    data,
    columns=[
        'Billing Date',
        'Parent ID',
        'Customer ID',
        'Service ID',
        'Product ID',
        'Product Description',
        'Volume',
        'Charge',
    ],
)

In [7]:
# Simulate customer behavior.
# The three cohorts are cut from a single random permutation of the customer
# list, so they are guaranteed to be disjoint. Sampling each cohort
# independently would allow the same customer to be, for example, both
# "loyal" and "stopped", or both "stopped" and "merged" (in which case the
# merge would have no rows left to act on).
shuffled_customers = np.random.permutation(customers)

n_loyal = int(num_customers * 0.2)    # 20% of customers
n_stopped = int(num_customers * 0.1)  # 10% of customers
n_merged = int(num_customers * 0.1)   # 10% of customers

# Loyal customers
# NOTE(review): no cell visible in this notebook reads loyal_customers —
# confirm whether it is used elsewhere.
loyal_customers = shuffled_customers[:n_loyal]
# Customers who stop being billed
stopped_customers = shuffled_customers[n_loyal:n_loyal + n_stopped]
# Customers who merge under a different parent ID
merged_customers = shuffled_customers[n_loyal + n_stopped:n_loyal + n_stopped + n_merged]

In [8]:
# Apply customer behavior to the DataFrame.
# Drop every row that belongs to a stopped customer. The explicit .copy()
# makes the result an independent frame, so the .loc assignment below writes
# to `df` itself instead of to a slice of the previous frame (which is what
# triggers pandas' SettingWithCopyWarning).
df = df[~df['Customer ID'].isin(stopped_customers)].copy()

# Draw one replacement parent ID per merged customer, in the order the
# customers appear in merged_customers.
merged_parent_by_customer = {
    customer: np.random.choice(parent_ids) for customer in merged_customers
}

# Overwrite 'Parent ID' on all rows of the merged customers in a single pass,
# rather than re-scanning the whole 'Customer ID' column once per customer.
is_merged = df['Customer ID'].isin(merged_customers)
df.loc[is_merged, 'Parent ID'] = df.loc[is_merged, 'Customer ID'].map(merged_parent_by_customer)

In [9]:
# Decide how the product catalogue changes.
# A fifth of the existing products (drawn without replacement) are sunsetted.
n_sunsetted = int(num_products * 0.2)
sunsetted_products = np.random.choice(products, size=n_sunsetted, replace=False)
# Ten new products are created, numbered directly after the existing ones.
first_new_number = num_products + 1
new_products = [f'PROD{i:03d}' for i in range(first_new_number, first_new_number + 10)]

In [11]:
# Apply product lifecycle to the DataFrame.
# Rows of sunsetted products are removed entirely.
df = df[~df['Product ID'].isin(sunsetted_products)]

# Generate rows for the newly created products. The rows are collected in a
# plain list and appended with ONE concat at the end: concatenating a one-row
# frame inside the loop copies the whole of `df` on every iteration, which
# makes the cell quadratic in the number of rows.
# NOTE(review): this iterates over all of `customers`, including those whose
# rows were removed as stopped customers — confirm that re-adding them here
# is intended.
new_product_rows = []
for product in new_products:
    product_description = f'Product {product[-3:]}'
    for date in date_range:
        # Same 5%-per-year price growth as the main simulation.
        price_multiplier = 1 + (date.year - start_date.year) * 0.05
        for customer in customers:
            if np.random.rand() > 0.5:  # 50% chance the customer has the product this month
                # Draw order (volume, charge, parent, service) matches the
                # order in which the original row-by-row code consumed
                # random numbers.
                volume = np.random.randint(1, 20)
                charge = np.random.randint(50, 200) * price_multiplier
                new_product_rows.append({
                    'Billing Date': date,
                    'Parent ID': np.random.choice(parent_ids),
                    'Customer ID': customer,
                    'Service ID': np.random.choice(services),
                    'Product ID': product,
                    'Product Description': product_description,
                    'Volume': volume,
                    'Charge': f'${charge:.2f}',
                })

# Skip the concat when nothing was generated so no empty frame is appended.
if new_product_rows:
    df = pd.concat([df, pd.DataFrame(new_product_rows)], ignore_index=True)

In [None]:
# Ensure we have at least 100,000 rows by topping up with random records.
# The number of missing rows is known up front, so the rows are generated in
# a list and appended with a single concat instead of growing `df` one row
# at a time (each per-row concat copies the whole frame).
# NOTE(review): the pools below are the full `customers` list and
# `products + new_products`, so top-up rows can reference customers removed
# as stopped and products removed as sunsetted — confirm that is intended.
rows_missing = 100000 - len(df)
product_pool = products + new_products
top_up_rows = []
for _ in range(max(rows_missing, 0)):
    # Index into the DatetimeIndex to get a pandas Timestamp.
    # np.random.choice(date_range) returns a numpy.datetime64, which has no
    # `.year` attribute, so the price calculation below would raise
    # AttributeError.
    date = date_range[np.random.randint(len(date_range))]
    customer = np.random.choice(customers)
    parent_id = np.random.choice(parent_ids)
    service_id = np.random.choice(services)
    product = np.random.choice(product_pool)
    volume = np.random.randint(1, 20)
    # Same 5%-per-year price growth as the main simulation.
    charge = np.random.randint(50, 200) * (1 + (date.year - start_date.year) * 0.05)
    top_up_rows.append({
        'Billing Date': date,
        'Parent ID': parent_id,
        'Customer ID': customer,
        'Service ID': service_id,
        'Product ID': product,
        'Product Description': f'Product {product[-3:]}',
        'Volume': volume,
        'Charge': f'${charge:.2f}',
    })

# Nothing to append when the frame already has enough rows.
if top_up_rows:
    df = pd.concat([df, pd.DataFrame(top_up_rows)], ignore_index=True)

In [None]:
# Write the final frame to a CSV file in the working directory, without the
# row index column.
output_path = 'dummy_data.csv'
df.to_csv(output_path, index=False)