In [None]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

In [3]:

# Seed NumPy's legacy global generator so the random draws made below are
# repeatable from one run to the next.
np.random.seed(42)

# How many rows to generate, and the first/last calendar day a transaction
# date may fall on.
num_transactions = 100000
start_date, end_date = datetime(2024, 1, 1), datetime(2025, 9, 30)

# Product mix: product_probs[k] is the sampling weight of product_types[k].
product_types = ['Renters', 'Homeowners', 'Flood']
product_probs = [0.80, 0.15, 0.05]

# Regional sampling weights per product. Each list is positionally aligned
# with `regions` (Northeast, Midwest, South, West) and sums to 1.
regions = ['Northeast', 'Midwest', 'South', 'West']
region_probs = dict(
    Renters=[0.19, 0.18, 0.37, 0.26],
    Homeowners=[0.17, 0.21, 0.39, 0.23],
    Flood=[0.15, 0.10, 0.65, 0.10],
)

# (low, high) bounds of the uniform draw used for each product's amount.
amount_ranges = dict(
    Renters=(12, 25),
    Homeowners=(150, 225),
    Flood=(35, 300),
)

# Whole days between the first and the last allowed transaction date.
delta_days = (end_date - start_date).days

# Build the rows one at a time. Each row makes four draws from NumPy's global
# generator in a fixed order (product, region, amount, day offset); that order
# determines which values a seeded run produces, so it must not be shuffled.
transactions = []
for txn_id in range(1, num_transactions + 1):
    product = np.random.choice(product_types, p=product_probs)
    # The regional weights and the amount bounds both depend on the product.
    region = np.random.choice(regions, p=region_probs[product])
    amount = round(np.random.uniform(*amount_ranges[product]), 2)
    # randint's upper bound is exclusive; the +1 lets end_date itself be drawn.
    day_offset = np.random.randint(0, delta_days + 1)
    transactions.append(dict(
        transaction_id=txn_id,
        product_type=product,
        region=region,
        amount=amount,
        transaction_date=start_date + timedelta(days=day_offset),
    ))

# One DataFrame row per generated record; columns follow the dict key order.
df = pd.DataFrame(transactions)

# Fraction of all transactions that belong to each product.
product_share = df['product_type'].value_counts(normalize=True)
print("Overall product distribution:", product_share, sep="\n")

# Within each product, the fraction of its transactions per region
# (rows: product, columns: region), rounded to two decimals.
region_share = (
    df.groupby('product_type')['region']
    .value_counts(normalize=True)
    .unstack()
    .round(2)
)
print("\nRegional distribution by product:", region_share, sep="\n")

Overall product distribution:
product_type
Renters       0.79911
Homeowners    0.14987
Flood         0.05102
Name: proportion, dtype: float64

Regional distribution by product:
region        Midwest  Northeast  South  West
product_type                                 
Flood            0.11       0.15   0.65  0.10
Homeowners       0.21       0.17   0.39  0.23
Renters          0.18       0.19   0.37  0.26


In [5]:
# Preview the first ten generated rows (left as the cell's last expression so
# the notebook renders it as a table).
df.iloc[:10]

Unnamed: 0,transaction_id,product_type,region,amount,transaction_date
0,1,Renters,West,21.52,2024-01-21
1,2,Renters,Northeast,12.76,2024-03-28
2,3,Renters,Northeast,20.46,2024-11-04
3,4,Flood,South,91.27,2024-07-10
4,5,Flood,South,197.09,2024-09-09
5,6,Renters,Midwest,19.95,2025-04-20
6,7,Flood,Midwest,59.01,2025-07-16
7,8,Renters,West,18.07,2025-04-29
8,9,Renters,Northeast,12.85,2025-01-22
9,10,Homeowners,South,178.91,2024-12-11


In [6]:
# Destination of the generated dataset. The default is the original absolute,
# machine-specific location, so existing behaviour is unchanged. Set the
# TRANSACTIONS_CSV environment variable to write somewhere else (for example
# when running on another machine) without editing this cell. An unset or
# empty variable falls back to the default.
path = (
    os.environ.get("TRANSACTIONS_CSV")
    or "C:/Users/tokud/Projects/Insurance Linear Regression/transactions.csv"
)

# index=False keeps the DataFrame's positional index out of the file.
df.to_csv(path, index=False)