# In [7]:
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Synthetic personal cash-flow dataset generator.
#
# Builds one salary row per month plus randomly drawn expense rows for every
# day between start_date and end_date, assembles them into `df`, and writes
# the result to raw/sample-dataset.csv.
#
# The legacy NumPy global RNG is seeded once and every draw below happens in a
# fixed order, so re-running the cell reproduces exactly the same rows. Do not
# reorder the np.random calls unless a different dataset is acceptable.

# Set random seed for reproducibility
np.random.seed(42)

# Date window (inclusive) and the categories available per flow direction
start_date = '2023-01-01'
end_date = '2025-12-31'
categories = {
    'In': ['Salary'],
    'Out': ['Grocery', 'Utilities', 'Transport', 'Entertainment', 'Shopping', 'Others'],
}

# 'Out' categories that have no dedicated rule below; the random "other
# expense" rows pick from these. Loop-invariant, so it is built once here.
other_categories = [c for c in categories['Out'] if c not in ['Utilities', 'Grocery']]

# One entry per calendar day in the window
date_range = pd.date_range(start=start_date, end=end_date)

# Parallel column lists; index i across all four describes one transaction
dates = []
flows = []
amounts = []
category_list = []


def _add_transaction(date, flow, amount, category):
    """Append one transaction to the four parallel column lists."""
    dates.append(date)
    flows.append(flow)
    amounts.append(amount)
    category_list.append(category)


# Salary: one payment on the first day of every month ('MS' = month start)
salary_dates = pd.date_range(start=start_date, end=end_date, freq='MS')
for date in salary_dates:
    _add_transaction(date, 'In', round(np.random.normal(15545, 500), 2), 'Salary')

# Expenses: evaluated day by day
for date in date_range:
    # Utilities: exactly one bill on the 15th of each month
    if date.day == 15:
        _add_transaction(date, 'Out', round(np.random.normal(300, 50), 2), 'Utilities')

    # Groceries: 30% chance per day (roughly two shopping trips a week)
    if np.random.random() < 0.3:
        _add_transaction(date, 'Out', round(np.random.normal(150, 30), 2), 'Grocery')

    # Other expenses: 40% chance of one extra expense on any given day
    if np.random.random() < 0.4:
        amount = round(np.random.exponential(75), 2)
        amount = min(amount, 1000)  # Cap large outliers
        _add_transaction(date, 'Out', amount, np.random.choice(other_categories))

# Create DataFrame
df = pd.DataFrame({
    'Date': dates,
    'Flow': flows,
    'Amount': amounts,
    'Category': category_list
})

# Sort by date
df = df.sort_values('Date').reset_index(drop=True)

# Reduce the timestamps to plain datetime.date objects (written to the CSV as
# yyyy-mm-dd). This is the same result the previous strftime('%-m/%-d/%Y') ->
# pd.to_datetime round trip produced, without the '%-m' / '%-d' directives,
# which are a platform extension and raise ValueError on Windows.
df['Date'] = df['Date'].dt.date

print("Generated dataset shape:", df.shape)

# Build the output path from components so it resolves to raw/sample-dataset.csv
# on every OS (a literal backslash is just a filename character on POSIX), and
# make sure the target directory exists before writing.
output_path = Path('raw') / 'sample-dataset.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Display sample. Kept as the final expression of the cell because a notebook
# only renders the value of the last expression; in a plain script it is a
# no-op.
df.head(10)

# Recorded cell output:
# Generated dataset shape: (812, 4)
