In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import os
import pickle

# Seed every random source used by this notebook so that
# "Restart Kernel -> Run All" regenerates exactly the same dataset.
# Faker.seed() seeds the RNG shared by all Faker instances; np.random.seed()
# seeds the legacy global NumPy RNG that the np.random.* calls below draw from.
RANDOM_SEED = 42
Faker.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

fake = Faker()

In [2]:
# === CONFIGURATION ===
# Desired on-disk footprint of the whole synthetic dataset.
TARGET_FINAL_SIZE_GB = 10
BYTES_PER_GB = 1024 ** 3

# Rough per-row byte cost of each table, used only for sizing.
ESTIMATED_ROW_SIZES = {
    'persons': 200,
    'applications': 300,
    'loans': 300,
    'vwlob': 150,
    'wof': 150,
    'transactions': 200,
    'transactions_p2p': 200,
    'credit_info': 300,
    'income': 300
}

# Reference row counts; only the ratios between tables matter, since every
# count is multiplied by the same scaling factor below.
BASE_ROW_COUNTS = {
    'persons': 300_000,
    'applications': 5_000_000,
    'loans': 3_000_000,
    'vwlob': 250_000_000,
    'wof': 100_000,
    'transactions': 100_000_000,
    'transactions_p2p': 50_000_000,
    'credit_info': 8_000_000,
    'income': 3_000_000
}

# Byte size the reference dataset would have at the estimated row sizes.
estimated_total_bytes = sum(
    base_rows * ESTIMATED_ROW_SIZES[table_name]
    for table_name, base_rows in BASE_ROW_COUNTS.items()
)

# Single multiplier that shrinks (or grows) the reference dataset to the target size.
scaling_factor = (TARGET_FINAL_SIZE_GB * BYTES_PER_GB) / estimated_total_bytes

# Apply the multiplier to every table, truncating to whole rows and never
# letting a table drop below one row.
TARGET_ROW_COUNTS = {
    table_name: max(1, int(base_rows * scaling_factor))
    for table_name, base_rows in BASE_ROW_COUNTS.items()
}

print(f"Estimated row counts to meet size goal (~{TARGET_FINAL_SIZE_GB}GB):")
for table_name, scaled_rows in TARGET_ROW_COUNTS.items():
    print(f"- {table_name}: {scaled_rows:,} rows")

Estimated row counts to meet size goal (~10GB):
- persons: 43,960 rows
- applications: 732,679 rows
- loans: 439,607 rows
- vwlob: 36,633,975 rows
- wof: 14,653 rows
- transactions: 14,653,590 rows
- transactions_p2p: 7,326,795 rows
- credit_info: 1,172,287 rows
- income: 439,607 rows


In [None]:
# === OUTPUT FOLDER ===
# Create the directory the dataset is written to. exist_ok=True means no error
# is raised when the directory already exists, so this cell can be re-run.
os.makedirs("./synthetic_data", exist_ok=True)

# === GENERATION FUNCTIONS ===
def generate_persons(n):
    """Build a synthetic ``persons`` table with ``n`` rows.

    Parameters
    ----------
    n : int
        Number of rows to produce; ``personid`` is numbered 1..n.

    Returns
    -------
    pandas.DataFrame
        Columns: personid, first_name, last_name, address, birth_date,
        gender_id, maritalstatus_id.
    """
    # Columns are drawn in output order so the sequence of random calls
    # matches the column order of the resulting frame.
    person_ids = range(1, n + 1)
    first_names = [fake.first_name() for _ in person_ids]
    last_names = [fake.last_name() for _ in person_ids]
    # Faker addresses are multi-line; flatten them to a single line.
    addresses = [fake.address().replace("\n", ", ") for _ in person_ids]
    birth_dates = [
        fake.date_of_birth(minimum_age=18, maximum_age=70) for _ in person_ids
    ]
    genders = np.random.choice([1, 2], size=n)
    marital_statuses = np.random.choice([1, 2, 3], size=n)

    columns = {
        'personid': person_ids,
        'first_name': first_names,
        'last_name': last_names,
        'address': addresses,
        'birth_date': birth_dates,
        'gender_id': genders,
        'maritalstatus_id': marital_statuses,
    }
    return pd.DataFrame(columns)

def generate_applications(n, person_ids):
    """Create ``n`` synthetic applications, each assigned to a random person.

    Parameters
    ----------
    n : int
        Number of rows to produce; ``appid`` is numbered 1..n.
    person_ids : sequence
        Pool that ``personid`` is sampled from (with replacement).

    Returns
    -------
    pandas.DataFrame
        Columns: appid, personid, productid, amount, status_id, register_date.
    """
    app_ids = range(1, n + 1)
    owners = np.random.choice(person_ids, size=n)
    # randint's upper bound is exclusive, so product ids fall in 1..19.
    products = np.random.randint(1, 20, size=n)
    amounts = np.random.uniform(500, 10000, size=n).round(2)
    statuses = np.random.choice([1, 2, 3, 4], size=n)
    registered = [
        fake.date_between(start_date='-3y', end_date='today') for _ in app_ids
    ]

    columns = {
        'appid': app_ids,
        'personid': owners,
        'productid': products,
        'amount': amounts,
        'status_id': statuses,
        'register_date': registered,
    }
    return pd.DataFrame(columns)

def generate_loans(n, applications):
    """Turn ``n`` randomly chosen applications into synthetic loans.

    Parameters
    ----------
    n : int
        Number of loans; ``loanid`` is numbered 1..n.
    applications : pandas.DataFrame
        Frame providing the ``appid``, ``personid`` and ``amount`` columns.
        It is sampled without replacement, so it needs at least ``n`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns: loanid, appid, personid, loan_size, loan_period,
        loan_percent, loan_value_date, loan_end_date.
    """
    # Each loan inherits its application id, owner and amount from the sampled row.
    picked = applications.sample(n)
    loan_ids = range(1, n + 1)

    frame_columns = {
        'loanid': loan_ids,
        'appid': picked['appid'].values,
        'personid': picked['personid'].values,
        'loan_size': picked['amount'].values,
    }
    frame_columns['loan_period'] = np.random.randint(6, 60, size=n)
    frame_columns['loan_percent'] = np.random.uniform(5, 25, size=n).round(2)
    # The two date windows meet at "-6m", so the value date never falls after
    # the end date.
    frame_columns['loan_value_date'] = [
        fake.date_between(start_date='-2y', end_date='-6m') for _ in loan_ids
    ]
    frame_columns['loan_end_date'] = [
        fake.date_between(start_date='-6m', end_date='+1y') for _ in loan_ids
    ]
    return pd.DataFrame(frame_columns)

def generate_credit_info(n, appids):
    """Produce ``n`` synthetic credit-information rows for random applications.

    Parameters
    ----------
    n : int
        Number of rows to produce.
    appids : sequence
        Pool that ``appid`` is sampled from (with replacement).

    Returns
    -------
    pandas.DataFrame
        Columns: appid, score, riskgrade, click1, click3, click6, click12,
        probabilitydefault, total_loans_count, total_loans_sum,
        live_overdue_days, date.
    """
    columns = {}
    columns['appid'] = np.random.choice(appids, size=n)
    columns['score'] = np.random.randint(300, 850, size=n)
    columns['riskgrade'] = np.random.choice(['A', 'B', 'C', 'D', 'E'], size=n)

    # Each click counter has its own exclusive upper bound.
    click_bounds = (('click1', 5), ('click3', 10), ('click6', 20), ('click12', 30))
    for column_name, upper in click_bounds:
        columns[column_name] = np.random.randint(0, upper, size=n)

    columns['probabilitydefault'] = np.random.rand(n)
    columns['total_loans_count'] = np.random.randint(0, 10, size=n)
    columns['total_loans_sum'] = np.random.uniform(0, 10000, size=n).round(2)
    columns['live_overdue_days'] = np.random.randint(0, 90, size=n)
    columns['date'] = [
        fake.date_between(start_date='-1y', end_date='today') for _ in range(n)
    ]
    return pd.DataFrame(columns)

def generate_income(n, appids, person_ids):
    """Produce ``n`` synthetic income records.

    Parameters
    ----------
    n : int
        Number of rows to produce.
    appids : sequence
        Pool that ``appid`` is sampled from (with replacement).
    person_ids : sequence
        Pool that ``personid`` is sampled from (with replacement).

    Returns
    -------
    pandas.DataFrame
        Columns: appid, personid, gross_income, net_income, income_type,
        expenses, job_id, date.
    """
    # NOTE(review): appid and personid are drawn independently, so a row's
    # personid is not necessarily the owner of its appid - confirm this is
    # acceptable for downstream joins.
    sampled_apps = np.random.choice(appids, size=n)
    sampled_persons = np.random.choice(person_ids, size=n)
    # Gross and net income are independent draws as well.
    gross = np.random.uniform(500, 8000, size=n).round(2)
    net = np.random.uniform(400, 7000, size=n).round(2)
    kinds = np.random.choice(['salary', 'freelance', 'business'], size=n)
    spending = np.random.uniform(100, 4000, size=n).round(2)
    jobs = np.random.randint(1, 20, size=n)
    recorded_on = [
        fake.date_between(start_date='-2y', end_date='today') for _ in range(n)
    ]

    columns = {
        'appid': sampled_apps,
        'personid': sampled_persons,
        'gross_income': gross,
        'net_income': net,
        'income_type': kinds,
        'expenses': spending,
        'job_id': jobs,
        'date': recorded_on,
    }
    return pd.DataFrame(columns)

def generate_transactions(n, person_ids):
    """Produce ``n`` synthetic card/merchant transactions.

    Parameters
    ----------
    n : int
        Number of rows; ``transactionid`` is numbered 1..n.
    person_ids : sequence
        Pool that ``personid`` is sampled from (with replacement).

    Returns
    -------
    pandas.DataFrame
        Columns: transactionid, personid, transaction_code, amount, currency,
        merchant_id, date.
    """
    txn_ids = range(1, n + 1)
    payers = np.random.choice(person_ids, size=n)
    codes = np.random.randint(100, 200, size=n)
    amounts = np.random.uniform(5, 5000, size=n).round(2)
    currencies = np.random.choice(['USD', 'EUR', 'GEL'], size=n)
    merchants = np.random.randint(1, 1000, size=n)
    dates = [
        fake.date_between(start_date='-2y', end_date='today') for _ in txn_ids
    ]

    columns = {
        'transactionid': txn_ids,
        'personid': payers,
        'transaction_code': codes,
        'amount': amounts,
        'currency': currencies,
        'merchant_id': merchants,
        'date': dates,
    }
    return pd.DataFrame(columns)

def generate_transactions_p2p(n, person_ids):
    """Produce ``n`` synthetic person-to-person transfers.

    Parameters
    ----------
    n : int
        Number of rows; ``transactionid`` is numbered 1..n.
    person_ids : sequence
        Pool that both sender and receiver ids are sampled from.

    Returns
    -------
    pandas.DataFrame
        Columns: transactionid, personid_from, personid_to, transaction_code,
        amount, currency, date.
    """
    txn_ids = range(1, n + 1)
    # Sender and receiver are independent draws from the same pool.
    senders = np.random.choice(person_ids, size=n)
    receivers = np.random.choice(person_ids, size=n)
    codes = np.random.randint(200, 300, size=n)
    amounts = np.random.uniform(1, 3000, size=n).round(2)
    currencies = np.random.choice(['USD', 'EUR', 'GEL'], size=n)
    dates = [
        fake.date_between(start_date='-2y', end_date='today') for _ in txn_ids
    ]

    columns = {
        'transactionid': txn_ids,
        'personid_from': senders,
        'personid_to': receivers,
        'transaction_code': codes,
        'amount': amounts,
        'currency': currencies,
        'date': dates,
    }
    return pd.DataFrame(columns)

def generate_wof(n, person_ids, loan_ids):
    """Produce ``n`` synthetic ``wof`` rows linking a person, a loan and a date.

    Parameters
    ----------
    n : int
        Number of rows to produce.
    person_ids : sequence
        Pool that ``personid`` is sampled from (with replacement).
    loan_ids : sequence
        Pool that ``loanid`` is sampled from (with replacement).

    Returns
    -------
    pandas.DataFrame
        Columns: personid, loanid, date.
    """
    # NOTE(review): personid and loanid are drawn independently, so the person
    # on a row need not be the borrower of that loan - confirm this is intended.
    persons = np.random.choice(person_ids, size=n)
    loans = np.random.choice(loan_ids, size=n)
    dates = [
        fake.date_between(start_date='-2y', end_date='today') for _ in range(n)
    ]
    return pd.DataFrame({'personid': persons, 'loanid': loans, 'date': dates})

def generate_related_persons(n, person_ids):
    """Produce ``n`` synthetic person-to-person relationship rows.

    Parameters
    ----------
    n : int
        Number of rows to produce.
    person_ids : sequence
        Pool that both ends of the relationship are sampled from.

    Returns
    -------
    pandas.DataFrame
        Columns: personid, related_personid, relationship_id, date, is_valid.
    """
    # NOTE(review): both ends are independent draws from the same pool, so a
    # row can relate a person to themselves - confirm whether that matters.
    subjects = np.random.choice(person_ids, size=n)
    relatives = np.random.choice(person_ids, size=n)
    # randint's upper bound is exclusive, so relationship ids fall in 1..5.
    kinds = np.random.randint(1, 6, size=n)
    since = [
        fake.date_between(start_date='-5y', end_date='today') for _ in range(n)
    ]
    validity = np.random.choice([True, False], size=n)

    columns = {
        'personid': subjects,
        'related_personid': relatives,
        'relationship_id': kinds,
        'date': since,
        'is_valid': validity,
    }
    return pd.DataFrame(columns)

def generate_vwlob(n, loans):
    """Generate up to ``n`` daily overdue-day observations for sampled loans.

    ``n // 30`` loans are sampled with replacement. Each sampled loan
    contributes one record per day for at most 30 consecutive days starting at
    its ``loan_value_date``. ``overdue_days`` follows a two-state random walk:
    while "increasing" it counts 1, 2, 3, ... and switches to "zero" with
    probability 0.1 per day; while "zero" it records 0 and switches back with
    probability 0.2 per day, after which the count restarts at 1.

    Parameters
    ----------
    n : int
        Upper bound on the number of records. Fewer rows are returned when
        ``n`` is not a multiple of 30 or when sampled loans are skipped or
        last fewer than 30 days.
    loans : pandas.DataFrame
        Must provide ``appid``, ``loanid``, ``loan_value_date`` and
        ``loan_end_date``; the date columns may hold anything
        ``pd.to_datetime`` accepts.

    Returns
    -------
    pandas.DataFrame
        Columns: appid, loanid, overdue_days, monthly_date. Despite its name,
        ``monthly_date`` holds consecutive daily timestamps. The columns are
        present even when no record was generated.
    """
    columns = ['appid', 'loanid', 'overdue_days', 'monthly_date']

    # Sampling with replacement from an empty frame raises; there is nothing
    # to generate in that case, so return an empty, correctly-shaped table.
    if loans.empty:
        return pd.DataFrame(columns=columns)

    sampled_loans = loans.sample(n // 30, replace=True)  # simulate ~30 records per loan
    records = []

    # Iterate the needed columns directly instead of iterrows(): it avoids
    # building a Series per row and keeps the integer ids from being upcast.
    loan_fields = zip(
        sampled_loans['appid'],
        sampled_loans['loanid'],
        sampled_loans['loan_value_date'],
        sampled_loans['loan_end_date'],
    )
    for appid, loanid, value_date, final_date in loan_fields:
        if len(records) >= n:
            break

        start_date = pd.to_datetime(value_date)
        end_date = pd.to_datetime(final_date)
        duration = (end_date - start_date).days

        if duration < 20:
            continue  # skip unrealistically short loans

        daily_dates = pd.date_range(start=start_date, periods=min(duration, 30))
        count = 0
        state = 'increasing'

        for dt in daily_dates:
            if len(records) >= n:
                break

            if state == 'increasing':
                count += 1
                overdue = count
                if np.random.rand() < 0.1:
                    state = 'zero'
            else:
                overdue = 0
                if np.random.rand() < 0.2:
                    # Reset to 0 (not 1): the increasing branch increments
                    # before recording, so the new streak starts at 1.
                    count = 0
                    state = 'increasing'

            records.append({
                'appid': appid,
                'loanid': loanid,
                'overdue_days': overdue,
                'monthly_date': dt,
            })

    # Passing the column list keeps the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=columns)
    
# === GENERATE ===
# Tables are built in dependency order (persons -> applications -> loans ->
# everything else). The id pools are materialised once and shared by every
# generator that samples from them.
persons_df = generate_persons(TARGET_ROW_COUNTS['persons'])
person_id_pool = persons_df['personid'].tolist()

applications_df = generate_applications(TARGET_ROW_COUNTS['applications'], person_id_pool)
app_id_pool = applications_df['appid'].tolist()

loans_df = generate_loans(TARGET_ROW_COUNTS['loans'], applications_df)
credit_info_df = generate_credit_info(TARGET_ROW_COUNTS['credit_info'], app_id_pool)
income_df = generate_income(TARGET_ROW_COUNTS['income'], app_id_pool, person_id_pool)
transactions_df = generate_transactions(TARGET_ROW_COUNTS['transactions'], person_id_pool)
transactions_p2p_df = generate_transactions_p2p(TARGET_ROW_COUNTS['transactions_p2p'], person_id_pool)
wof_df = generate_wof(TARGET_ROW_COUNTS['wof'], person_id_pool, loans_df['loanid'].tolist())

# related_persons has no entry in TARGET_ROW_COUNTS; it is sized at half the
# number of persons.
related_row_count = int(TARGET_ROW_COUNTS['persons'] * 0.5)
related_df = generate_related_persons(related_row_count, person_id_pool)

vwlob_df = generate_vwlob(TARGET_ROW_COUNTS['vwlob'], loans_df)


# === SAVE AS PICKLE ===
# Bundle every generated table under its table name; this dict is what gets pickled.
synthetic_data = dict(
    persons=persons_df,
    applications=applications_df,
    loans=loans_df,
    credit_info=credit_info_df,
    income=income_df,
    transactions=transactions_df,
    transactions_p2p=transactions_p2p_df,
    wof=wof_df,
    related_persons=related_df,
    vwlob=vwlob_df,
)


# === ESTIMATE MEMORY SIZES ===
# deep=True makes pandas measure the Python objects held in object columns
# (strings, dates) rather than just the pointer arrays.
print("\n=== Estimated Table Sizes (in-memory) ===")
table_bytes = {}
for table_name, frame in synthetic_data.items():
    table_bytes[table_name] = frame.memory_usage(deep=True).sum()
    print(f"{table_name:<20}: {table_bytes[table_name] / (1024 ** 3):.3f} GB")

total_size_bytes = sum(table_bytes.values())
total_size_gb = total_size_bytes / (1024 ** 3)
print(f"\n{'TOTAL':<20}: {total_size_gb:.3f} GB")


# Write the whole bundle as a single pickle file in the output folder.
with open('./synthetic_data/synthetic_dataset.pkl', 'wb') as pickle_file:
    pickle.dump(synthetic_data, pickle_file)