In [1]:
import pandas as pd
import pickle
from sdv.metadata import Metadata
from sdv.multi_table import HMASynthesizer



In [7]:
# Step 1: Load your CSVs (or replace with your actual data loading)
customer_df = pd.read_csv('Customer.csv')
policy_df = pd.read_csv('Policy.csv')
claim_df = pd.read_csv('Claim.csv')
beneficiary_df = pd.read_csv('Beneficiary.csv')
coverage_df = pd.read_csv('Coverage.csv')
paymentmethod_df = pd.read_csv('PaymentMethod.csv')   # Lookup table
policystatus_df = pd.read_csv('PolicyStatus.csv')     # Lookup table
policytype_df = pd.read_csv('PolicyType.csv')         # Lookup table
premium_df = pd.read_csv('Premium.csv')

# Step 2: Define metadata for each table (example for main tables)
# For lookup tables, just specify as static categorical data in main tables.

customer_metadata = Metadata()
customer_metadata.add_table(
    table_name='Customer',
    columns={
        'customer_id': {'sdtype': 'id'},
        'first_name': {'sdtype': 'categorical'},
        'last_name': {'sdtype': 'categorical'},
        'date_of_birth': {'sdtype': 'datetime'},
        'address': {'sdtype': 'address'},
        'phone_number': {'sdtype': 'phone_number'}
    },
    primary_key='customer_id'
)
# Example: Customize field types if needed (dates, categorical, etc.)
customer_metadata.update_column('customer_id',table_name='Customer', sdtype='id')
customer_metadata.update_column('date_of_birth',table_name='Customer', sdtype='datetime')

# Policy Metadata
policy_metadata = Metadata()
policy_metadata.detect_from_dataframe(data=policy_df)
policy_metadata.update_column('policy_id', sdtype='id')
policy_metadata.update_column('start_date', sdtype='datetime')
policy_metadata.update_column('end_date', sdtype='datetime')
policy_metadata.update_column('customer_id', sdtype='id_reference', ref_table='customer', ref_field='customer_id')
policy_metadata.update_column('status_id', sdtype='categorical')  # linked to policystatus
policy_metadata.update_column('type_id', sdtype='categorical')    # linked to policytype

# Claim Metadata
claim_metadata = Metadata()
claim_metadata.detect_from_dataframe(data=claim_df)
claim_metadata.update_column('claim_id', sdtype='id')
claim_metadata.update_column('policy_id', sdtype='id_reference', ref_table='policy', ref_field='policy_id')
claim_metadata.update_column('date_filed', sdtype='datetime')

# Beneficiary Metadata
beneficiary_metadata = Metadata()
beneficiary_metadata.detect_from_dataframe(data=beneficiary_df)
beneficiary_metadata.update_column('beneficiary_id', sdtype='id')
beneficiary_metadata.update_column('policy_id', sdtype='id_reference', ref_table='policy', ref_field='policy_id')
beneficiary_metadata.update_column('date_of_birth', sdtype='datetime')

# Coverage Metadata
coverage_metadata = Metadata()
coverage_metadata.detect_from_dataframe(data=coverage_df)
coverage_metadata.update_column('coverage_id', sdtype='id')
coverage_metadata.update_column('policy_id', sdtype='id_reference', ref_table='policy', ref_field='policy_id')
coverage_metadata.update_column('coverage_type_id', sdtype='categorical')  # Assume coverage_type lookup

# Premium Metadata
premium_metadata = Metadata()
premium_metadata.detect_from_dataframe(data=premium_df)
premium_metadata.update_column('premium_id', sdtype='id')
premium_metadata.update_column('policy_id', sdtype='id_reference', ref_table='policy', ref_field='policy_id')
premium_metadata.update_column('payment_method_id', sdtype='categorical')  # linked to paymentmethod
premium_metadata.update_column('due_date', sdtype='datetime')



TypeError: MultiTableMetadata.add_table() got an unexpected keyword argument 'columns'

In [None]:
# Step 3: Train HMASynthesizer models separately for each main table

def train_and_save_model(data, metadata, model_filename, epochs=50):
    print(f"Training model and saving to {model_filename} ...")
    model = HMASynthesizer(metadata=metadata)
    model.fit(data, epochs=epochs)
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved: {model_filename}")

# Train models for each table
train_and_save_model(customer_df, customer_metadata, 'customer_model.pkl')
train_and_save_model(policy_df, policy_metadata, 'policy_model.pkl')
train_and_save_model(claim_df, claim_metadata, 'claim_model.pkl')
train_and_save_model(beneficiary_df, beneficiary_metadata, 'beneficiary_model.pkl')
train_and_save_model(coverage_df, coverage_metadata, 'coverage_model.pkl')
train_and_save_model(premium_df, premium_metadata, 'premium_model.pkl')

# Step 4: Function to load model and generate synthetic data later

def load_model_and_generate(model_filename, num_samples=100):
    with open(model_filename, 'rb') as f:
        model = pickle.load(f)
    synthetic_data = model.sample(num_samples)
    return synthetic_data

# Usage example:
if __name__ == "__main__":
    # Generate 10 synthetic customers
    synth_customers = load_model_and_generate('customer_model.pkl', num_samples=10)
    print(synth_customers.head())
    
    # Generate 5 synthetic policies
    synth_policies = load_model_and_generate('policy_model.pkl', num_samples=5)
    print(synth_policies.head())


In [17]:
import sdv
print(sdv.__version__)



1.22.1


In [9]:
### Step 0: Imports and setup
import pandas as pd
from sdv.metadata import SingleTableMetadata, MultiTableMetadata
from sdv.multi_table import HMASynthesizer
from sdv.single_table import CTGANSynthesizer
import pickle

### Step 1: Load all data (update filenames if needed)
customer_df = pd.read_csv("Customer.csv")
policy_df = pd.read_csv("Policy.csv")
claim_df = pd.read_csv("Claim.csv")
beneficiary_df = pd.read_csv("Beneficiary.csv")
coverage_df = pd.read_csv("Coverage.csv")
premium_df = pd.read_csv("Premium.csv")
policy_status_df = pd.read_csv("PolicyStatus.csv")
policy_type_df = pd.read_csv("PolicyType.csv")
payment_method_df = pd.read_csv("PaymentMethod.csv")

# Combine all tables into a dictionary
all_data = {
    'Customer': customer_df,
    'Policy': policy_df,
    'Claim': claim_df,
    'Beneficiary': beneficiary_df,
    'Coverage': coverage_df,
    'Premium': premium_df,
    'PolicyStatus': policy_status_df,
    'PolicyType': policy_type_df,
    'PaymentMethod': payment_method_df
}



In [10]:
### Step 2: Auto-generate single-table metadata
from sdv.metadata import SingleTableMetadata

metadata_dict = {}
for table_name, df in all_data.items():
    stm = SingleTableMetadata()
    stm.detect_from_dataframe(df)
    metadata_dict[table_name] = stm.to_dict()


In [12]:
### Step 3: Define relationships and create MultiTableMetadata
multi_table_metadata_dict = {
    'tables': metadata_dict,
    'relationships': [
        {'parent_table': 'Customer', 'parent_primary_key': 'customer_id', 'child_table': 'Policy', 'child_foreign_key': 'customer_id'},
        {'parent_table': 'PolicyStatus', 'parent_primary_key': 'status_id', 'child_table': 'Policy', 'child_foreign_key': 'status_id'},
        {'parent_table': 'PolicyType', 'parent_primary_key': 'type_id', 'child_table': 'Policy', 'child_foreign_key': 'type_id'},
        {'parent_table': 'Policy', 'parent_primary_key': 'policy_id', 'child_table': 'Claim', 'child_foreign_key': 'policy_id'},
        {'parent_table': 'Policy', 'parent_primary_key': 'policy_id', 'child_table': 'Beneficiary', 'child_foreign_key': 'policy_id'},
        {'parent_table': 'Policy', 'parent_primary_key': 'policy_id', 'child_table': 'Coverage', 'child_foreign_key': 'policy_id'},
        {'parent_table': 'Policy', 'parent_primary_key': 'policy_id', 'child_table': 'Premium', 'child_foreign_key': 'policy_id'},
        {'parent_table': 'PaymentMethod', 'parent_primary_key': 'method_id', 'child_table': 'Premium', 'child_foreign_key': 'payment_method_id'}
    ]
}

metadata = MultiTableMetadata()
metadata.load_from_dict(multi_table_metadata_dict)


{
    "tables": {
        "Customer": {
            "columns": {
                "customer_id": {
                    "sdtype": "id"
                },
                "first_name": {
                    "pii": true,
                    "sdtype": "first_name"
                },
                "last_name": {
                    "pii": true,
                    "sdtype": "last_name"
                },
                "date_of_birth": {
                    "datetime_format": "%m/%d/%Y",
                    "sdtype": "datetime"
                },
                "address1": {
                    "sdtype": "id"
                },
                "phone_number": {
                    "pii": true,
                    "sdtype": "phone_number"
                }
            },
            "primary_key": "address1"
        },
        "Policy": {
            "columns": {
                "policy_id": {
                    "sdtype": "id"
                },
                "policy_number": {
       

In [17]:
import pandas as pd
from sdv.metadata import MultiTableMetadata, Metadata
from sdv.multi_table import HMASynthesizer
from sdv.single_table import CTGANSynthesizer
import pickle


In [18]:
customer_df = pd.read_csv('Customer.csv')
policy_df = pd.read_csv('Policy.csv')
claim_df = pd.read_csv('Claim.csv')
beneficiary_df = pd.read_csv('Beneficiary.csv')
coverage_df = pd.read_csv('Coverage.csv')
premium_df = pd.read_csv('Premium.csv')

# Lookup tables (used only for referential integrity)
policy_status_df = pd.read_csv('PolicyStatus.csv')
policy_type_df = pd.read_csv('PolicyType.csv')
payment_method_df = pd.read_csv('PaymentMethod.csv')

all_data = {
    'Customer': customer_df,
    'Policy': policy_df,
    'Claim': claim_df,
    'Beneficiary': beneficiary_df,
    'Coverage': coverage_df,
    'Premium': premium_df,
    'PolicyStatus': policy_status_df,
    'PolicyType': policy_type_df,
    'PaymentMethod': payment_method_df
}


In [19]:
metadata = MultiTableMetadata()
metadata.detect_table_from_data(name='Customer', data=customer_df, primary_key='customer_id')
metadata.detect_table_from_data(name='Policy', data=policy_df, primary_key='policy_id')
metadata.detect_table_from_data(name='Claim', data=claim_df, primary_key='claim_id')
metadata.detect_table_from_data(name='Beneficiary', data=beneficiary_df, primary_key='beneficiary_id')
metadata.detect_table_from_data(name='Coverage', data=coverage_df, primary_key='coverage_id')
metadata.detect_table_from_data(name='Premium', data=premium_df, primary_key='premium_id')

# Add lookup tables (read-only)
metadata.detect_table_from_data(name='PolicyStatus', data=policy_status_df, primary_key='status_id')
metadata.detect_table_from_data(name='PolicyType', data=policy_type_df, primary_key='type_id')
metadata.detect_table_from_data(name='PaymentMethod', data=payment_method_df, primary_key='method_id')

# Add relationships
metadata.add_relationship('Customer', 'customer_id', 'Policy', 'customer_id')
metadata.add_relationship('PolicyStatus', 'status_id', 'Policy', 'status_id')
metadata.add_relationship('PolicyType', 'type_id', 'Policy', 'type_id')
metadata.add_relationship('Policy', 'policy_id', 'Claim', 'policy_id')
metadata.add_relationship('Policy', 'policy_id', 'Beneficiary', 'policy_id')
metadata.add_relationship('Policy', 'policy_id', 'Coverage', 'policy_id')
metadata.add_relationship('Policy', 'policy_id', 'Premium', 'policy_id')
metadata.add_relationship('PaymentMethod', 'method_id', 'Premium', 'payment_method_id')


AttributeError: 'MultiTableMetadata' object has no attribute 'detect_table_from_data'