In [1]:
!pip install gretel-client openai torch transformers pandas


Collecting gretel-client
  Downloading gretel_client-0.29.0-py3-none-any.whl.metadata (4.2 kB)
Collecting openai
  Downloading openai-1.88.0-py3-none-any.whl.metadata (25 kB)
Collecting transformers
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting click<9,>=8.1.3 (from gretel-client)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting inflection==0.5.1 (from gretel-client)
  Downloading inflection-0.5.1-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting networkx==3.0 (from gretel-client)
  Downloading networkx-3.0-py3-none-any.whl.metadata (5.1 kB)
Collecting pyarrow==19.0.1 (from gretel-client)
  Downloading pyarrow-19.0.1-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting pycryptodome<4,>=3.19 (from gretel-client)
  Downloading pycryptodome-3.23.0-cp37-abi3-win_amd64.whl.metadata (3.5 kB)
Collecting pydantic>=2 (from gretel-client)
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting rich>=13.7 (from gretel-

In [3]:
import pandas as pd
from gretel_client import configure_session
from gretel_client.projects import create_or_get_unique_project
from gretel_client.helpers import poll
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
from gretel_client import configure_session

session = configure_session(api_key="grt_your_real_key", validate=True)
print("✅ API key is valid.")


In [32]:

# === Step 1: Configure Gretel ===
configure_session(api_key="grtuda225cfaf130a33a1af5af2d2122564630efbbac068cc1fc02cdcf83d6f6c97d", cache="yes", validate=True)
project = create_or_get_unique_project(name="gretel-lstm-gpt-synthetic")

# === Step 2: Load your CSV files (main + lookup) ===
customer_df = pd.read_csv('Customer.csv')
policy_df = pd.read_csv('Policy.csv')
claim_df = pd.read_csv('Claim.csv')
beneficiary_df = pd.read_csv('Beneficiary.csv')
coverage_df = pd.read_csv('Coverage.csv')
premium_df = pd.read_csv('Premium.csv')

policy_status_df = pd.read_csv('PolicyStatus.csv')
policy_type_df = pd.read_csv('PolicyType.csv')
payment_method_df = pd.read_csv('PaymentMethod.csv')


Logged in as nidyat@hexaware.com ✅


In [35]:


# === Step 3: Initialize GPT2 for description generation ===
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

def generate_description(seed_text="Claim details: ", max_length=30):
    inputs = tokenizer.encode(seed_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# === Step 4: Function to train Gretel model and generate synthetic data ===
def train_and_generate_synthetic(df, table_name, num_samples):
    print(f"Training Gretel model for {table_name}...")
    model = project.create_model_obj(model_config="tabular_gan.yaml", data_source=df)
    model.submit()
    poll(model)  # wait for training to complete
    print(f"Generating {num_samples} synthetic records for {table_name}...")
    records = model.get_records(num_records=num_samples)
    synth_df = pd.DataFrame([r['data'] for r in records])
    return synth_df

# === Step 5: Specify how many rows you want per table ===
num_rows = {
    'Customer': 100000,
    'Policy': 200000,
    'Claim': 350000,
    'Beneficiary': 300000,
    'Coverage': 200000,
    'Premium': 250000
}

# === Step 6: Generate synthetic data for each main table ===
synthetic_customer = train_and_generate_synthetic(customer_df, 'Customer', num_rows['Customer'])
synthetic_policy = train_and_generate_synthetic(policy_df, 'Policy', num_rows['Policy'])
synthetic_claim = train_and_generate_synthetic(claim_df, 'Claim', num_rows['Claim'])
synthetic_beneficiary = train_and_generate_synthetic(beneficiary_df, 'Beneficiary', num_rows['Beneficiary'])
synthetic_coverage = train_and_generate_synthetic(coverage_df, 'Coverage', num_rows['Coverage'])
synthetic_premium = train_and_generate_synthetic(premium_df, 'Premium', num_rows['Premium'])

# === Step 7: Manually fix foreign keys to maintain referential integrity ===

# Use lookup tables (sampled with replacement) for FK columns in Policy
synthetic_policy['status_id'] = policy_status_df['status_id'].sample(n=len(synthetic_policy), replace=True).values
synthetic_policy['type_id'] = policy_type_df['type_id'].sample(n=len(synthetic_policy), replace=True).values
synthetic_policy['customer_id'] = synthetic_customer['customer_id'].sample(n=len(synthetic_policy), replace=True).values

# Fix FK columns for other main tables referencing Policy and PaymentMethod
synthetic_claim['policy_id'] = synthetic_policy['policy_id'].sample(n=len(synthetic_claim), replace=True).values

synthetic_beneficiary['policy_id'] = synthetic_policy['policy_id'].sample(n=len(synthetic_beneficiary), replace=True).values

synthetic_coverage['policy_id'] = synthetic_policy['policy_id'].sample(n=len(synthetic_coverage), replace=True).values
synthetic_coverage['coverage_type_id'] = policy_type_df['type_id'].sample(n=len(synthetic_coverage), replace=True).values  # assuming coverage_type_id linked to PolicyType

synthetic_premium['policy_id'] = synthetic_policy['policy_id'].sample(n=len(synthetic_premium), replace=True).values
synthetic_premium['payment_method_id'] = payment_method_df['method_id'].sample(n=len(synthetic_premium), replace=True).values

# === Step 8: Generate synthetic 'description' field for Claim using GPT2 ===
synthetic_claim['description'] = synthetic_claim['description'].apply(lambda x: generate_description("Claim details: "))

# === Step 9: Save all synthetic datasets ===
synthetic_customer.to_csv('synthetic_Customer.csv', index=False)
synthetic_policy.to_csv('synthetic_Policy.csv', index=False)
synthetic_claim.to_csv('synthetic_Claim.csv', index=False)
synthetic_beneficiary.to_csv('synthetic_Beneficiary.csv', index=False)
synthetic_coverage.to_csv('synthetic_Coverage.csv', index=False)
synthetic_premium.to_csv('synthetic_Premium.csv', index=False)

print("Synthetic data generation complete and saved to CSVs.")


Training Gretel model for Customer...


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'date': 'Thu, 19 Jun 2025 05:13:42 GMT', 'server': 'uvicorn', 'content-type': 'application/json', 'strict-transport-security': 'max-age=31536000; includeSubDomains', 'content-length': '204'})
HTTP response body: {"message": "Invalid Gretel Configuration", "context": {"validation_errors": [{"loc": ["models"], "msg": "Unknown model in the config 'synthetics/tabular-gan'", "type": "value_error"}]}, "error_id": null}


In [1]:
from gretel_client import configure_session
from gretel_client.projects import create_or_get_unique_project
from gretel_client.helpers import poll
import pandas as pd

# Paste your actual Gretel API key here
configure_session(
    api_key="grtuda225cfaf130a33a1af5af2d2122564630efbbac068cc1fc02cdcf83d6f6c97d", 
    cache="yes", 
    validate=True
)

# Create or get your project
project = create_or_get_unique_project(name="gretel-lstm-gpt-synthetic")


Logged in as nidyat@hexaware.com ✅


In [2]:
# === Step 2: Load all data ===
customer_df = pd.read_csv("Customer.csv")
policy_df = pd.read_csv("Policy.csv")
claim_df = pd.read_csv("Claim.csv")
beneficiary_df = pd.read_csv("Beneficiary.csv")
coverage_df = pd.read_csv("Coverage.csv")
premium_df = pd.read_csv("Premium.csv")

# Lookup tables
policy_status_df = pd.read_csv("PolicyStatus.csv")
policy_type_df = pd.read_csv("PolicyType.csv")
coverage_type_df = pd.read_csv("CoverageType.csv")
payment_method_df = pd.read_csv("PaymentMethod.csv")

In [4]:
payment_method_df = payment_method_df.rename(columns={"method_id": "payment_method_id"})

In [5]:
# === Step 3: Merge lookup tables for training (temporary) ===
policy_df = policy_df.merge(policy_status_df, on="status_id", how="left")
policy_df = policy_df.merge(policy_type_df, on="type_id", how="left")
coverage_df = coverage_df.merge(coverage_type_df, on="coverage_type_id", how="left")
premium_df = premium_df.merge(payment_method_df, on="payment_method_id", how="left")


In [23]:
pip install --upgrade gretel-client


Note: you may need to restart the kernel to use updated packages.


In [17]:
import yaml

actgan_config = {
    "schema_version": "1.0",
    "name": "tabular-synthesizer",
    "models": [
        {
            "gretel/synthetics": {
                "data_source": "data_source",
                "params": {
                    "model_type": "actgan",  # latest supported model
                    "epochs": 100,
                    "batch_size": 500,
                    "gen_temp": 1.0
                }
            }
        }
    ]
}

with open("actgan_config.yaml", "w") as f:
    yaml.dump(actgan_config, f)



In [6]:
# === Step 4: Train and Generate Synthetic Data ===

def train_and_generate(df, table_name, num_records):
    print(f"Training model for {table_name}...")
    model = project.create_model_obj(model_config="tabular/actgan", data_source=df)
    model.submit()
    poll(model)
    
    # Download the synthetic records
    records_url = model.get_artifact_link("synthetic_data")
    df_synth = pd.read_csv(records_url)
    df_synth["__source_table__"] = table_name
    return df_synth.head(num_records)



In [7]:
# === Step 5: Run generation ===
synth_customer = train_and_generate(customer_df, "Customer", 100000)
synth_policy = train_and_generate(policy_df, "Policy", 200000)
synth_claim = train_and_generate(claim_df, "Claim", 300000)
synth_beneficiary = train_and_generate(beneficiary_df, "Beneficiary", 250000)
synth_coverage = train_and_generate(coverage_df, "Coverage", 300000)
synth_premium = train_and_generate(premium_df, "Premium", 250000)

# === Step 6: Save the outputs ===
synth_customer.to_csv("synthetic_customer.csv", index=False)
synth_policy.to_csv("synthetic_policy.csv", index=False)
synth_claim.to_csv("synthetic_claim.csv", index=False)
synth_beneficiary.to_csv("synthetic_beneficiary.csv", index=False)
synth_coverage.to_csv("synthetic_coverage.csv", index=False)
synth_premium.to_csv("synthetic_premium.csv", index=False)

print("✅ All synthetic datasets generated and saved.")

Training model for Customer...


ModelConfigError: Could not find model config 'tabular/actgan'

In [8]:
from gretel_client.projects.models import list_model_configs

print(list_model_configs())


ImportError: cannot import name 'list_model_configs' from 'gretel_client.projects.models' (C:\Users\2000147874\Downloads\myenv\Lib\site-packages\gretel_client\projects\models.py)

In [29]:
import pandas as pd
from gretel_client import configure_session
from gretel_client.projects import create_or_get_unique_project
from gretel_client.helpers import poll
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# === Step 1: Configure Gretel ===
configure_session(api_key="grtuda225cfaf130a33a1af5af2d2122564630efbbac068cc1fc02cdcf83d6f6c97d", cache="yes", validate=True)
project = create_or_get_unique_project(name="gretel-tabular-gan-synthetic")

Logged in as nidyat@hexaware.com ✅


In [12]:
yaml_config = """
schema_version: "1.0"
models:
  - tabular_gan:
      data_source: __tmp__
      params:
        epochs: auto
        generator_dim: [1024, 1024]
        discriminator_dim: [1024, 1024]
        generator_lr: 0.0001
        discriminator_lr: 0.00033
        batch_size: auto
        auto_transform_datetimes: False
"""

with open("tabular_gan.yaml", "w") as file:
    file.write(yaml_config.strip())


In [30]:
model = project.create_model_obj(model_config=model_config, data_source=table_df)
model.submit()
model.wait_for_completion()


NameError: name 'table_df' is not defined

In [28]:


# === Step 2: Load your CSV files ===
customer_df = pd.read_csv('Customer.csv')
policy_df = pd.read_csv('Policy.csv')
claim_df = pd.read_csv('Claim.csv')
beneficiary_df = pd.read_csv('Beneficiary.csv')
coverage_df = pd.read_csv('Coverage.csv')
premium_df = pd.read_csv('Premium.csv')

policy_status_df = pd.read_csv('PolicyStatus.csv')
policy_type_df = pd.read_csv('PolicyType.csv')
payment_method_df = pd.read_csv('PaymentMethod.csv')

# === Step 3: Initialize GPT2 for descriptions ===
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

def generate_description(seed_text="Claim details: ", max_length=30):
    inputs = tokenizer.encode(seed_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# === Step 4: Function to train Gretel Tabular GAN and generate synthetic data ===
def train_and_generate_tabular_gan(df, table_name, num_samples):
    print(f"Training Gretel Tabular GAN model for {table_name}...")
    
    # Create model specifying the Tabular GAN model config (from Gretel docs)
    model = project.create_model_obj(
        model_config="tabular_gan.yaml",
        data_source=df
    )
    
    model.submit()
    poll(model)  # wait until training completes
    
    print(f"Generating {num_samples} synthetic records for {table_name}...")
    records = model.get_records(num_records=num_samples)
    synth_df = pd.DataFrame([r['data'] for r in records])
    return synth_df

# === Step 5: Specify how many synthetic rows per main table ===
num_rows = {
    'Customer': 100000,
    'Policy': 200000,
    'Claim': 250000,
    'Beneficiary': 300000,
    'Coverage': 250000,
    'Premium': 200000
}

# === Step 6: Generate synthetic datasets for main tables ===
synthetic_customer = train_and_generate_tabular_gan(customer_df, 'Customer', num_rows['Customer'])
synthetic_policy = train_and_generate_tabular_gan(policy_df, 'Policy', num_rows['Policy'])
synthetic_claim = train_and_generate_tabular_gan(claim_df, 'Claim', num_rows['Claim'])
synthetic_beneficiary = train_and_generate_tabular_gan(beneficiary_df, 'Beneficiary', num_rows['Beneficiary'])
synthetic_coverage = train_and_generate_tabular_gan(coverage_df, 'Coverage', num_rows['Coverage'])
synthetic_premium = train_and_generate_tabular_gan(premium_df, 'Premium', num_rows['Premium'])

# === Step 7: Manually fix foreign keys to maintain referential integrity ===
synthetic_policy['status_id'] = policy_status_df['status_id'].sample(n=len(synthetic_policy), replace=True).values
synthetic_policy['type_id'] = policy_type_df['type_id'].sample(n=len(synthetic_policy), replace=True).values
synthetic_policy['customer_id'] = synthetic_customer['customer_id'].sample(n=len(synthetic_policy), replace=True).values

synthetic_claim['policy_id'] = synthetic_policy['policy_id'].sample(n=len(synthetic_claim), replace=True).values
synthetic_beneficiary['policy_id'] = synthetic_policy['policy_id'].sample(n=len(synthetic_beneficiary), replace=True).values
synthetic_coverage['policy_id'] = synthetic_policy['policy_id'].sample(n=len(synthetic_coverage), replace=True).values
synthetic_coverage['coverage_type_id'] = policy_type_df['type_id'].sample(n=len(synthetic_coverage), replace=True).values

synthetic_premium['policy_id'] = synthetic_policy['policy_id'].sample(n=len(synthetic_premium), replace=True).values
synthetic_premium['payment_method_id'] = payment_method_df['method_id'].sample(n=len(synthetic_premium), replace=True).values

# === Step 8: Generate GPT-based description for Claims ===
synthetic_claim['description'] = synthetic_claim['description'].apply(lambda x: generate_description("Claim details: "))

# === Step 9: Save synthetic data to CSV ===
synthetic_customer.to_csv('synthetic_Customer.csv', index=False)
synthetic_policy.to_csv('synthetic_Policy.csv', index=False)
synthetic_claim.to_csv('synthetic_Claim.csv', index=False)
synthetic_beneficiary.to_csv('synthetic_Beneficiary.csv', index=False)
synthetic_coverage.to_csv('synthetic_Coverage.csv', index=False)
synthetic_premium.to_csv('synthetic_Premium.csv', index=False)

print("Tabular GAN synthetic data generation complete and saved to CSV files.")


Training Gretel Tabular GAN model for Customer...


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'date': 'Thu, 19 Jun 2025 04:56:32 GMT', 'server': 'uvicorn', 'content-type': 'application/json', 'strict-transport-security': 'max-age=31536000; includeSubDomains', 'content-length': '193'})
HTTP response body: {"message": "Invalid Gretel Configuration", "context": {"validation_errors": [{"loc": ["models"], "msg": "Unknown model in the config 'tabular-gan'", "type": "value_error"}]}, "error_id": null}


In [None]:
----------------------------------------------------------------------------------------------------------------------------

In [27]:
import os
import pandas as pd
from gretel_client import configure_session
from gretel_client.projects import create_or_get_unique_project

# === Step 1: Set your Gretel API Key (replace with your actual key) ===
os.environ["GRETEL_API_KEY"] = "grtuda225cfaf130a33a1af5af2d2122564630efbbac068cc1fc02cdcf83d6f6c97d"

# === Step 2: Configure the Gretel session ===
configure_session(api_key=os.environ["GRETEL_API_KEY"], validate=True)

# === Step 3: Load your CSV data files for all main tables ===
customer_df = pd.read_csv("Customer.csv")
policy_df = pd.read_csv("Policy.csv")
claim_df = pd.read_csv("Claim.csv")
beneficiary_df = pd.read_csv("Beneficiary.csv")
coverage_df = pd.read_csv("Coverage.csv")
premium_df = pd.read_csv("Premium.csv")

# Lookup tables are only references, so you do NOT generate synthetic data for them.

# === Step 4: Create or get the Gretel project ===
project = create_or_get_unique_project(name="gretel-tabular-gan-synthetic")

# === Step 5: Define the model config dictionary for tabular GAN ===
model_config = {
    "models": [
        {
            "tabular-gan": {
                "epochs": 100
            }
        }
    ]
}
# === Step 6: Define function to train and generate synthetic data ===
def train_and_generate(table_name, df, num_rows):
    print(f"Starting training for {table_name} with {len(df)} original rows...")
    
    # Create the Gretel model object for this table
    model = project.create_model_obj(model_config=model_config, data_source=df)
    
    # Submit the training job and wait for it to complete
    model.submit()
    model.wait_for_completion()
    
    # Download synthetic data CSV artifact
    synthetic_path = model.get_artifact("synthetic_data")
    synthetic_df = pd.read_csv(synthetic_path)
    
    # Sample the desired number of rows (with replacement if requested size > generated rows)
    if num_rows > len(synthetic_df):
        sampled_df = synthetic_df.sample(n=num_rows, replace=True, random_state=42)
    else:
        sampled_df = synthetic_df.sample(n=num_rows, random_state=42)
    
    # Save the sampled synthetic data to CSV
    output_file = f"{table_name}_synthetic.csv"
    sampled_df.to_csv(output_file, index=False)
    
    print(f"Synthetic data for {table_name} saved to: {output_file}\n")

# === Step 7: Generate synthetic data for each main table with desired sample sizes ===
train_and_generate("Customer", customer_df, 1000)
train_and_generate("Policy", policy_df, 1500)
train_and_generate("Claim", claim_df, 800)
train_and_generate("Beneficiary", beneficiary_df, 600)
train_and_generate("Coverage", coverage_df, 700)
train_and_generate("Premium", premium_df, 900)

print("All synthetic datasets generated successfully.")


Logged in as nidyat@hexaware.com ✅
Starting training for Customer with 5000 original rows...


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'date': 'Thu, 19 Jun 2025 04:41:17 GMT', 'server': 'uvicorn', 'content-type': 'application/json', 'strict-transport-security': 'max-age=31536000; includeSubDomains', 'content-length': '193'})
HTTP response body: {"message": "Invalid Gretel Configuration", "context": {"validation_errors": [{"loc": ["models"], "msg": "Unknown model in the config 'tabular-gan'", "type": "value_error"}]}, "error_id": null}


In [25]:
import pkg_resources
print(pkg_resources.get_distribution("gretel-client").version)



  import pkg_resources


0.29.0


In [26]:
from gretel_client.models import get_available_models

print(get_available_models())


ImportError: cannot import name 'get_available_models' from 'gretel_client.models' (C:\Users\2000147874\Downloads\myenv\Lib\site-packages\gretel_client\models\__init__.py)