In [None]:

# Install (or upgrade, via -U) the Rockfish SDK with its `labs` extra.
# `-f` adds https://packages.rockfish.ai as an extra location where pip looks for packages.
# NOTE(review): the version is unpinned, so each run may pull a newer SDK — consider pinning.
!pip install -U 'rockfish[labs]' -f 'https://packages.rockfish.ai'

In [None]:
# READ DATA FROM SNOWFLAKE TABLE
# Loads the source table into the pandas DataFrame `df`, which the Rockfish
# training cell and the comparison-plot cell further down both read.
from snowflake.snowpark.context import get_active_session

# The table you want to read
TARGET_TABLE_FULL = '<DATABASE>.<SCHEMA>.<TABLE_NAME>'

try:
    # --- 1. Get Active Snowpark Session ---
    print("Getting active Snowpark session...")
    session = get_active_session()

    # --- 2. Create Snowpark DataFrame ---
    print(f"Creating Snowpark DataFrame for table: {TARGET_TABLE_FULL}")
    snowpark_df = session.table(TARGET_TABLE_FULL)

    # Currently the whole table is converted; narrow `df_to_convert` here
    # (filter / select / sample) if only part of the table is wanted.
    df_to_convert = snowpark_df

    # --- 3. Convert to Pandas DataFrame ---
    print("Converting Snowpark DataFrame to pandas DataFrame...")
    # This executes the query and pulls data into the notebook's memory
    df = df_to_convert.to_pandas()
    print(f"Successfully fetched {len(df)} rows into pandas DataFrame.")
    print("DataFrame Head:")
    print(df.head())

except Exception as e:
    # Typical causes: SnowparkSQLException, missing table, permission errors.
    print(f"An error occurred: {e}")
    # Re-raise so the notebook stops here. Swallowing the error would leave
    # `df` undefined and make later cells fail with an unrelated NameError.
    raise

In [None]:
# Rockfish Environment Key & API URL
import rockfish as rf
import rockfish.actions as ra
import rockfish.labs as rl
import asyncio
import os

# Alias for the module whose `run()` drives the async pipeline defined below.
runner = asyncio

# Replace the placeholder with your own key; do not commit a real key with
# the notebook.
api_key = "<YOUR_API_KEY>"
api_url = "https://api.rockfish.ai"

# Export the key's VALUE to the environment.
# The previous `%env ROCKFISH_API_KEY=api_key` stored the literal text
# "api_key" because `%env` only expands `$name` / `{name}` references.
# Assigning through os.environ also avoids echoing the key into cell output.
os.environ["ROCKFISH_API_KEY"] = api_key
async def start_rockfish():
    """Train a TabGAN model on the notebook's `df` and generate synthetic data.

    Reads the notebook-level names `api_url`, `api_key` and `df`.  Two
    workflows are started on the remote connection, one after the other:

    1. a training workflow fed with the dataset built from `df`;
    2. a generation workflow fed with the first model of the training
       workflow, whose output is saved under the name "synthetic".

    Returns:
        The result of `to_local(conn)` for the last dataset yielded by the
        generation workflow, or ``None`` if it yields no dataset.
    """
    conn = rf.Connection.remote(api_url, api_key)

    # ---- Onboard ---------------------------------------------------------
    # Perform any necessary feature engineering or preprocessing
    dataset = rf.Dataset.from_pandas("<DATASET_NAME>", df)

    # Columns with pandas dtype "object" are encoded as categorical;
    # every other column of the dataset is encoded as continuous.
    categorical_fields = df.select_dtypes(include=["object"]).columns
    print(categorical_fields)

    categorical_meta = [
        {"field": name, "type": "categorical"} for name in categorical_fields
    ]
    continuous_meta = [
        {"field": name, "type": "continuous"}
        for name in dataset.table.column_names
        if name not in categorical_fields
    ]
    config = {
        "encoder": {"metadata": categorical_meta + continuous_meta},
        "tabular-gan": {"epochs": 10, "records": 10000},
    }
    print(dataset.table.column_names)

    # ---- Train -----------------------------------------------------------
    train = ra.TrainTabGAN(config)

    train_builder = rf.WorkflowBuilder()
    train_builder.add_dataset(dataset)
    train_builder.add_action(train, parents=[dataset])
    train_workflow = await train_builder.start(conn)
    print(f"Training - Workflow: {train_workflow.id()}")

    # Echo the training workflow's log stream until it is exhausted.
    async for entry in train_workflow.logs():
        print(entry)

    model = await train_workflow.models().nth(0)
    await model.add_labels(conn)

    # ---- Generate --------------------------------------------------------
    generate = ra.GenerateTabGAN(config)
    save = ra.DatasetSave({"name": "synthetic"})

    gen_builder = rf.WorkflowBuilder()
    gen_builder.add_model(model)
    gen_builder.add_action(generate, parents=[model])
    gen_builder.add_action(save, parents=[generate])
    gen_workflow = await gen_builder.start(conn)
    print(f"Generate - Workflow: {gen_workflow.id()}")

    # Keep the last dataset the workflow produces (None if there is none).
    synthetic = None
    async for remote_ds in gen_workflow.datasets():
        synthetic = await remote_ds.to_local(conn)

    return synthetic
# Run the train + generate pipeline to completion and keep its result.
syn_data = runner.run(start_rockfish())

import csv

# start_rockfish() returns None when the generate workflow yields no dataset;
# fail with a clear message instead of an AttributeError on `.to_pandas()`.
if syn_data is None:
    raise RuntimeError(
        "Rockfish generate workflow produced no synthetic dataset; "
        "check the workflow logs."
    )

syn_data_pandas = syn_data.to_pandas()

# Written relative to the current working directory, with every field quoted.
# NOTE(review): later cells read /home/app/synthetic_demo.csv, which assumes
# the notebook's working directory is /home/app — confirm.
syn_data_pandas.to_csv("synthetic_demo.csv", index=False, quoting=csv.QUOTE_ALL)
print("VVVVVVVVVVVVVVVVVVVVVVVVVVVVV")
print("VVV Sample Synthetic Data VVV")
print("VVVVVVVVVVVVVVVVVVVVVVVVVVVVV")
syn_data_pandas.head()


In [None]:
import warnings

# Install a global "ignore" filter: warnings are suppressed from here on.
warnings.filterwarnings('ignore')

# Rebuild the source dataset from `df` so it can be compared with `syn_data`.
dataset = rf.Dataset.from_pandas("<DATASET_NAME>", df)

# One bar chart per column: counts of the 10 largest groups, source first,
# synthetic second.
for col in dataset.table.column_names:
    per_dataset_counts = [
        rf.metrics.count_all(table, col, nlargest=10)
        for table in (dataset, syn_data)
    ]
    rl.vis.plot_bar(per_dataset_counts, col, f"{col}_count")

In [None]:
# Sanity check: list the exported CSV at the path the upload cell reads from.
# NOTE(review): the export cell writes "synthetic_demo.csv" to a relative path;
# this assumes the notebook's working directory is /home/app — confirm.
!ls /home/app/synthetic_demo.csv

In [None]:
-- Create the named stage that the PUT and COPY INTO cells below use.
-- Use the same <DESTINATION_...> values here as in those cells; the stage
-- created here must be the one the file is uploaded to and loaded from.
-- NOTE: CREATE OR REPLACE recreates the stage if it already exists.
CREATE OR REPLACE STAGE <DESTINATION_DATABASE>.<DESTINATION_SCHEMA>.<DESTINATION_STAGE_NAME>
    COMMENT = 'Stage for Rockfish demo data';

In [None]:
# Import python packages
from pathlib import Path

import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Local CSV produced by the export cell, and the stage it is uploaded to.
notebook_file_path_str = '/home/app/synthetic_demo.csv'
target_stage = '@<DESTINATION_DATABASE>.<DESTINATION_SCHEMA>.<DESTINATION_STAGE_NAME>'

# Verify the exact file that will be uploaded. The previous `!ls` looked for
# a differently named file (synthetic_rf_demo.csv), and "File found" was
# printed without any check.
if not Path(notebook_file_path_str).is_file():
    raise FileNotFoundError(
        f"Synthetic data file not found: {notebook_file_path_str}"
    )
print(f"File found at: {notebook_file_path_str}")

# --- Prepare and Execute PUT command ---
# AUTO_COMPRESS=TRUE is why the COPY INTO cell references
# "synthetic_demo.csv.gz"; OVERWRITE=TRUE replaces a previous upload.
file_uri = f"file://{notebook_file_path_str}"
put_sql = f"PUT '{file_uri}' '{target_stage}' AUTO_COMPRESS=TRUE OVERWRITE=TRUE"
print(f"Executing via Snowpark session: {put_sql}")

# Run the PUT through the Snowpark session and collect its result rows.
put_results = session.sql(put_sql).collect()

print("PUT command executed successfully.")
print("Results:")
for row in put_results:
    print(row)  # one result row per PUT result, as returned by collect()

In [None]:
-- Empty the destination table before the COPY INTO cell runs, so rows from
-- an earlier load are not kept alongside the new synthetic data.
TRUNCATE TABLE <DESTINATION_DATABASE>.<DESTINATION_SCHEMA>.<DESTINATION_SYNTHETIC_DATA_TABLE_NAME>;

-- Confirm the table is empty (expected result: 0).
SELECT COUNT(*) FROM <DESTINATION_DATABASE>.<DESTINATION_SCHEMA>.<DESTINATION_SYNTHETIC_DATA_TABLE_NAME>;

In [None]:
-- Load the staged synthetic CSV into the destination table.
-- The file format mirrors how the CSV was exported: a header row and every
-- field wrapped in double quotes (csv.QUOTE_ALL in the export cell).
COPY INTO <DESTINATION_DATABASE>.<DESTINATION_SCHEMA>.<DESTINATION_SYNTHETIC_DATA_TABLE_NAME>
FROM '@"<DESTINATION_DATABASE>"."<DESTINATION_SCHEMA>"."<DESTINATION_STAGE_NAME>"/synthetic_demo.csv.gz' -- File in the named stage; ".gz" because the PUT used AUTO_COMPRESS=TRUE
FILE_FORMAT = ( -- YOU MUST DEFINE THE FILE FORMAT
    TYPE = CSV
    FIELD_DELIMITER = ','
    SKIP_HEADER = 1 -- Adjust if your CSV has no header
    EMPTY_FIELD_AS_NULL = TRUE
    FIELD_OPTIONALLY_ENCLOSED_BY = '"'
)
ON_ERROR = 'CONTINUE'; -- Loading continues past rows with errors; compare the row count in the next cell with the number of generated records. Other error handling options are available.

In [None]:
-- Row count after the load; with ON_ERROR = 'CONTINUE' in the COPY INTO cell,
-- a lower count than the number of generated records means rows were skipped.
SELECT COUNT(*) FROM <DESTINATION_DATABASE>.<DESTINATION_SCHEMA>.<DESTINATION_SYNTHETIC_DATA_TABLE_NAME>;

In [None]:
-- Preview a few loaded rows to eyeball the column mapping.
SELECT * FROM <DESTINATION_DATABASE>.<DESTINATION_SCHEMA>.<DESTINATION_SYNTHETIC_DATA_TABLE_NAME> LIMIT 4;