In [1]:
%%capture
# Install (or upgrade, -U) the Rockfish SDK with its "labs" extra, using the Rockfish
# package index as an extra find-links source (-f). %%capture hides pip's output.
%pip install -U 'rockfish[labs]' -f 'https://docs142.rockfish.ai/packages/index.html'

In [2]:
import rockfish as rf
import rockfish.actions as ra
import rockfish.labs as rl

Replace `YOUR_API_KEY` in the cell below with your assigned API key string, written without quotes.

For example, if your assigned API key is `abcd1234`, run the following:
```python
%env ROCKFISH_API_KEY=abcd1234
conn = rf.Connection.from_env()
```
If you do not have an API key, please reach out to support@rockfish.ai.

In [1]:
# Export the API key into the kernel's environment. Replace YOUR_API_KEY with the
# assigned key (no quotes); avoid saving a real key in a notebook you share.
%env ROCKFISH_API_KEY=YOUR_API_KEY
# Build the connection from environment variables (presumably reads ROCKFISH_API_KEY).
conn = rf.Connection.from_env()

In [4]:
# Download the example time-series dataset, finance.csv.
# --no-clobber skips the download when the file already exists locally.
!wget --no-clobber https://docs142.rockfish.ai/tutorials/finance.csv

File ‘finance.csv’ already there; not retrieving.



In [5]:
# Load the downloaded CSV into a Rockfish dataset (the first argument is presumably
# the dataset name, the second the file path) and preview it as a pandas DataFrame.
dataset = rf.Dataset.from_csv("finance", "finance.csv")
dataset.to_pandas()

Unnamed: 0,customer,age,gender,merchant,category,amount,fraud,timestamp
0,C1093826151,4,M,M348934600,transportation,4.55,0,2023-01-01
1,C575345520,2,F,M348934600,transportation,76.67,0,2023-01-01
2,C1787537369,2,M,M1823072687,transportation,48.02,0,2023-01-01
3,C1732307957,5,F,M348934600,transportation,55.06,0,2023-01-01
4,C842799656,1,F,M348934600,transportation,25.62,0,2023-01-01
...,...,...,...,...,...,...,...,...
49995,C1971105040,3,M,M348934600,transportation,67.91,0,2023-01-20
49996,C51444479,3,M,M348934600,transportation,32.27,0,2023-01-20
49997,C1096642744,5,M,M1535107174,wellnessandbeauty,149.70,0,2023-01-20
49998,C1166683343,2,F,M1823072687,transportation,24.78,0,2023-01-20


In [6]:
# Column that carries the event time of each record.
timestamp_field = ra.TrainTimeGAN.TimestampConfig(field="timestamp")

# Metadata columns; "customer" is tagged with type "session".
metadata_fields = [
    ra.TrainTimeGAN.FieldConfig(field="customer", type="session"),
    ra.TrainTimeGAN.FieldConfig(field="age", type="categorical"),
    ra.TrainTimeGAN.FieldConfig(field="gender", type="categorical"),
]

# Measurement columns; "amount" is given no explicit type, so the library default applies.
measurement_fields = [
    ra.TrainTimeGAN.FieldConfig(field="merchant", type="categorical"),
    ra.TrainTimeGAN.FieldConfig(field="category", type="categorical"),
    ra.TrainTimeGAN.FieldConfig(field="amount"),
    ra.TrainTimeGAN.FieldConfig(field="fraud", type="categorical"),
]

# Describes how the dataset's columns are encoded for the model.
encoder_config = ra.TrainTimeGAN.DatasetConfig(
    timestamp=timestamp_field,
    metadata=metadata_fields,
    measurements=measurement_fields,
)

# Model / training hyperparameters.
doppelganger_config = ra.TrainTimeGAN.DGConfig(
    sample_len=19,
    epoch=10,
    epoch_checkpoint_freq=10,
    batch_size=64,
    sessions=3765,
)

config = ra.TrainTimeGAN.Config(
    encoder=encoder_config,
    doppelganger=doppelganger_config,
)

# Create the train action from the assembled config.
train = ra.TrainTimeGAN(config)

In [7]:
# Assemble the training workflow: the dataset is the input (parent) of the train action.
builder = rf.WorkflowBuilder()
builder.add_dataset(dataset)
builder.add_action(train, parents=[dataset])
# Start the workflow on the connection and print its id for later reference.
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: 3EocBynRhtK671WAkT8lOQ


In [8]:
async for progress in workflow.progress().notebook():
    pass

  0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
model = await workflow.models().last()
model

Model('9c5cb559-409c-11ef-bc7f-067b3cbfba25')

### Update the generated sessions
For example, set the number of sessions to generate by assigning a value to `config.doppelganger.sessions`.

In [13]:
# Reuse the training config for generation, overriding only the session count.
# Note: this mutates `config` in place.
config.doppelganger.sessions = 8000  # number of sessions to generate
generate = ra.GenerateTimeGAN(config)
save = ra.DatasetSave({"name": "synthetic"})  # save action configured with the name "synthetic"
# Workflow graph: model -> generate -> save.
builder = rf.WorkflowBuilder()
builder.add_model(model)
builder.add_action(generate, parents=[model])
builder.add_action(save, parents=[generate])
# Start the workflow on the connection and print its id for later reference.
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: 5ejVUpUXOwoPVH6RDJs77g


In [14]:
async for progress in workflow.progress().notebook():
    pass

  0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
syn = None
async for sds in workflow.datasets():
     syn = await sds.to_local(conn)
syn.to_pandas()

Unnamed: 0,timestamp,amount,age,gender,merchant,category,fraud,session_key
0,2023-01-03 09:07:21.654,200.137876,2,F,M348934600,transportation,0,0.0
1,2023-01-04 23:46:11.479,132.017201,2,F,M348934600,transportation,0,0.0
2,2023-01-06 08:23:43.047,139.231472,2,F,M348934600,transportation,0,0.0
3,2023-01-07 20:14:53.749,177.909514,2,F,M348934600,transportation,0,0.0
4,2023-01-09 02:27:56.955,170.312786,2,F,M348934600,transportation,0,0.0
...,...,...,...,...,...,...,...,...
113146,2036-01-27 12:30:48.514,535.556897,6,F,M348934600,transportation,0,7999.0
113147,2036-01-29 05:10:45.337,449.490122,6,F,M348934600,transportation,0,7999.0
113148,2036-01-31 01:40:25.449,631.612447,6,F,M348934600,transportation,0,7999.0
113149,2036-02-01 10:56:05.304,497.165011,6,F,M348934600,transportation,0,7999.0


### Generate large dataset
To generate a large dataset, we recommend using `SessionTarget`; see the [time-series data generation guide](https://docs142.rockfish.ai/data-gen.html#time-series-data) for details.

In [16]:
session_target = ra.SessionTarget(target=16000)  # providing the target "sessions" value
save = ra.DatasetSave(
    name="target_synthetic", concat_tables=True, concat_session_key="session_key"
)  
builder = rf.WorkflowBuilder()
builder.add_model(model)
builder.add_action(generate, parents=[model, session_target])
builder.add_action(session_target, parents=[generate])
builder.add_action(save, parents=[generate])
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: 5fjfLYaMAKiLRrfADR12Cu


In [17]:
async for log in workflow.logs():
    print(log)

2024-07-12T22:18:16Z generate-time-gan: INFO Downloading model with model_id='9c5cb559-409c-11ef-bc7f-067b3cbfba25'...
2024-07-12T22:18:18Z generate-time-gan: INFO Generating 8000 sessions...
2024-07-12T22:18:23Z session-target: INFO Grouping on: ['session_key']
2024-07-12T22:18:23Z dataset-save: INFO Saved dataset 'to0i03vMAnub3Ku3nlFrG' with 112290 rows
2024-07-12T22:18:24Z session-target: INFO new=8000 total=8000 needs=8000
2024-07-12T22:18:24Z generate-time-gan: INFO Downloading model with model_id='9c5cb559-409c-11ef-bc7f-067b3cbfba25'...
2024-07-12T22:18:26Z generate-time-gan: INFO Generating 8000 sessions...
2024-07-12T22:18:32Z session-target: INFO Grouping on: ['session_key']
2024-07-12T22:18:33Z session-target: INFO new=8000 total=16000 needs=0


In [18]:
syn_large = None
async for sds in workflow.datasets():
    syn_large = await sds.to_local(conn)
syn_large.to_pandas()

Unnamed: 0,timestamp,amount,age,gender,merchant,category,fraud,session_key
0,2023-01-05 12:36:50.923,184.080468,5,F,M1823072687,transportation,0,8000.0
1,2023-01-07 11:50:26.822,205.348190,5,F,M348934600,transportation,0,8000.0
2,2023-01-09 08:20:59.515,244.048895,5,F,M348934600,transportation,0,8000.0
3,2023-01-11 06:48:39.113,242.205529,5,F,M348934600,transportation,0,8000.0
4,2023-01-13 07:07:17.113,224.274713,5,F,M348934600,transportation,0,8000.0
...,...,...,...,...,...,...,...,...
225157,2023-01-12 10:51:55.484,126.156021,2,M,M348934600,transportation,0,7999.0
225158,2023-01-13 08:48:27.804,94.423101,2,M,M348934600,transportation,0,7999.0
225159,2023-01-14 04:59:21.891,125.890977,2,M,M348934600,transportation,0,7999.0
225160,2023-01-14 21:38:45.015,128.380141,2,M,M348934600,transportation,0,7999.0
