In [1]:
%%capture
%pip install -U 'rockfish[labs]' -f 'https://docs142.rockfish.ai/packages/index.html'

In [2]:
import rockfish as rf
import rockfish.actions as ra

Replace `YOUR_API_KEY` in the cell below with your assigned API key. Do not wrap the key in quotes.

For example, if the assigned API key is `abcd1234`, run the following:
```python
%env ROCKFISH_API_KEY=abcd1234
conn = rf.Connection.from_env()
```
If you do not have an API key, please reach out to support@rockfish.ai.

In [3]:
%env ROCKFISH_API_KEY=YOUR_API_KEY
conn = rf.Connection.from_env()

env: ROCKFISH_API_KEY=YOUR_API_KEY


In [4]:
# download our example of timeseries data: pcap.csv
!wget --no-clobber https://docs142.rockfish.ai/tutorials/pcap.csv

File ‘pcap.csv’ already there; not retrieving.



In [5]:
# Load the downloaded CSV into a Rockfish dataset.
# NOTE(review): the arguments look like (dataset name, CSV path) - confirm against the SDK.
dataset = rf.Dataset.from_csv(
    "DC pcap",
    "pcap.csv",
)
# Bare last expression: the notebook renders the dataset as a pandas DataFrame.
dataset.to_pandas()

Unnamed: 0,srcip,dstip,srcport,dstport,proto,timestamp,pkt_len
0,244.3.253.224,244.3.160.239,3396,80,6,2009-12-17 16:27:36.075494,40
1,41.177.26.91,68.157.168.194,80,65003,6,2009-12-17 16:27:36.075515,1500
2,41.177.26.91,68.157.168.194,80,65003,6,2009-12-17 16:27:36.075519,940
3,41.177.26.91,68.157.168.194,80,65003,6,2009-12-17 16:27:36.075553,1500
4,41.177.26.91,68.157.168.194,80,65003,6,2009-12-17 16:27:36.075603,1500
...,...,...,...,...,...,...,...
95,68.157.168.194,41.177.26.91,45615,80,6,2009-12-17 16:27:36.099423,60
96,41.177.26.91,68.157.168.194,80,45615,6,2009-12-17 16:27:36.099891,64
97,41.177.3.203,41.177.3.224,58381,1791,6,2009-12-17 16:27:36.100508,40
98,244.3.41.84,244.3.31.67,2626,1592,6,2009-12-17 16:27:36.105025,237


In [6]:
# Training configuration.
#   "encoder": names the timestamp column, the categorical metadata columns and
#              the continuous measurement column of pcap.csv.
#   "rtf":     model/training settings; every value here is minimal (1 epoch,
#              gpt2_config of 1/1/1) - presumably to keep the tutorial run short.
config = {
    "encoder": {
        "timestamp": {"field": "timestamp"},
        # All five flow-identifying columns are declared categorical.
        "metadata": [
            {"field": column, "type": "categorical"}
            for column in ("srcip", "dstip", "srcport", "dstport", "proto")
        ],
        "measurements": [{"field": "pkt_len", "type": "continuous"}],
    },
    "rtf": {
        "mode": "relational",
        "num_bootstrap": 2,
        "parent": {
            "epochs": 1,
            "transformer": {
                "gpt2_config": {"layer": 1, "head": 1, "embed": 1},
            },
        },
        "child": {
            "output_max_length": 2048,
            "epochs": 1,
        },
    },
}

# Training action built from the configuration above.
train = ra.TrainTransformer(config)

In [7]:
# Assemble the training workflow: the dataset is registered first, then the
# train action is added with the dataset as its parent.
builder = rf.WorkflowBuilder()
builder.add_dataset(dataset)
builder.add_action(train, parents=[dataset])
# Start the workflow over the connection (top-level `await` is valid in a notebook cell).
workflow = await builder.start(conn)
# Print the workflow ID so the run can be identified later.
print(f"Workflow: {workflow.id()}")

Workflow: 1QF99UYVOo1KQxL1q16Pzg


In [8]:
async for log in workflow.logs():
    print(log)

2024-07-12T22:39:49Z dataset-load: INFO Loading dataset '3mBHBdc1eqoujm0CMFsQKG' with 100 rows
2024-07-12T22:40:22Z train-transformer: INFO Start training...
2024-07-12T22:40:36Z train-transformer: INFO Epoch 1 completed.
2024-07-12T22:41:16Z train-transformer: INFO Training completed. The Model ID is d7d37600-409f-11ef-8c4e-8a07ae1c625c


In [9]:
# Take the last model reported by the training workflow; the generation cells
# below pass it to `builder.add_model`.
model = await workflow.models().last()
# Bare last expression so the notebook displays the model's repr.
model

Model('d7d37600-409f-11ef-8c4e-8a07ae1c625c')

### Set the number of generated sessions

In [13]:
config["rtf"].update({"sessions": 200})
generate = ra.GenerateTransformer(config)
save = ra.DatasetSave({"name": "SyntheticData_large"})
builder = rf.WorkflowBuilder()
builder.add_model(model)
builder.add_action(generate, parents=[model])
builder.add_action(save, parents=[generate])
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: 47MnGPmwpbNcf2a92qHuas


In [14]:
async for log in workflow.logs():
    print(log)

2024-07-12T22:41:17Z generate-transformer: INFO Starting download of Model d7d37600-409f-11ef-8c4e-8a07ae1c625c
2024-07-12T22:41:32Z generate-transformer: INFO Finished download of Model d7d37600-409f-11ef-8c4e-8a07ae1c625c
2024-07-12T22:41:32Z generate-transformer: INFO Start generating samples...
2024-07-12T22:41:39Z generate-transformer: INFO Finish generating samples...
2024-07-12T22:41:39Z dataset-save: INFO Saved dataset '4goFD7mQAfaIrwxU5x1GJt' with 1406 rows


In [15]:
syn = None
async for sds in workflow.datasets():
    syn = await sds.to_local(conn)
syn.to_pandas()

Unnamed: 0,pkt_len,timestamp,srcip,dstip,srcport,dstport,proto
0,20,2009-12-17 16:27:36,244.3.31.67,77.197.128.127,380,60663,6
1,140,2009-12-17 16:27:36,244.3.31.67,77.197.128.127,380,60663,6
2,1440,2009-12-17 16:27:36,244.3.31.67,77.197.128.127,380,60663,6
3,1120,2009-12-17 16:27:36,244.3.31.67,77.197.128.127,380,60663,6
4,1440,2009-12-17 16:27:36,244.3.31.67,77.197.128.127,380,60663,6
...,...,...,...,...,...,...,...
1401,1270,2009-12-17 16:27:36,244.3.160.80,244.3.160.80,6024,488,6
1402,640,2009-12-17 16:27:36,244.3.160.80,244.3.160.80,6024,488,6
1403,1110,2009-12-17 16:27:36,244.3.160.80,244.3.160.80,6024,488,6
1404,1153,2009-12-17 16:27:36,244.3.160.80,244.3.160.80,6024,488,6


### Generate large dataset
We recommend using `SessionTarget` to generate large datasets; see the [time-series data generation guide](https://docs142.rockfish.ai/data-gen.html#time-series-data) for details.

In [16]:
'''If wanting to concat by `concat_session_key`, 
update the config to include the sessions_key
'''
config["rtf"].update({"sessions_flag": True})
generate = ra.GenerateTransformer(config)

session_target = ra.SessionTarget(target=1000)  # providing the target "sessions" value
save = ra.DatasetSave(
    name="target_synthetic", concat_tables=True, concat_session_key="session_key"
)
builder = rf.WorkflowBuilder()
builder.add_model(model)
builder.add_action(generate, parents=[model, session_target])
builder.add_action(session_target, parents=[generate])
builder.add_action(save, parents=[generate])
workflow = await builder.start(conn)
print(f"Workflow: {workflow.id()}")

Workflow: 1RgkATjK2rJ1Z2HOnBoGqY


In [17]:
async for log in workflow.logs():
    print(log)

2024-07-12T22:41:40Z generate-transformer: INFO Starting download of Model d7d37600-409f-11ef-8c4e-8a07ae1c625c
2024-07-12T22:41:56Z generate-transformer: INFO Finished download of Model d7d37600-409f-11ef-8c4e-8a07ae1c625c
2024-07-12T22:41:56Z generate-transformer: INFO Start generating samples...
2024-07-12T22:42:03Z generate-transformer: INFO Finish generating samples...
2024-07-12T22:42:03Z generate-transformer: INFO Starting download of Model d7d37600-409f-11ef-8c4e-8a07ae1c625c
2024-07-12T22:42:03Z session-target: INFO Grouping on: ['srcip', 'dstip', 'srcport', 'dstport', 'proto']
2024-07-12T22:42:03Z dataset-save: INFO Saved dataset '78nPu7Sm84uTl9ZJu2QSSl' with 1406 rows
2024-07-12T22:42:03Z session-target: INFO new=183 total=183 needs=11817
2024-07-12T22:42:22Z generate-transformer: INFO Finished download of Model d7d37600-409f-11ef-8c4e-8a07ae1c625c
2024-07-12T22:42:22Z generate-transformer: INFO Start generating samples...
2024-07-12T22:42:29Z generate-transformer: INFO Fini

In [None]:
syn_large = None
async for sds in workflow.datasets():
    syn_large = await sds.to_local(conn)
syn_large.to_pandas()