In [1]:
%%capture
# Install or upgrade the Rockfish SDK with its "labs" extra; -f points pip at the
# Rockfish package index page. %%capture must stay the first line of the cell and
# keeps pip's output out of the notebook.
# NOTE(review): the version is not pinned, so re-running later may install a newer SDK.
%pip install -U 'rockfish[labs]' -f 'https://docs142.rockfish.ai/packages/index.html'

In [2]:
import rockfish as rf
import rockfish.actions as ra

Please replace `YOUR_API_KEY` with the assigned API key string. Note that it should be without quotes.

For example, if the assigned API key is `abcd1234`, you can do the following:
```python
%env ROCKFISH_API_KEY=abcd1234
conn = rf.Connection.from_env()
```
If you do not have an API key, please reach out to support@rockfish.ai.

In [3]:
# Store the API key in the ROCKFISH_API_KEY environment variable.
# Keep comments off the %env line itself: everything after "=" is taken as the value.
# %env echoes the value into the cell output, so clear that output before sharing the notebook.
%env ROCKFISH_API_KEY=YOUR_API_KEY
# Build the connection from the environment (presumably reads ROCKFISH_API_KEY set above — confirm in the SDK docs).
conn = rf.Connection.from_env()

env: ROCKFISH_API_KEY=YOUR_API_KEY


In [4]:
# download our example of timeseries data: pcap.csv
# --no-clobber skips the download when pcap.csv already exists, so re-running the cell is harmless
!wget --no-clobber https://docs142.rockfish.ai/tutorials/pcap.csv

File ‘pcap.csv’ already there; not retrieving.



In [5]:
# Load the downloaded pcap.csv into a Rockfish dataset
# (the first argument, "DC pcap", is presumably the dataset's name — confirm in the SDK docs).
dataset = rf.Dataset.from_csv("DC pcap", "pcap.csv")
# Last expression of the cell: show the dataset as a pandas DataFrame.
dataset.to_pandas()

Unnamed: 0,srcip,dstip,srcport,dstport,proto,timestamp,pkt_len
0,244.3.253.224,244.3.160.239,3396,80,6,2009-12-17 16:27:36.075494,40
1,41.177.26.91,68.157.168.194,80,65003,6,2009-12-17 16:27:36.075515,1500
2,41.177.26.91,68.157.168.194,80,65003,6,2009-12-17 16:27:36.075519,940
3,41.177.26.91,68.157.168.194,80,65003,6,2009-12-17 16:27:36.075553,1500
4,41.177.26.91,68.157.168.194,80,65003,6,2009-12-17 16:27:36.075603,1500
...,...,...,...,...,...,...,...
95,68.157.168.194,41.177.26.91,45615,80,6,2009-12-17 16:27:36.099423,60
96,41.177.26.91,68.157.168.194,80,45615,6,2009-12-17 16:27:36.099891,64
97,41.177.3.203,41.177.3.224,58381,1791,6,2009-12-17 16:27:36.100508,40
98,244.3.41.84,244.3.31.67,2626,1592,6,2009-12-17 16:27:36.105025,237


### Word2vec in config
Define the fields "srcip", "dstip", "srcport", "dstport" and "proto" to be used in the word2vec embedding.

In [6]:
# Every metadata column is categorical; each one is paired with its semantic type.
metadata_semantics = [
    ("srcip", "ip"),
    ("dstip", "ip"),
    ("srcport", "port"),
    ("dstport", "port"),
    ("proto", "proto"),
]
metadata_fields = [
    ra.TrainTimeGAN.FieldConfig(
        field=column, type="categorical", semantic_type=semantic
    )
    for column, semantic in metadata_semantics
]

# The word2vec embedding is applied to the five metadata fields listed here.
word2vec_embedding = ra.TrainTimeGAN.EmbeddingConfig(
    type="word2vec",
    fields=["srcip", "srcport", "dstip", "dstport", "proto"],
)

# Encoder: timestamp column, metadata columns, embedding, and the single
# measurement column "pkt_len".
encoder_config = ra.TrainTimeGAN.DatasetConfig(
    timestamp=ra.TrainTimeGAN.TimestampConfig(field="timestamp"),
    metadata=metadata_fields,
    embedding=word2vec_embedding,
    measurements=[ra.TrainTimeGAN.FieldConfig(field="pkt_len")],
)

# DoppelGANger settings (values unchanged from the tutorial).
doppelganger_config = ra.TrainTimeGAN.DGConfig(
    sample_len=2,
    epoch=10,
    epoch_checkpoint_freq=10,
    batch_size=2,
    sessions=100,
)

config = ra.TrainTimeGAN.Config(
    encoder=encoder_config,
    doppelganger=doppelganger_config,
)

# create train action
train = ra.TrainTimeGAN(config)

In [7]:
# Assemble the training workflow: the dataset is registered first and then
# declared as the parent (input) of the train action.
builder = rf.WorkflowBuilder()
builder.add_dataset(dataset)
builder.add_action(train, parents=[dataset])
# start() is awaited at top level, so this cell relies on the notebook's top-level await support.
workflow = await builder.start(conn)

# Print the workflow id for reference.
print(f"Workflow: {workflow.id()}")

Workflow: 2Erma8uxlzsgVsQz2hV8mo


In [8]:
# Drain the training workflow's progress updates. .notebook() presumably renders them
# as the progress bar seen in the recorded output, so the loop body is intentionally empty.
async for progress in workflow.progress().notebook():
    pass

  0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
# Take the last model reported by the training workflow; the generation workflow uses it as input.
model = await workflow.models().last()
# Bare expression so the notebook displays the model's repr.
model

Model('a7472bef-40ad-11ef-8c4e-8a07ae1c625c')

In [10]:
# The generate action is built from the same config object that was used for training.
generate = ra.GenerateTimeGAN(config)
# Save action configured with the name "synthetic".
save = ra.DatasetSave({"name": "synthetic"})
# Chain the steps: model -> generate -> save.
# Note: `builder` and `workflow` are rebound here, so from this cell on
# `workflow` refers to the generation workflow, not the training one.
builder = rf.WorkflowBuilder()
builder.add_model(model)
builder.add_action(generate, parents=[model])
builder.add_action(save, parents=[generate])
workflow = await builder.start(conn)
# Print the workflow id for reference.
print(f"Workflow: {workflow.id()}")

Workflow: 3azxfHjcJGahjXyUknZfAf


In [11]:
# Drain the current workflow's progress updates. .notebook() presumably renders them
# as the progress bar seen in the recorded output, so the loop body is intentionally empty.
async for progress in workflow.progress().notebook():
    pass

  0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
syn = None
async for sds in workflow.datasets():
    syn = await sds.to_local(conn)
syn.to_pandas()

Unnamed: 0,timestamp,pkt_len,session_key,srcip,srcport,dstip,dstport,proto
0,2009-12-17 16:27:23.464,1602,0.0,244.3.153.25,23,244.3.31.67,1791,6
1,2009-12-17 16:27:23.464,1429,0.0,244.3.153.25,1592,244.3.31.67,1791,6
2,2009-12-17 16:27:23.464,1540,0.0,244.3.153.25,23,244.3.31.67,1791,6
3,2009-12-17 16:27:23.464,1697,1.0,244.3.153.25,23,244.3.31.67,50323,17
4,2009-12-17 16:27:23.464,1670,1.0,244.3.153.25,45615,244.3.31.67,1592,17
...,...,...,...,...,...,...,...,...
231,2009-12-17 16:27:23.465,1726,98.0,244.3.153.25,8166,244.3.160.80,1791,6
232,2009-12-17 16:27:23.465,1648,98.0,244.3.153.25,8166,244.3.160.80,1592,6
233,2009-12-17 16:27:23.465,1670,98.0,244.3.153.25,8166,244.3.160.80,1791,6
234,2009-12-17 16:27:23.465,1695,98.0,244.3.153.25,8166,244.3.160.80,1791,6
