In [1]:
import rockfish as rf
import rockfish.actions as ra
from rockfish.labs.dataset_properties import DatasetPropertyExtractor
from rockfish.labs.steps import Recommender, ModelSelection
from rockfish.labs.recommender import ModelType

### Load Real Data

In [2]:
dataset = rf.Dataset.from_csv("finance", './finance.csv')

In [3]:
dataset.to_pandas()

Unnamed: 0,customer,email,age,gender,merchant,category,amount,fraud,timestamp
0,C100045114,nmontgomery@example.net,4,M,M348934600,transportation,35.13,0,2023-01-01 00:00:00
1,C100045114,nmontgomery@example.net,4,M,M348934600,transportation,27.63,0,2023-01-01 08:00:00
2,C100045114,nmontgomery@example.net,4,M,M348934600,transportation,13.46,0,2023-01-01 16:00:00
3,C100045114,nmontgomery@example.net,4,M,M348934600,transportation,28.86,0,2023-01-02 00:00:00
4,C100045114,nmontgomery@example.net,4,M,M151143676,barsandrestaurants,64.99,0,2023-01-02 08:00:00
...,...,...,...,...,...,...,...,...,...
33683,C343535530,castilloshannon@example.com,3,F,M1823072687,transportation,38.50,0,2023-01-23 16:00:00
33684,C343535530,castilloshannon@example.com,3,F,M1823072687,transportation,84.94,0,2023-01-24 00:00:00
33685,C343535530,castilloshannon@example.com,3,F,M1053599405,health,246.34,0,2023-01-24 08:00:00
33686,C343535530,castilloshannon@example.com,3,F,M1913465890,health,17.29,0,2023-01-24 16:00:00


### View Real Data in Dashboard

Link: [TBD]

### Goal: Create Synthetic Data with masked emails and more fraudulent transactions

#### Details

In [4]:
conn = rf.Connection.remote(
    'https://sunset-beach.rockfish.ai',
    'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpYXQiOjE3MTE2NDc0MDQsImlzcyI6ImFwaSIsIm5iZiI6MTcxMTY0NzQwNCwidG9rZW5faWQiOiIxd3pBUWliNjRVb0c2MWVUazQ4SzBMIiwidXNlcl9pZCI6IjQ2MVNUOXZ4a0hYekpYRnJKYm4yWm0ifQ.MxG4VB5IrXQ2U_2ePUaoEN7gfy2fqPhD5tzSYYhnn2k'
)

In [5]:
def get_rf_recommended_workflow(dataset, session_key, metadata_fields, privacy_requirements):
    dataset_properties = DatasetPropertyExtractor(dataset=dataset, session_key=session_key, metadata_fields=metadata_fields).extract()
    recommender_output = Recommender(dataset_properties=dataset_properties, steps=[ModelSelection(model_type=ModelType.TIME_GAN)]).run()
    train_action = recommender_output.actions[0]

    remap_actions = []
    for col_to_mask in privacy_requirements:
        remap = ra.Transform({"function": {"remap": ["fixed_mask", col_to_mask, {"mask_length": 8, "from_end": False}]}})

    train_wb = rf.WorkflowBuilder()
    train_wb.add_path(dataset, *remap_actions, train_action)
    return train_wb

In [12]:
async def get_story_data(model_id, n_sessions, story_requirements):
    dataset_properties = DatasetPropertyExtractor(dataset=dataset, session_key="customer", metadata_fields=["email", "age", "gender"]).extract()
    recommender_output = Recommender(dataset_properties=dataset_properties, steps=[ModelSelection(model_type=ModelType.TIME_GAN)]).run()
    generate = recommender_output.actions[1]

    model = rf.Model(model_id)

    post_amplify = ra.PostAmplify({
        "query_ast": {
            "eq": ["fraud", 1],
        },
        "drop_match_percentage": 0.0,
        "drop_other_percentage": 0.5,
    })
    session_target = ra.SessionTarget(target=n_sessions, max_cycles=100, use_match_count=True)
    save = ra.DatasetSave(name="synthetic", concat_session_key="session_key")
    
    builder = rf.WorkflowBuilder()
    builder.add_model(model)
    builder.add_action(generate, parents=[model, session_target])
    builder.add_action(post_amplify, parents=[generate])
    builder.add_action(session_target, parents=[post_amplify])
    builder.add_action(save, parents=[post_amplify])

    workflow = await builder.start(conn)
    
    syn_data = await workflow.datasets().concat(conn)
    return syn_data

#### Create Rockfish Model From Recommended Workflow

In [7]:
train_wb = get_rf_recommended_workflow(
    dataset, session_key="customer", metadata_fields=["email", "age", "gender"], privacy_requirements=["email"]
)
train_workflow = await train_wb.start(conn)

In [8]:
model_id = (await train_workflow.models().nth(0)).id

#### Generate Synthetic Data Using Rockfish Model

In [13]:
synthetic_dataset = await get_story_data(model_id, n_sessions=5000, story_requirements=["amplify_fraud"])

In [14]:
synthetic_dataset.to_pandas()

Unnamed: 0,timestamp,amount,email,age,gender,merchant,category,fraud,session_key
0,2023-01-23 23:44:32.255,286.637411,sarahmcdonald@example.com,3,F,M348934600,otherservices,1,1.0
1,2023-01-18 19:25:15.173,670.476548,adam24@example.com,4,F,M1823072687,travel,1,5.0
2,2023-01-25 17:03:03.849,294.708437,hmiller@example.com,4,M,M151143676,otherservices,1,31.0
3,2023-01-15 23:20:16.819,733.525577,jcrane@example.net,4,M,M1748431652,wellnessandbeauty,1,33.0
4,2023-01-23 11:01:04.758,323.843997,carl43@example.org,4,F,M348875670,leisure,1,38.0
...,...,...,...,...,...,...,...,...,...
1639,2023-01-27 11:54:58.719,223.292409,melissalam@example.com,2,F,M348934600,transportation,0,168.0
1640,2023-01-27 17:53:56.001,218.958819,melissalam@example.com,2,F,M348934600,transportation,0,168.0
1641,2023-01-27 23:40:22.985,209.473864,melissalam@example.com,2,F,M348934600,transportation,0,168.0
1642,2023-01-28 05:09:24.912,196.128776,melissalam@example.com,2,F,M348934600,transportation,0,168.0


In [15]:
synthetic_dataset.to_pandas().to_csv("finance_synthetic.csv", index=False)

### View Synthetic Data in Dashboard

Link: [TBD]