In [12]:
from idlelib.window import add_windows_to_menu

import pandas as pd
from requests.packages import target
from rockfish import LogLevel


def sessionise_data(df, start_session =1):
    """Split a timestamped frame into overlapping sliding-window sessions.

    Rows are ordered by ``timestamp`` and a window of ``window_size`` hours is
    slid forward ``step_size`` hours at a time (24 h and 1 h by default). Every
    row whose timestamp lies in ``[window_start, window_start + window)`` is
    copied into that window's session, so a row usually appears in several
    sessions. Only windows that end at or before the latest timestamp are
    emitted.

    Args:
        df: DataFrame with a ``timestamp`` column that ``pd.to_datetime`` can
            parse. The caller's frame is left unmodified.
        start_session: ID given to the first session; later sessions count up
            from it.

    Returns:
        A new DataFrame holding the input columns plus ``sessionID``, with the
        sessions concatenated in window order. It is empty (but still has a
        ``sessionID`` column) when the data spans less than one full window.
    """
    # assign() builds a new frame, so the caller's 'timestamp' column keeps its dtype.
    df = df.assign(timestamp=pd.to_datetime(df['timestamp']))
    df = df.sort_values(by='timestamp').reset_index(drop=True)

    def generate_sessions(df, window_size=24, step_size=1):
        # Loop-invariant durations are built once instead of on every iteration.
        window = pd.Timedelta(hours=window_size)
        step = pd.Timedelta(hours=step_size)
        end_time = df['timestamp'].max()

        sessions = []
        current_time = df['timestamp'].min()
        session_id = start_session

        while current_time + window <= end_time:
            in_window = (df['timestamp'] >= current_time) & (df['timestamp'] < current_time + window)
            # assign() copies the slice, so tagging it never touches `df`.
            sessions.append(df[in_window].assign(sessionID=session_id))
            current_time += step
            session_id += 1

        if not sessions:
            # pd.concat raises ValueError on an empty list; return an empty,
            # correctly shaped frame when no full window fits in the data.
            return df.iloc[0:0].assign(sessionID=pd.Series(dtype='int64'))

        return pd.concat(sessions, ignore_index=True)

    return generate_sessions(df)

In [41]:
import pickle
import rockfish as rf

actions = pickle.load(open('runtime_conf.pkl', 'rb'))
conn = rf.Connection.from_config()

builder = rf.WorkflowBuilder()
builder.add_path(rf.Dataset.from_csv('demo_dt', 'location3.csv'), actions.actions['train-time-gan'])
workflow = await builder.start(conn)
workflow.id()

'26QpsZQbnNAyHrGXcfEhpz'

In [42]:
# Iterate over the workflow's log entries (DEBUG level requested) and print each one.
async for log in workflow.logs(level=rf.LogLevel.DEBUG):
    print(log)

2024-10-01T18:53:12Z dataset-load: INFO Loading dataset '2x4giarFuALd16x6dPop1D' with 12959 rows
2024-10-01T18:53:12Z train-time-gan: DEBUG CUDA available
2024-10-01T18:53:12Z train-time-gan: DEBUG Training on cuda:NVIDIA A100 80GB PCIe with cuda version 11.8
2024-10-01T18:53:13Z train-time-gan: INFO Starting DG training job
2024-10-01T18:53:14Z train-time-gan: DEBUG Epoch 1, g_loss_d:  0.3404, g_loss:  0.5948, d_loss_fake: -0.2351, d_loss_real: -2.8589, d_loss_gp:  0.1591, d_loss: -1.5027, g_loss_attr_d:  0.2544, attr_d_loss_fake: -0.1961, attr_d_loss_real:  0.0903, attr_d_loss_gp:  0.4961, attr_d_loss:  4.8551
2024-10-01T18:53:14Z train-time-gan: INFO Epoch 1 completed.
2024-10-01T18:53:15Z train-time-gan: DEBUG Epoch 2, g_loss_d:  1.0907, g_loss:  1.7502, d_loss_fake: -0.9312, d_loss_real: -9.0523, d_loss_gp:  0.0025, d_loss: -9.9582, g_loss_attr_d:  0.6595, attr_d_loss_fake: -0.5610, attr_d_loss_real:  0.2227, attr_d_loss_gp:  0.2557, attr_d_loss:  2.2187
2024-10-01T18:53:15Z train

In [43]:
import rockfish as rf
conn = rf.Connection.from_config()
# Take the last entry yielded by list_models().
# NOTE(review): presumably this is the model produced by the training workflow;
# confirm by checking the 'workflow_id' label on the displayed model.
model = await conn.list_models().last()
# Last expression of the cell: display the selected model.
model

Model(id='78489743-8027-11ef-9e39-ce6cd7fc8a2f', labels={'workflow_id': '26QpsZQbnNAyHrGXcfEhpz'}, create_time=datetime.datetime(2024, 10, 1, 19, 0, 48, tzinfo=datetime.timezone.utc), size_bytes=773852160)

In [49]:
# Build a generation workflow: model -> GenerateTimeGAN -> DatasetSave('synthetic_data').
builder = rf.WorkflowBuilder()
import rockfish.actions as ra
# NOTE(review): conn_local is not used by any visible cell; the workflow below
# is started on `conn`. Confirm whether a local run was intended.
conn_local = rf.Connection.local()
gen = ra.GenerateTimeGAN()
# Ask the generator for 300 sessions.
gen.config().doppelganger.sessions = 300
builder.add_path(model, gen, ra.DatasetSave(name='synthetic_data'))
workflow = await builder.start(conn)
# Last expression of the cell: display the workflow ID.
workflow.id()

'5TXllf0hCRMUz9YSNPUhgO'

In [50]:
# Iterate over the generation workflow's log entries (DEBUG level requested) and print each one.
async for log in workflow.logs(level=rf.LogLevel.DEBUG):
    print(log)

2024-10-01T19:04:24Z generate-time-gan: DEBUG CUDA available
2024-10-01T19:04:24Z generate-time-gan: DEBUG Running on cuda:NVIDIA A100 80GB PCIe with cuda version 11.8
2024-10-01T19:04:24Z generate-time-gan: INFO Downloading model with model_id='78489743-8027-11ef-9e39-ce6cd7fc8a2f'...
2024-10-01T19:05:07Z generate-time-gan: INFO Generating 300 sessions...
2024-10-01T19:05:07Z generate-time-gan: DEBUG Generating data from model trained with 400 epochs
2024-10-01T19:05:08Z generate-time-gan: DEBUG DG generating 3 batches
2024-10-01T19:05:08Z dataset-save: INFO using field 'session_key' to concatenate tables
2024-10-01T19:05:08Z dataset-save: INFO Saved dataset '7EyXBb2oo0SWany8nIj7q0' with 12220 rows


In [51]:
# Take the last dataset listed for the workflow and convert it with to_local(conn).
syn = await (await workflow.datasets().last()).to_local(conn)

In [52]:
# Display the synthetic dataset as a pandas DataFrame (the notebook renders a truncated preview).
syn.to_pandas()

Unnamed: 0,timestamp,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,session_key
0,2023-08-11 17:45:29.858,0.743775,0.739928,0.553015,0.504247,0.434376,0.377533,0.451454,0.501793,0.406757,...,0.391676,0.571217,0.378163,0.585993,0.028138,0.012182,0.210496,0.146738,0.176767,0.0
1,2023-08-11 17:46:17.853,0.748679,0.771635,0.546615,0.513607,0.473839,0.436898,0.464028,0.522947,0.403598,...,0.450299,0.581647,0.400623,0.634794,0.030192,0.015059,0.216867,0.169075,0.173937,0.0
2,2023-08-11 17:46:59.447,0.753074,0.745576,0.542101,0.512637,0.428325,0.372761,0.481765,0.472585,0.403362,...,0.404311,0.572979,0.390519,0.620915,0.020689,0.007702,0.203235,0.149445,0.169989,0.0
3,2023-08-11 17:47:46.101,0.728998,0.728102,0.552418,0.512928,0.449804,0.393011,0.470144,0.503008,0.425315,...,0.427691,0.570616,0.373299,0.598019,0.020219,0.010133,0.213074,0.149371,0.172481,0.0
4,2023-08-11 17:48:24.155,0.760513,0.732318,0.547497,0.510298,0.438494,0.387712,0.443159,0.521591,0.422987,...,0.452018,0.575668,0.381024,0.626848,0.020990,0.009654,0.207390,0.149048,0.168949,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12215,2023-08-12 07:35:50.380,0.788435,0.744413,0.568869,0.536014,0.392638,0.405175,0.448745,0.418443,0.378811,...,0.399989,0.572596,0.321561,0.592512,0.022526,0.008747,0.199671,0.130119,0.157239,299.0
12216,2023-08-12 07:36:26.989,0.800033,0.747428,0.571090,0.548128,0.402471,0.438019,0.437782,0.388827,0.381272,...,0.391165,0.568946,0.310666,0.588857,0.021378,0.009178,0.197656,0.132575,0.153178,299.0
12217,2023-08-12 07:37:03.588,0.781050,0.740440,0.571230,0.538363,0.393750,0.413550,0.465575,0.403186,0.386472,...,0.396799,0.577317,0.309316,0.597653,0.021600,0.008427,0.197181,0.143216,0.148328,299.0
12218,2023-08-12 07:37:38.134,0.791409,0.746348,0.576895,0.529996,0.397531,0.425977,0.445000,0.407673,0.386990,...,0.405383,0.589165,0.313388,0.590919,0.026496,0.014135,0.208790,0.139149,0.150681,299.0


In [53]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# 'test/' exists before writing (no-op when it is already there).
os.makedirs('test', exist_ok=True)
# Write the synthetic data without the DataFrame index column.
syn.to_pandas().to_csv('test/syn_data.csv', index=False)

In [31]:
import pandas as pd
# Load location1.csv from the working directory.
l1 = pd.read_csv('location1.csv')

In [34]:
# Drop rows whose 'timestamp' repeats an earlier row (drop_duplicates keeps the first by default).
l1=l1.drop_duplicates(subset=['timestamp'])

In [35]:
# Load location2.csv and keep a single row per distinct timestamp.
l2 = pd.read_csv('location2.csv').drop_duplicates(subset=['timestamp'])

In [36]:
# Load location3.csv and keep a single row per distinct timestamp.
l3 = pd.read_csv('location3.csv').drop_duplicates(subset=['timestamp'])

In [37]:
# Load syn_data.csv from the working directory and keep a single row per distinct timestamp.
# NOTE(review): `syn` is a plain DataFrame here, while the cell that calls
# syn.to_pandas() binds the same name to a dataset object; the result depends
# on cell execution order.
# NOTE(review): the visible cells only write 'test/syn_data.csv'; confirm where
# the 'syn_data.csv' read here comes from.
syn = pd.read_csv('syn_data.csv').drop_duplicates(subset=['timestamp'])

In [40]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# 'test/' exists before writing (no-op when it is already there).
os.makedirs('test', exist_ok=True)

# Write the de-duplicated frames without their index columns.
l1.to_csv('test/location1.csv', index=False)
l2.to_csv('test/location2.csv', index=False)
l3.to_csv('test/location3.csv', index=False)
syn.to_csv('test/syn_data.csv', index=False)