In [12]:
import pandas as pd
from requests.packages import target
from rockfish import LogLevel


def sessionise_data(df, start_session=1, window_size=24, step_size=1):
    """Cut a time-stamped frame into overlapping sliding-window sessions.

    A window of ``window_size`` hours is slid across the data in increments of
    ``step_size`` hours, starting at the earliest timestamp. Every row whose
    timestamp falls inside the half-open interval ``[start, start + window)``
    is copied into that window's session, so a row appears in several sessions
    whenever ``step_size < window_size``. Only windows that fit entirely
    before the latest timestamp are emitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data with a ``timestamp`` column parseable by ``pd.to_datetime``.
        The caller's frame is left untouched.
    start_session : int, default 1
        ID given to the first session; later sessions count up from it.
    window_size : int or float, default 24
        Window length in hours. Must be positive.
    step_size : int or float, default 1
        Offset between consecutive window starts, in hours. Must be positive.

    Returns
    -------
    pandas.DataFrame
        All session slices concatenated with a fresh integer index and an
        added ``sessionID`` column. If no full window fits (including empty
        input), an empty frame with the same columns plus ``sessionID``.

    Raises
    ------
    ValueError
        If ``window_size`` or ``step_size`` is not positive.
    KeyError
        If ``df`` has no ``timestamp`` column.
    """
    if window_size <= 0:
        raise ValueError("window_size must be positive")
    if step_size <= 0:
        # A non-positive step would never advance the window (infinite loop).
        raise ValueError("step_size must be positive")

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    df = df.assign(timestamp=pd.to_datetime(df['timestamp']))
    df = df.sort_values(by='timestamp').reset_index(drop=True)

    # Loop-invariant values, computed once.
    window = pd.Timedelta(hours=window_size)
    step = pd.Timedelta(hours=step_size)
    timestamps = df['timestamp']
    end_time = timestamps.max()

    sessions = []
    current_time = timestamps.min()
    session_id = start_session

    # For empty input min()/max() are NaT, the comparison is False and the
    # loop body never runs.
    while current_time + window <= end_time:
        in_window = (timestamps >= current_time) & (timestamps < current_time + window)
        sessions.append(df[in_window].assign(sessionID=session_id))

        current_time += step
        session_id += 1

    if not sessions:
        # pd.concat([]) raises ValueError; return a well-formed empty frame instead.
        return df.head(0).assign(sessionID=pd.array([], dtype='int64'))

    return pd.concat(sessions, ignore_index=True)

In [20]:
# Load the raw location3 data; the next cells sessionise it and write it back.
df = pd.read_csv('location3.csv')

In [21]:
# Rebind df to its sessionised version (adds the sessionID column).
# NOTE(review): because df is overwritten, running this cell twice sessionises
# the already-sessionised frame.
df = sessionise_data(df)

In [22]:
# NOTE(review): this overwrites the same 'location3.csv' that was read above, so the
# raw file is replaced by the sessionised one and re-running the load/sessionise
# cells is no longer idempotent. Consider writing to a separate file name.
df.to_csv('location3.csv',index=False)

In [23]:
# Inspect the column labels; the captured output shows feature_0..feature_24,
# timestamp and the new sessionID column.
df.columns

Index(['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14',
       'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19',
       'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24',
       'timestamp', 'sessionID'],
      dtype='object')

In [24]:
# Read location1.csv in full, then keep only its first 10,000 rows.
df = pd.read_csv('location1.csv').iloc[:10000]

In [25]:
# Write the current df (in cell order: the 10,000-row slice) without the index column.
df.to_csv('onboarding_sample.csv', index=False)

In [26]:
import pickle
# Load the pickled runtime configuration.
# NOTE: pickle.load can execute arbitrary code; only unpickle files you trust.
# The context manager closes the file handle as soon as loading finishes
# (the bare open(...) it replaces was never closed).
with open('runtime_conf.pkl', 'rb') as conf_file:
    q = pickle.load(conf_file)

In [37]:
from actions.dg.train import TrainTimeGAN
# Rebuild a TrainTimeGAN action from the config of the pickled 'train-time-gan' action.
# NOTE(review): no `import rockfish as rf` is visible before this cell (it first
# appears in a later cell), so this likely fails with NameError on a fresh kernel.
r = TrainTimeGAN(rf.converter.unstructure(q.actions['train-time-gan'].config()))

In [38]:
# Display the rebuilt training action.
r

<actions.dg.train.TrainTimeGAN at 0x38f1cec10>

In [39]:
# Display the unpickled object; the captured output shows a rockfish WorkflowBuilder.
q

<rockfish.workflow.WorkflowBuilder at 0x349d77f50>

In [41]:
# Swap the stored 'train-time-gan' action for the rebuilt TrainTimeGAN instance.
q.actions['train-time-gan'] = r

In [43]:
# Confirm the replacement: 'train-time-gan' should now map to the TrainTimeGAN object.
q.actions

{'datastream-load': <rockfish.actions.datastream.DatastreamLoad at 0x33ee895d0>,
 'train-time-gan': <actions.dg.train.TrainTimeGAN at 0x38f1cec10>}

In [44]:
# Persist the modified builder back to disk. Writing through a context manager
# guarantees the file is flushed and closed; the bare open(..., 'wb') it replaces
# left the handle open, so the pickle could be incomplete until garbage collection.
with open('runtime_conf.pkl', 'wb') as conf_file:
    pickle.dump(q, conf_file)

In [45]:
# Display the training action again (same object as before, per the captured address).
r

<actions.dg.train.TrainTimeGAN at 0x38f1cec10>

In [53]:
from rockfish.events import LogLevel
# Build a workflow: dataset loaded from location1.csv -> training action `r`.
# The first from_csv argument ('he') is presumably the dataset name — TODO confirm.
builder = rf.WorkflowBuilder()
builder.add_path(rf.Dataset.from_csv('he', 'location1.csv'), r)
# Start the workflow on a local connection and stream its logs at DEBUG level.
workflow = await builder.start(rf.Connection.local())
# NOTE(review): the captured output ends in CancelledError, i.e. this log loop was
# interrupted before the workflow finished.
async for log in workflow.logs(level=LogLevel.DEBUG):
    print(log)

2024-09-26T20:18:55.357578Z dataset-load: INFO Loading dataset 'Y07byMzJiv0Z4VugaHeRL' with 1002240 rows
2024-09-26T20:18:55.389857Z train-time-gan: DEBUG Training on mps:1
2024-09-26T20:19:00.588911Z train-time-gan: INFO Starting DG training job
2024-09-26T20:19:13.548179Z train-time-gan: DEBUG Epoch 1, g_loss_d: -3.8544, g_loss: -2.7139, d_loss_fake:  3.9665, d_loss_real: -1292.2047, d_loss_gp:  59.7385, d_loss: -690.8535, g_loss_attr_d:  1.1405, attr_d_loss_fake: -1.0972, attr_d_loss_real:  0.3274, attr_d_loss_gp:  0.0143, attr_d_loss: -0.6272
2024-09-26T20:19:13.551315Z train-time-gan: INFO Epoch 1 completed.
2024-09-26T20:19:24.267152Z train-time-gan: DEBUG Epoch 2, g_loss_d: -3.7732, g_loss: -1.1264, d_loss_fake:  3.7545, d_loss_real: -1175.9513, d_loss_gp:  51.4496, d_loss: -657.7007, g_loss_attr_d:  2.6468, attr_d_loss_fake: -2.6076, attr_d_loss_real:  1.3635, attr_d_loss_gp:  0.0038, attr_d_loss: -1.2058
2024-09-26T20:19:24.269799Z train-time-gan: INFO Epoch 2 completed.
2024-

CancelledError: 

In [51]:
# Stop the workflow started above.
await workflow.stop()

In [56]:
# Load the full location1 data set.
df = pd.read_csv('location1.csv')

In [48]:
# Replace every missing value with 0. Reassigning instead of using inplace=True
# keeps the cell chainable and follows current pandas guidance.
# NOTE(review): the execution counts show this cell (48) ran before the read_csv
# cell above it (56); re-check the order under Restart & Run All.
df = df.fillna(0)

In [49]:
# NOTE(review): overwrites the source file location1.csv with the current df.
df.to_csv('location1.csv', index=False)

In [65]:
# Create a connection via Connection.from_config() — presumably built from a
# local rockfish configuration; verify against the rockfish docs.
conn = rf.Connection.from_config()

In [67]:
(await conn.get_workflow('7fiQoVvb9HoDjaCATk10Nb')).stop()

<coroutine object Workflow.stop at 0x14f3e7140>

In [57]:
# Highest session ID in the frame (696 in the captured output).
df.sessionID.max()

696

In [58]:
# Row positions belonging to session 100; the captured output shows 1440 rows.
df[df.sessionID == 100].index

Index([142560, 142561, 142562, 142563, 142564, 142565, 142566, 142567, 142568,
       142569,
       ...
       143990, 143991, 143992, 143993, 143994, 143995, 143996, 143997, 143998,
       143999],
      dtype='int64', length=1440)

In [59]:
# Total number of rows in the frame.
len(df)

1002240

In [64]:
# Save the first 1,000 rows as a small sample file, without the index column.
df.iloc[:1000].to_csv('locationx.csv', index=False)

In [67]:
import pickle
import rockfish as rf
from actions.dg.train import TrainTimeGAN
# Load the pickled runtime configuration.
# NOTE: pickle.load can execute arbitrary code; only unpickle files you trust.
# The context manager closes the file handle once loading is done.
with open('runtime_conf.pkl', 'rb') as conf_file:
    rt = pickle.load(conf_file)

In [68]:
# Use a local connection for the training and generation workflows below.
conn = rf.Connection.local()

In [69]:
# Take the config of the stored 'train-time-gan' action, unstructure it and wrap
# it in a fresh TrainTimeGAN action, in one expression so `train` is only ever
# bound to the action itself.
train = TrainTimeGAN(
    rf.converter.unstructure(
        rt.actions['train-time-gan'].config()
    )
)

In [70]:
# Build and start a workflow: dataset loaded from location3.csv -> `train` action.
# The first from_csv argument ('s') is presumably the dataset name — TODO confirm.
builder = rf.WorkflowBuilder()
builder.add_path(rf.Dataset.from_csv('s', 'location3.csv'), train)
workflow = await builder.start(conn)

In [71]:
# Stream the workflow's log records at DEBUG level and print each one as it arrives.
async for log in workflow.logs(level = rf.LogLevel.DEBUG):
    print(log)

2024-09-27T18:16:11.662390Z dataset-load: INFO Loading dataset '1evC1ffDIoIyWIbdp2TfiB' with 1038240 rows
2024-09-27T18:16:11.692686Z train-time-gan: DEBUG Training on mps:1
2024-09-27T18:16:17.029891Z train-time-gan: INFO Starting DG training job
2024-09-27T18:16:29.097754Z train-time-gan: DEBUG Epoch 1, g_loss_d: -3.5988, g_loss: -2.5418, d_loss_fake:  3.7657, d_loss_real: -1276.4741, d_loss_gp:  56.4749, d_loss: -707.9590, g_loss_attr_d:  1.0570, attr_d_loss_fake: -0.9970, attr_d_loss_real:  0.1558, attr_d_loss_gp:  0.0222, attr_d_loss: -0.6192
2024-09-27T18:16:29.100857Z train-time-gan: INFO Epoch 1 completed.
2024-09-27T18:16:38.863371Z train-time-gan: DEBUG Epoch 2, g_loss_d: -3.4925, g_loss: -0.5978, d_loss_fake:  3.5341, d_loss_real: -1124.4144, d_loss_gp:  49.0503, d_loss: -630.3773, g_loss_attr_d:  2.8947, attr_d_loss_fake: -2.8490, attr_d_loss_real:  1.3018, attr_d_loss_gp:  0.0051, attr_d_loss: -1.4959
2024-09-27T18:16:38.865829Z train-time-gan: INFO Epoch 2 completed.
2024

In [72]:
# Take the last model reported by the workflow (presumably the final trained
# model — confirm against the rockfish docs).
model = await workflow.models().last()

In [73]:
# Display the model handle (id, labels, create_time, size_bytes).
model

Model(id='1MvOnVFUAqUGKG29OwQFyD', labels={}, create_time=None, size_bytes=None)

In [74]:
from actions.dg.generate import GenerateTimeGAN
import rockfish.actions as ra
# Build a generation workflow: trained model -> GenerateTimeGAN -> save as dataset 'syn'.
builder = rf.WorkflowBuilder()
builder.add_path(model, GenerateTimeGAN(), ra.DatasetSave(name='syn'))
# NOTE: rebinds `workflow`, replacing the training workflow handle from earlier cells.
workflow = await builder.start(conn)
# Stream the generation logs at DEBUG level.
async for log in workflow.logs(level=rf.LogLevel.DEBUG):
    print(log)

2024-09-27T18:50:50.351250Z generate-time-gan: DEBUG CUDA not available
2024-09-27T18:50:50.352286Z generate-time-gan: INFO Downloading model with model_id='1MvOnVFUAqUGKG29OwQFyD'...
2024-09-27T18:50:50.673370Z generate-time-gan: INFO Generating 1000 sessions...
2024-09-27T18:50:51.424796Z generate-time-gan: DEBUG Generating data from model trained with 200 epochs


  state = torch.load(model_path, map_location=self.device)


2024-09-27T18:50:52.374174Z generate-time-gan: DEBUG DG generating 9 batches
2024-09-27T18:50:55.425106Z dataset-save: INFO using field 'session_key' to concatenate tables
2024-09-27T18:50:55.425665Z dataset-save: INFO Saved dataset '2esZxZVCdDMwH9CBkdwYwL' with 1450000 rows


In [75]:
# Concatenate the datasets produced by the workflow and pull the result to a
# local dataset object (two awaits: concat(), then to_local()).
syn_data = await (await workflow.datasets().concat(conn)).to_local()

In [76]:
# Preview the synthetic data as a DataFrame; note its session column is 'session_key'.
syn_data.to_pandas()

Unnamed: 0,timestamp,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,session_key
0,2023-08-25 20:21:58.627,0.770009,0.834152,0.606762,0.581446,0.471956,0.409995,0.465374,0.453411,0.418084,...,0.456487,0.616557,0.396816,0.602092,0.064566,0.018603,0.230886,0.025676,0.183125,0.0
1,2023-08-25 20:22:51.818,0.792195,0.828788,0.632858,0.578591,0.469476,0.436888,0.479058,0.440931,0.436340,...,0.448495,0.618604,0.397907,0.608816,0.059507,0.019224,0.217730,0.024848,0.184619,0.0
2,2023-08-25 20:23:43.114,0.782756,0.834185,0.616259,0.589215,0.444085,0.422867,0.495700,0.434247,0.456391,...,0.438327,0.600982,0.369038,0.611104,0.068542,0.018069,0.222513,0.028155,0.204036,0.0
3,2023-08-25 20:24:35.547,0.777797,0.831336,0.622582,0.581828,0.467977,0.402302,0.482502,0.452028,0.487276,...,0.460402,0.612609,0.398075,0.609185,0.063363,0.021988,0.217919,0.028443,0.195949,0.0
4,2023-08-25 20:25:26.370,0.766406,0.829849,0.622777,0.586063,0.491225,0.422437,0.473921,0.428229,0.417282,...,0.475370,0.592423,0.402056,0.613827,0.064094,0.017075,0.211074,0.026135,0.196993,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1449995,2023-08-04 20:39:45.419,0.770727,0.833024,0.557189,0.625313,0.525870,0.525427,0.551613,0.547947,0.517053,...,0.482994,0.611122,0.401736,0.659610,0.019801,0.010445,0.213378,0.257665,0.227950,999.0
1449996,2023-08-04 20:40:45.718,0.770893,0.833707,0.557588,0.625203,0.528022,0.526403,0.552776,0.547165,0.515614,...,0.483517,0.610663,0.399643,0.659272,0.019736,0.010522,0.212775,0.258671,0.225771,999.0
1449997,2023-08-04 20:41:45.794,0.771268,0.833628,0.557381,0.625232,0.523252,0.532679,0.552870,0.546473,0.516114,...,0.483943,0.610473,0.400360,0.658033,0.020050,0.010289,0.210844,0.259752,0.228345,999.0
1449998,2023-08-04 20:42:45.378,0.770453,0.832861,0.557619,0.625206,0.528001,0.526966,0.553787,0.549341,0.515284,...,0.482384,0.609188,0.401394,0.657388,0.020123,0.010420,0.211185,0.260265,0.226090,999.0


In [77]:
# Load the original data as a rockfish Dataset.
ogdat = rf.Dataset.from_csv('g', 'location3.csv')
# Start from an identity old-name -> new-name mapping, then point 'sessionID' at
# 'session_key', the session column name seen in the synthetic data.
col_names = dict(zip(ogdat.table.column_names, ogdat.table.column_names))
col_names.update(sessionID='session_key')
print(col_names)
# rename_columns takes the complete list of new names in column order.
ogdat.table = ogdat.table.rename_columns([*col_names.values()])
ogdat.table

{'feature_0': 'feature_0', 'feature_1': 'feature_1', 'feature_2': 'feature_2', 'feature_3': 'feature_3', 'feature_4': 'feature_4', 'feature_5': 'feature_5', 'feature_6': 'feature_6', 'feature_7': 'feature_7', 'feature_8': 'feature_8', 'feature_9': 'feature_9', 'feature_10': 'feature_10', 'feature_11': 'feature_11', 'feature_12': 'feature_12', 'feature_13': 'feature_13', 'feature_14': 'feature_14', 'feature_15': 'feature_15', 'feature_16': 'feature_16', 'feature_17': 'feature_17', 'feature_18': 'feature_18', 'feature_19': 'feature_19', 'feature_20': 'feature_20', 'feature_21': 'feature_21', 'feature_22': 'feature_22', 'feature_23': 'feature_23', 'feature_24': 'feature_24', 'timestamp': 'timestamp', 'sessionID': 'session_key'}


pyarrow.Table
feature_0: double
feature_1: double
feature_2: double
feature_3: double
feature_4: double
feature_5: double
feature_6: double
feature_7: double
feature_8: double
feature_9: double
feature_10: double
feature_11: double
feature_12: double
feature_13: double
feature_14: double
feature_15: double
feature_16: double
feature_17: double
feature_18: double
feature_19: double
feature_20: double
feature_21: double
feature_22: double
feature_23: double
feature_24: double
timestamp: timestamp[s]
session_key: int64
----
feature_0: [[0.7105338932441083,0.7104610637648477,0.7105518581893833,0.7107960066835955,0.7106657983459652,...,0.7655136596254362,0.7658230830026626,0.7659973077912131,0.76617326960971,0.7661338229028846],[0.7663005937594882,0.7662507609875995,0.7665008200328671,0.7663634493784262,0.7665331430281597,...,0.7008318584939333,0.7008063227631017,0.7009072720361443,0.7350970649729197,0.7352160209098437],...,[0.7602535172147298,0.7601045431836427,0.7602710572943935,0.7602593

In [78]:
import rockfish.labs as rl
# Score the synthetic data against the original with rockfish.labs'
# marginal_dist_score (0.859 in the captured output). The metric's exact
# definition is not shown here — see the rockfish.labs docs.
rl.metrics.marginal_dist_score(dataset=ogdat, syn=syn_data)

0.8591000054039627

In [79]:
# Export the synthetic data to CSV without the index column.
syn_data.to_pandas().to_csv('syn_data.csv', index=False)

In [80]:
# Second connection built via from_config(); the name suggests a remote
# endpoint — presumably taken from the local rockfish configuration, confirm.
conn_rem = rf.Connection.from_config()

In [85]:
# List models on the conn_rem connection whose 'kind' label is 'location3.csv'.
# NOTE(review): the loop variable rebinds `model`, overwriting the model handle
# fetched from the training workflow earlier in the notebook.
async for model in conn_rem.list_models(labels={'kind':'location3.csv'}):
    print(model)

Model(id='0a81be52-7d04-11ef-aaaa-26425b76ab20', labels={'kind': 'location3.csv', 'workflow_id': '20W36Hm78dCyiFyHIfbG5X'}, create_time=datetime.datetime(2024, 9, 27, 19, 9, 38, tzinfo=datetime.timezone.utc), size_bytes=4053121536)
