# DPML | Latency Replay

In this notebook, we investigate the reproducibility of transformation sequences captured by `dpml`, and measure the latency of logging and replaying them with the CSV and SQL storage backends.

## Load Dependencies

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from lineage import LeBatch
from lineage.transformation import DPMLClassWrapper, DPMLCallableWrapper
from lineage.utils import *

from sibyl import *
from datasets import concatenate_datasets, load_dataset

import os
import time
from tqdm.notebook import tqdm

## Create Datasets

In [41]:
# First 50,000 rows of the SST-2 training split. The 'sentence' column is
# renamed to 'text', which is the key the schedule-building cell reads.
dataset = load_dataset("glue", "sst2", split="train[:50000]").rename_column('sentence', 'text')

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


## Replay Test

### Routine to be Tracked

In [55]:
# NOTE(review): hidden-state dependency. `scheduler` is first assigned in the
# cell that follows this one, so this cell raises NameError under
# Restart Kernel -> Run All. Run it after the scheduler cell (or move it below).
len(scheduler.tran_df)

39

In [4]:
# Transformation classes that are filtered out of every sampled schedule
# when the transform schedule is built.
stochastic_list = [
    Concept2Sentence,
    ConceptMix,
    Emojify,
]

# Scheduler for the "sentiment" task; DPMLClassWrapper is passed as the
# class wrapper for the transforms it produces.
scheduler = SibylTransformScheduler(
    "sentiment",
    class_wrapper=DPMLClassWrapper,
)

In [42]:
# Benchmark configuration: number of timed repetitions and rows per batch.
num_trials = 3
batch_size = 10

# Materialise the columns once so the timed loops only slice Python lists.
text, label = dataset['text'], dataset['label']
new_text, new_label = [], []

scheduler.num_INV = 1
scheduler.num_SIB = 1

# One list of transforms per batch, fixed up front so every trial and every
# pipeline variant applies the same sequence to the same batch.
# NOTE(review): no seed is set here; if scheduler.sample() is random, the
# schedule will differ between kernel restarts - confirm.
transform_schedule = []
for batch_start in tqdm(range(0, len(label), batch_size)):
    transform_schedule.append([
        candidate
        for candidate in scheduler.sample()
        if candidate.wrapped_class not in stochastic_list
    ])

  0%|          | 0/5000 [00:00<?, ?it/s]

## Investigating CSV Replay Time / Memory Consumption

In [11]:
from time import perf_counter

class catchtime(object):
    """Context manager that times its ``with`` body and prints the result.

    While the body runs, ``t`` holds the ``time.perf_counter()`` reading taken
    on entry. On exit ``t`` is overwritten with the elapsed time in seconds,
    and a line of the form ``"<elapsed>s : <name>"`` is printed.
    """

    def __init__(self, name="Code Block"):
        # Label printed next to the elapsed time.
        self.name = name

    def __enter__(self):
        started_at = time.perf_counter()
        self.t = started_at
        return self

    def __exit__(self, type, value, traceback):
        # Replace the start reading with the elapsed duration so callers can
        # read it from ``.t`` after the ``with`` block.
        elapsed = time.perf_counter() - self.t
        self.t = elapsed
        report = '{0:6.3f}s : {1}'.format(elapsed, self.name)
        print(report)

In [22]:
def set_rng_state(fn, attr, state):
    """Restore a logged RNG state on the object that owns ``fn``'s bound method.

    Parameters
    ----------
    fn : object exposing ``func``, a bound method (``fn.func.__self__`` is
        the object whose generator gets reset).
    attr : str
        Name of the attribute on that object that holds the random generator.
    state : logged generator state; it is passed through
        ``preprocess_params`` before being handed to ``__setstate__``.

    Returns
    -------
    The same ``fn`` that was passed in, for call chaining.
    """
    restored_state = preprocess_params(state)
    owner = fn.func.__self__
    generator = getattr(owner, attr)
    generator.__setstate__(restored_state)
    # Re-assign the generator on its owner, exactly as the setter path expects.
    setattr(owner, attr, generator)
    return fn

def replay_all_from_csv():
    """Re-run every transform sequence recorded by the CSV transform logger.

    The function reads the record CSV and the transform CSV written by the
    CSV ``TransformLogger``, groups the original rows by batch id, rebuilds
    each distinct transform once, and then re-applies each batch's transform
    sequence after restoring the RNG state that was logged with it. Each
    stage is timed and reported through ``catchtime``.

    Returns
    -------
    tuple
        ``(new_records, load_time, replay_time)`` where ``new_records`` is a
        list of ``(text, label)`` tuples in ascending batch-id order,
        ``load_time`` is the seconds spent in the "Load transforms" stage
        only, and ``replay_time`` is the seconds spent in the "Replay" stage.
    """
    # Imported inside a timed block so the import cost shows up in the report.
    with catchtime("Load CSVTransformLogger") as t:
        from lineage.storage.csv.transform_logger import TransformLogger as CSVTransformLogger

    # fetch data
    with catchtime("Load data") as t:
        logger = CSVTransformLogger()
        df = pd.read_csv(logger.path, header=None, names=['batch_id', 'text', 'target', 'transform_prov'])
        transform_df = pd.read_csv(logger.transform_path, header=None, index_col=0, names=['transform_id', 'transform'])

    with catchtime("Load batches + transform_set") as t:
        transform_idxs = set()
        batches = {}
        for _, row in df.iterrows():
            bid = row['batch_id']
            if bid not in batches:
                batches[bid] = {'text': [], 'target': [], 'transform': []}
            entry = batches[bid]

            entry['text'].append(row['text'])
            entry['target'].append(row['target'])

            # The transform id list is taken from the first row of a batch
            # that provides a non-empty one.
            if len(entry['transform']) == 0:
                # NOTE(security): eval() executes whatever is stored in the
                # CSV. Acceptable only while the file is produced by this
                # notebook; consider ast.literal_eval if the file can come
                # from anywhere else.
                entry['transform'] = eval(row['transform_prov'])
                transform_idxs.update(entry['transform'])

    with catchtime("Load transforms") as t:
        transforms = []          # distinct rebuilt transform callables
        random_states = {}       # transform id -> (rng attribute name, rng state)
        position_by_hash = {}    # provenance hash -> index into `transforms`
        mapping = {}             # transform id -> index into `transforms`
        for idx in transform_idxs:
            t_prov = json.loads(transform_df.loc[idx]['transform'])
            # Keyed by transform id: a list filled in set-iteration order is
            # only aligned with the ids when they happen to be 0..n-1 and the
            # set happens to iterate in ascending order.
            random_states[idx] = (t_prov.pop('class_rng'), t_prov.pop('callable_rng_state'))

            # Provenance with the RNG fields removed identifies the callable,
            # so transforms differing only in RNG state are built once.
            t_prov_hash = hash(repr(t_prov))
            if t_prov_hash not in position_by_hash:
                position_by_hash[t_prov_hash] = len(transforms)
                transforms.append(load_transform_from_replay_provenance(t_prov))
            mapping[idx] = position_by_hash[t_prov_hash]
    load_time = t.t

    # replay
    with catchtime("Replay") as t:
        new_records = []
        for batch_id in sorted(batches):
            batch = (batches[batch_id]['text'], batches[batch_id]['target'])
            for idx in batches[batch_id]['transform']:
                rs_attr, rs_info = random_states[idx]
                # Reset the shared callable's generator to the state logged
                # for this particular application before calling it.
                t_fn = set_rng_state(transforms[mapping[idx]], rs_attr, rs_info)
                batch = t_fn(batch)
            texts, labels = batch
            new_records.extend(zip(texts, labels))
    replay_time = t.t

    return new_records, load_time, replay_time

## Replay with CSV

In [43]:
# Start from an empty CSV store: delete the record file and the transform
# file if a previous run left them behind.
csv_file_pth = "dpml/lineage/storage/dpml.csv"
for stale_path in (csv_file_pth, "dpml/lineage/storage/transform.csv"):
    if os.path.exists(stale_path):
        os.remove(stale_path)

In [44]:
ls "dpml/lineage/storage/"

 Volume in drive C is Windows-SSD
 Volume Serial Number is DA58-C5DE

 Directory of C:\Users\Fabrice\Documents\GitHub\dpml\after\dpml\lineage\storage

08/10/2022  12:23 PM    <DIR>          .
08/09/2022  03:26 PM    <DIR>          ..
07/27/2022  01:16 PM               312 __init__.py
07/27/2022  01:16 PM    <DIR>          __pycache__
08/09/2022  03:26 PM    <DIR>          csv
08/09/2022  03:26 PM    <DIR>          sqlalchemy
               1 File(s)            312 bytes
               5 Dir(s)  346,171,846,656 bytes free


In [45]:
# CSV backend benchmark. Each trial times three passes over the same
# pre-built transform schedule:
#   1. "no lineage"        - transforms applied directly,
#   2. "replay logging"    - transforms applied through LeBatch,
#   3. "replay generation" - the logged run re-executed by replay_all_from_csv(),
# and then compares the replayed records with the logged run's output.
no_lineage_times = []
replay_logging_times, replay_fn_load_times, replay_generation_times, num_mismatches = [], [], [], []
for trial in tqdm(range(num_trials)):
    no_lineage_text, no_lineage_targets = [], []
    replay_log_text, replay_log_targets = [], []

    # no lineage ====================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):

        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)
        for transform in t_sched:
            batch = transform.transform_batch(batch)

        no_lineage_text.extend(batch[0])
        no_lineage_targets.extend(batch[1])

    run_time = time.perf_counter() - startTime
    no_lineage_times.append(run_time)
    print('Elapsed time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))

    # replay logging ================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):
        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)

        # Batches with an empty schedule are not logged, so they are absent
        # from both sides of the comparison below.
        if len(t_sched) == 0:
            continue

        with LeBatch(original_batch=batch) as le_batch:
            for transform in t_sched:
                batch = le_batch.apply(batch, transform.transform_batch)

        # After LeBatch, items are record objects exposing .text / .target.
        replay_log_text.extend([x.text for x in batch])
        replay_log_targets.extend([x.target for x in batch])

    run_time = time.perf_counter() - startTime
    replay_logging_times.append(run_time)
    print('Elapsed logging time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))

    # replay generation ==============================================================================================
    startTime = time.perf_counter()
    new_records, load_time, replay_time = replay_all_from_csv()
    run_time = time.perf_counter() - startTime
    replay_fn_load_times.append(load_time)
    replay_generation_times.append(replay_time)
    print('Elapsed replay time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))

    original_records = list(zip(replay_log_text, replay_log_targets))
    num_mismatch = sum(
        1
        for old_r, new_r in zip(original_records, new_records)
        if old_r[0] != new_r[0] or np.any(old_r[1] != new_r[1])
    )
    # zip() stops at the shorter sequence, so records that the replay dropped
    # or added would otherwise go uncounted and a broken replay could still
    # report 0 mismatches.
    num_mismatch += abs(len(original_records) - len(new_records))
    num_mismatches.append(num_mismatch)
    print('Replay mismatches for Trial {0}: {1}'.format(trial, num_mismatch))

    # del original_records, new_records

    # Reset the logger's store before the next trial.
    # NOTE(review): `le_batch` is whatever the last logged batch left behind;
    # it is undefined if every schedule was empty.
    le_batch.transform_logger.clean_data_store()

  0%|          | 0/3 [00:00<?, ?it/s]

Elapsed time for Trial 0: 167.340 seconds
Elapsed logging time for Trial 0: 187.849 seconds
 0.000s : Load CSVTransformLogger
 0.129s : Load data
 2.732s : Load batches + transform_set
 5.627s : Load transforms
155.464s : Replay
Elapsed replay time for Trial 0: 163.978 seconds
Replay mismatches for Trial 0: 0
Elapsed time for Trial 1: 157.105 seconds
Elapsed logging time for Trial 1: 188.200 seconds
 0.000s : Load CSVTransformLogger
 0.123s : Load data
 2.106s : Load batches + transform_set
 5.394s : Load transforms
158.944s : Replay
Elapsed replay time for Trial 1: 166.598 seconds
Replay mismatches for Trial 1: 0
Elapsed time for Trial 2: 161.151 seconds
Elapsed logging time for Trial 2: 194.913 seconds
 0.000s : Load CSVTransformLogger
 0.129s : Load data
 2.060s : Load batches + transform_set
 4.955s : Load transforms
158.782s : Replay
Elapsed replay time for Trial 2: 165.958 seconds
Replay mismatches for Trial 2: 0


In [46]:
# Mean of each per-trial measurement collected by the CSV benchmark loop.
csv_benchmark_summary = (
    ("no_lineage_times:", no_lineage_times),
    ("replay_logging_times:", replay_logging_times),
    ("replay_fn_load_times:", replay_fn_load_times),
    ("replay_generation_times:", replay_generation_times),
    ("num_mismatches:", num_mismatches),
)
for summary_label, per_trial_values in csv_benchmark_summary:
    print(summary_label, np.mean(per_trial_values))

no_lineage_times: 161.86528426666675
replay_logging_times: 190.3204662666667
replay_fn_load_times: 5.325154599999905
replay_generation_times: 157.73009239999988
num_mismatches: 0.0


In [47]:
# new_records = replay_all_from_csv()
# 
# original_records = [(text, target) for text, target in zip(replay_log_text, replay_log_targets)]
# num_mismatch = 0
# counter = 0
# for old_r, new_r in zip(original_records, new_records):
#     if old_r[0] != new_r[0] or np.any(old_r[1] != new_r[1]):
#         num_mismatch += 1  
#     counter += 1
# num_mismatches.append(num_mismatch)
# print('Replay mismatches for Trial {0}: {1}'.format(trial, num_mismatch))   

In [48]:
# list(zip(original_records, new_records))

In [49]:
# for t1, t2 in list(zip(transform_idx.values(), new_transform_idx.values())):
#     print(t1.func.__self__.__class__.__name__)
#     print(t2.func.__self__.__class__.__name__)
#     print(t1.func.__self__.np_random.__getstate__())
#     print(t2.func.__self__.np_random.__getstate__())
#     print()

## Replay with SQL

In [19]:
# SQL backend benchmark. Same three timed passes as the CSV benchmark, but
# the logged run is re-executed with replay_all_from_db().
# NOTE(review): replay_all_from_db is not defined in any cell visible in this
# notebook; it has to be defined or imported before this cell runs - confirm
# where it comes from.
# NOTE(review): the saved output of this cell reports 5 mismatches, and the
# Record table dump further down shows `target` stored as a string ('0'). If
# the replay returns those strings, `old_r[1] != new_r[1]` compares int with
# str and every record counts as a mismatch - confirm.
no_lineage_times = []
replay_logging_times, replay_generation_times, num_mismatches = [], [], []
for trial in tqdm(range(num_trials)):
    no_lineage_text, no_lineage_targets = [], []
    replay_log_text, replay_log_targets = [], []

    # no lineage ====================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):

        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)
        for transform in t_sched:
            batch = transform.transform_batch(batch)

        no_lineage_text.extend(batch[0])
        no_lineage_targets.extend(batch[1])

    run_time = time.perf_counter() - startTime
    no_lineage_times.append(run_time)
    print('Elapsed time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))

    # replay logging ================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):
        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)

        # Batches with an empty schedule are not logged, so they are absent
        # from both sides of the comparison below.
        if len(t_sched) == 0:
            continue

        with LeBatch(original_batch=batch) as le_batch:
            for transform in t_sched:
                batch = le_batch.apply(batch, transform.transform_batch)

        # After LeBatch, items are record objects exposing .text / .target.
        replay_log_text.extend([x.text for x in batch])
        replay_log_targets.extend([x.target for x in batch])

    run_time = time.perf_counter() - startTime
    replay_logging_times.append(run_time)
    print('Elapsed logging time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))

    # replay generation ==============================================================================================
    startTime = time.perf_counter()
    new_records = replay_all_from_db()
    run_time = time.perf_counter() - startTime
    replay_generation_times.append(run_time)
    print('Elapsed replay time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))

    original_records = list(zip(replay_log_text, replay_log_targets))
    num_mismatch = sum(
        1
        for old_r, new_r in zip(original_records, new_records)
        if old_r[0] != new_r[0] or np.any(old_r[1] != new_r[1])
    )
    # zip() stops at the shorter sequence, so records that the replay dropped
    # or added would otherwise go uncounted and a broken replay could still
    # report 0 mismatches.
    num_mismatch += abs(len(original_records) - len(new_records))
    num_mismatches.append(num_mismatch)
    print('Replay mismatches for Trial {0}: {1}'.format(trial, num_mismatch))

    # NOTE(review): with the cleanup below disabled, nothing in this loop
    # resets the data store between trials - confirm that is intended.
    # truncate all table data
    # le_batch.transform_logger.clean_db()

#     if os.path.exists("./dpml/lineage/storage/dpml.db"):
#         os.remove("./dpml/lineage/storage/dpml.db")

  0%|          | 0/1 [00:00<?, ?it/s]

Elapsed time for Trial 0:  0.003 seconds
Elapsed logging time for Trial 0:  0.169 seconds
Elapsed replay time for Trial 0:  0.008 seconds
Replay mismatches for Trial 0: 5


In [12]:
from lineage.storage.sqlalchemy import *
from sqlalchemy import select

In [21]:
# Print every row of the three lineage tables, each under its own heading.
logger = TransformLogger()

for table_heading, table_model in (
    ('Record', Record),
    ('Transform', Transform),
    ('TransformApplied', TransformApplied),
):
    print(table_heading)
    query = select(table_model)
    # A fresh connection per table, closed as soon as its rows are printed.
    with logger.engine.connect() as connection:
        for result_row in connection.execute(query):
            print(result_row._mapping)

Record
{'id': 1, 'text': 'hide new secretions from the parental units ', 'target': '0', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
{'id': 2, 'text': 'contains no wit , only labored gags ', 'target': '0', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
{'id': 3, 'text': 'that loves its characters and communicates something rather beautiful about human nature ', 'target': '1', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
{'id': 4, 'text': 'remains utterly satisfied to remain the same throughout ', 'target': '0', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
{'id': 5, 'text': 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ', 'target': '0', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
Transform
{'id': 1, 'module_name': 'sibyl.transformations.text.insertion.sentiment_phrase', 'class_name': 'InsertPositivePhrase', 'class_args': 'null', 'class_kwargs': '{"task_name": "sentiment", "return_metadata": true}', 'c

## Transformation Wrappers

In [26]:
# NOTE(review): hidden-state dependency. `t_orig` is first assigned in the
# cell that follows this one, so this cell raises NameError under
# Restart Kernel -> Run All. Run it after that cell (or move it below).
type(t_orig().np_random)

numpy.random._generator.Generator

In [11]:
# Two-sentence toy batch for the wrapper demos below.
# NOTE(review): `text` and `batch` reuse the names the benchmark cells read,
# so re-running a benchmark cell after this one picks up these two sentences
# instead of the dataset column.
text, target = ["This is a test.", "This isn't a test!"], [0, 1]
batch = (text, target)

# First entry of TRANSFORMATIONS serves as the transformation under test.
t_orig = TRANSFORMATIONS[0]

In [12]:
print("DPMLClassWrapper")

# Attributes of the wrapped instance that are printed after each call.
provenance_fields = (
    "_class_name",
    "_class_args",
    "_class_kwargs",
    "_class_rng",
    "_callable_name",
    "_callable_args",
    "_callable_kwargs",
    "_callable_rng_state",
)

# Wrap the transformation class, then instantiate it through the wrapper.
t_class_wrapped = DPMLClassWrapper(t_orig)(task_name="sentiment", return_metadata=True)

batch = t_class_wrapped.transform_batch(batch)

print("DPMLClassWrapper | transform_batch")
print(batch)
for field in provenance_fields:
    print(f"{field}:", getattr(t_class_wrapped, field))

X, y, meta = t_class_wrapped.transform_Xy(text[1], target[1])

print("DPMLClassWrapper | transform_Xy")
print(X, y)
for field in provenance_fields:
    print(f"{field}:", getattr(t_class_wrapped, field))

DPMLClassWrapper
DPMLClassWrapper | transform_batch
(['This is a test.', 'This is not a test!'], [0, 1])
_class_name: ExpandContractions
_class_args: []
_class_kwargs: {'task_name': 'sentiment', 'return_metadata': True}
_class_rng: Generator(PCG64)
_callable_name: transform_batch
_callable_args: []
_callable_kwargs: []
_callable_rng_state: {'bit_generator': 'PCG64', 'state': {'state': 129413257090554225206130458028910539494, 'inc': 16450919397810582319219321886622321693}, 'has_uint32': 0, 'uinteger': 0}
DPMLClassWrapper | transform_Xy
This is not a test! 1
_class_name: ExpandContractions
_class_args: []
_class_kwargs: {'task_name': 'sentiment', 'return_metadata': True}
_class_rng: Generator(PCG64)
_callable_name: transform_Xy
_callable_args: []
_callable_kwargs: []
_callable_rng_state: {'bit_generator': 'PCG64', 'state': {'state': 129413257090554225206130458028910539494, 'inc': 16450919397810582319219321886622321693}, 'has_uint32': 0, 'uinteger': 0}


In [75]:
# Instantiate the transformation directly, then wrap individual bound methods.
t_init = t_orig(task_name="sentiment", return_metadata=True)

# Attributes of the callable wrapper that are printed after each call.
callable_fields = ("_callable_name", "_callable_args", "_callable_kwargs")

t_callable_wrapped = DPMLCallableWrapper(t_init.transform_batch)
batch = t_callable_wrapped(batch)

print("DPMLCallableWrapper | transform_batch")
print(batch)
for field in callable_fields:
    print(field, getattr(t_callable_wrapped, field))

t_callable_wrapped = DPMLCallableWrapper(t_init.transform_Xy)
X, y, meta = t_callable_wrapped(text[1], target[1])

print("DPMLCallableWrapper | transform_Xy")
print(X, y)
for field in callable_fields:
    print(field, getattr(t_callable_wrapped, field))

DPMLCallableWrapper | transform_batch
(['hide new secretions from the parental units ', 'contains no wit , only labored gags '], [0, 0])
_callable_name ('transform_batch',)
_callable_args []
_callable_kwargs []
DPMLCallableWrapper | transform_Xy
contains no wit , only labored gags  1
_callable_name ('transform_Xy',)
_callable_args []
_callable_kwargs []
