# DPML | Latency Replay

In this notebook, we investigate the reproducibility of transformation sequences captured by `dpml`.

## Load Dependencies

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from lineage import LeBatch
from lineage.transformation import DPMLClassWrapper, DPMLCallableWrapper
from lineage.utils import *

from sibyl import *
from datasets import concatenate_datasets, load_dataset

import os
import time
from tqdm.notebook import tqdm

## Create Datasets

In [3]:
dataset = load_dataset("glue", "sst2", split="train[:500]")
dataset = dataset.rename_column('sentence', 'text')

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


## Replay Test

### Routine to be Tracked

In [4]:
scheduler = SibylTransformScheduler("sentiment", class_wrapper=DPMLClassWrapper)
stochastic_list = [Concept2Sentence, ConceptMix, Emojify]

In [5]:
num_trials = 1
batch_size= 5

text, label = dataset['text'], dataset['label'] 
new_text, new_label = [], []

scheduler.num_INV = 1
scheduler.num_SIB = 1

transform_schedule = []
for i in tqdm(range(0, len(label), batch_size)):
    transforms = []
    for transform in scheduler.sample():
        if transform.wrapped_class in stochastic_list:
            continue
        transforms.append(transform)
    transform_schedule.append(transforms)

  0%|          | 0/100 [00:00<?, ?it/s]

## Replay with CSV

In [6]:
csv_file_pth = "dpml/lineage/storage/dpml.csv"
if os.path.exists(csv_file_pth):
    os.remove(csv_file_pth)
if os.path.exists("dpml/lineage/storage/transform.csv"):
    os.remove("dpml/lineage/storage/transform.csv")

In [7]:
ls "dpml/lineage/storage/"

 Volume in drive C is Windows-SSD
 Volume Serial Number is DA58-C5DE

 Directory of C:\Users\Fabrice\Documents\GitHub\dpml\after\dpml\lineage\storage

08/09/2022  09:18 PM    <DIR>          .
08/09/2022  03:26 PM    <DIR>          ..
07/27/2022  01:16 PM               312 __init__.py
07/27/2022  01:16 PM    <DIR>          __pycache__
08/09/2022  03:26 PM    <DIR>          csv
08/09/2022  03:26 PM    <DIR>          sqlalchemy
               1 File(s)            312 bytes
               5 Dir(s)  345,019,441,152 bytes free


In [8]:
no_lineage_times = []
replay_logging_times, replay_generation_times, num_mismatches = [], [], []
for trial in tqdm(range(num_trials)):
    no_lineage_text, no_lineage_targets = [], []
    replay_log_text, replay_log_targets = [], []
    
    # no lineage ====================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):
        
        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)
        for transform in t_sched:
            batch = transform.transform_batch(batch)
            
        no_lineage_text.extend(batch[0])
        no_lineage_targets.extend(batch[1])
        
    run_time = time.perf_counter() - startTime
    no_lineage_times.append(run_time)
    print('Elapsed time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))
    
    # replay logging ================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):
        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)
        
        if len(t_sched) == 0:
            continue
            
        with LeBatch(original_batch=batch) as le_batch:
            init_rng_state = []
            for transform in t_sched:
                batch = le_batch.apply(batch, transform.transform_batch)
            
        replay_log_text.extend([x.text for x in batch])
        replay_log_targets.extend([x.target for x in batch])
            
    run_time = time.perf_counter() - startTime
    replay_logging_times.append(run_time)
    print('Elapsed logging time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))
    
    # replay generation ==============================================================================================
    startTime = time.perf_counter()
    new_records = replay_all_from_csv()
    run_time = time.perf_counter() - startTime
    replay_generation_times.append(run_time)
    print('Elapsed replay time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))
    
    original_records = [(text, target) for text, target in zip(replay_log_text, replay_log_targets)]
    num_mismatch = 0
    counter = 0
    for old_r, new_r in zip(original_records, new_records):
        if old_r[0] != new_r[0] or np.any(old_r[1] != new_r[1]):
            num_mismatch += 1  
        counter += 1
    num_mismatches.append(num_mismatch)
    print('Replay mismatches for Trial {0}: {1}'.format(trial, num_mismatch))    
    
    # le_batch.transform_logger.clean_data_store()

  0%|          | 0/1 [00:00<?, ?it/s]

Elapsed time for Trial 0:  2.305 seconds
Elapsed logging time for Trial 0:  1.830 seconds
Elapsed replay time for Trial 0: 14.945 seconds
Replay mismatches for Trial 0: 0


In [9]:
print("no_lineage_times:", np.mean(no_lineage_times))
print("replay_logging_times:", np.mean(replay_logging_times))
print("replay_generation_times:", np.mean(replay_generation_times))
print("num_mismatches:", np.mean(num_mismatches))

no_lineage_times: 2.3047605999999945
replay_logging_times: 1.829979999999999
replay_generation_times: 14.9451417
num_mismatches: 0.0


## Investigating CSV Replay Time / Memory Consumption

In [10]:
from time import perf_counter

class catchtime(object):
    def __init__(self, name="Code Block"):
        self.name = name
        
    def __enter__(self):
        self.t = time.perf_counter()
        return self

    def __exit__(self, type, value, traceback):
        self.t = time.perf_counter() - self.t
        print('{0:6.3f}s : {1}'.format(self.t, self.name))

In [20]:
import copy

def find_initialized_tran_fn(transform_idx, t_prov):
    for k, v in transform_idx.items():
        if t_prov['class_name'] == v.func.__self__.__class__.__name__:
            return v

def replay_all_from_csv():
      
    with catchtime("Load CSVTransformLogger") as t:
        from lineage.storage.csv.transform_logger import TransformLogger as CSVTransformLogger
    
    # fetch data
    with catchtime("Load data") as t:
        logger = CSVTransformLogger()
        df = pd.read_csv(logger.path, header=None, names=['batch_id', 'text', 'target', 'transform_prov'])
        transform_df = pd.read_csv(logger.transform_path, header=None, index_col=0, names=['transform_id', 'transform'])
    
    with catchtime("Load batches + transform_set") as t:
        transform_set = set()
        batches = {}
        for idx, row in df.iterrows():
            bid = row['batch_id']
            if bid not in batches:
                batches[bid] = {'text':[], 'target':[], 'transform': []}

            batches[bid]['text'].append(row['text'])
            batches[bid]['target'].append(row['target'])

            if len(batches[bid]['transform']) == 0:
                batches[bid]['transform'] = eval(row['transform_prov'])
                transform_set = transform_set | set(batches[bid]['transform'])
                
    
    with catchtime("Load transforms") as t:
        new_transform_idx = {}
        for idx in transform_set:
            t_prov = json.loads(transform_df.loc[idx]['transform'])
            t_fn = find_initialized_tran_fn(new_transform_idx, t_prov)
            if t_fn:
                t_fn = copy.deepcopy(t_fn)
                rng_state = preprocess_params(t_prov['callable_rng_state'])
                random_generator = getattr(t_fn.func.__self__, t_prov['class_rng'])
                random_generator.__setstate__(rng_state)
                setattr(t_fn.func.__self__, t_prov['class_rng'], random_generator)
                new_transform_idx[idx] = t_fn
            else:
                new_transform_idx[idx] = load_transform_from_replay_provenance(t_prov)
                   

#     with catchtime("Load transforms") as t:
#         transform_idx = {}
#         for idx in transform_set:
#             t_prov = json.loads(transform_df.loc[idx]['transform'])
#             t_fn = copy.deepcopy(find_initialized_tran_fn(transform_idx, t_prov))
#             if not t_fn:
#                 t_fn = load_transform_from_replay_provenance(t_prov)
#             rng_state = preprocess_params(t_prov['callable_rng_state'])
#             random_generator = getattr(t_fn.func.__self__, t_prov['class_rng'])
#             random_generator.__setstate__(rng_state)
#             setattr(t_fn.func.__self__, t_prov['class_rng'], random_generator)
#             transform_idx[idx] = t_fn

    # replay
    with catchtime("Replay") as t:
        new_records = []
        for batch_id in sorted(list(batches.keys())):
            batch = (batches[batch_id]['text'], batches[batch_id]['target'])
            for t_fn_id in batches[batch_id]['transform']:
                t_fn = transform_idx[t_fn_id]
                batch = t_fn(batch)
            texts, labels = batch
            new_records += [(x, y) for x,y in zip(texts, labels)]
    return new_records

In [21]:
new_records = replay_all_from_csv()

 0.000s : Load CSVTransformLogger
 0.006s : Load data
 0.019s : Load batches + transform_set
13.668s : Load transforms
 1.451s : Replay


In [22]:
original_records = [(text, target) for text, target in zip(replay_log_text, replay_log_targets)]
num_mismatch = 0
counter = 0
for old_r, new_r in zip(original_records, new_records):
    if old_r[0] != new_r[0] or np.any(old_r[1] != new_r[1]):
        num_mismatch += 1  
    counter += 1
num_mismatches.append(num_mismatch)
print('Replay mismatches for Trial {0}: {1}'.format(trial, num_mismatch))   

Replay mismatches for Trial 0: 380


In [63]:
for t1, t2 in list(zip(transform_idx.values(), new_transform_idx.values())):
    print(t1.func.__self__.__class__.__name__)
    print(t2.func.__self__.__class__.__name__)
    print(t1.func.__self__.np_random.__getstate__())
    print(t2.func.__self__.np_random.__getstate__())
    print()

SentMix
SentMix
{'bit_generator': 'PCG64', 'state': {'state': 64698817949135605219087001682552842832, 'inc': 16450919397810582319219321886622321693}, 'has_uint32': 0, 'uinteger': 1502305844}
{'bit_generator': 'PCG64', 'state': {'state': 95408361110169272194908061357520512207, 'inc': 16450919397810582319219321886622321693}, 'has_uint32': 1, 'uinteger': 1450944720}

ChangeSynonym
ChangeSynonym
{'bit_generator': 'PCG64', 'state': {'state': 261114468015768180822257607760104456251, 'inc': 16450919397810582319219321886622321693}, 'has_uint32': 0, 'uinteger': 355978622}
{'bit_generator': 'PCG64', 'state': {'state': 231633822897973812995342589916072236840, 'inc': 16450919397810582319219321886622321693}, 'has_uint32': 1, 'uinteger': 375110666}

InsertPositivePhrase
InsertPositivePhrase
{'bit_generator': 'PCG64', 'state': {'state': 211229263477617072728739240846717366882, 'inc': 16450919397810582319219321886622321693}, 'has_uint32': 0, 'uinteger': 676651057}
{'bit_generator': 'PCG64', 'state': {

## Replay with SQL

In [19]:
no_lineage_times = []
replay_logging_times, replay_generation_times, num_mismatches = [], [], []
for trial in tqdm(range(num_trials)):
    no_lineage_text, no_lineage_targets = [], []
    replay_log_text, replay_log_targets = [], []
    
    # no lineage ====================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):
        
        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)
        for transform in t_sched:
            batch = transform.transform_batch(batch)
            
        no_lineage_text.extend(batch[0])
        no_lineage_targets.extend(batch[1])
        
    run_time = time.perf_counter() - startTime
    no_lineage_times.append(run_time)
    print('Elapsed time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))
    
    # replay logging ================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):
        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)
        
        if len(t_sched) == 0:
            continue
            
        with LeBatch(original_batch=batch) as le_batch:
            init_rng_state = []
            for transform in t_sched:
                batch = le_batch.apply(batch, transform.transform_batch)
            
        replay_log_text.extend([x.text for x in batch])
        replay_log_targets.extend([x.target for x in batch])
            
    run_time = time.perf_counter() - startTime
    replay_logging_times.append(run_time)
    print('Elapsed logging time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))
    
    # replay generation ==============================================================================================
    startTime = time.perf_counter()
    new_records = replay_all_from_db()
    run_time = time.perf_counter() - startTime
    replay_generation_times.append(run_time)
    print('Elapsed replay time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))
    
    original_records = [(text, target) for text, target in zip(replay_log_text, replay_log_targets)]
    num_mismatch = 0
    counter = 0
    for old_r, new_r in zip(original_records, new_records):
        if old_r[0] != new_r[0] or np.any(old_r[1] != new_r[1]):
            num_mismatch += 1  
        counter += 1
    num_mismatches.append(num_mismatch)
    print('Replay mismatches for Trial {0}: {1}'.format(trial, num_mismatch))    
    
    # truncate all table data
    # le_batch.transform_logger.clean_db()
        
#     if os.path.exists("./dpml/lineage/storage/dpml.db"):
#         os.remove("./dpml/lineage/storage/dpml.db")

  0%|          | 0/1 [00:00<?, ?it/s]

Elapsed time for Trial 0:  0.003 seconds
Elapsed logging time for Trial 0:  0.169 seconds
Elapsed replay time for Trial 0:  0.008 seconds
Replay mismatches for Trial 0: 5


In [12]:
from lineage.storage.sqlalchemy import *
from sqlalchemy import select

In [21]:
logger = TransformLogger()
    
print('Record')
stmt = select(Record)
with logger.engine.connect() as conn:
    for row in conn.execute(stmt):
        print(row._mapping)

print('Transform')
stmt = select(Transform)
with logger.engine.connect() as conn:
    for row in conn.execute(stmt):
        print(row._mapping)

print('TransformApplied')
stmt = select(TransformApplied)
with logger.engine.connect() as conn:
    for row in conn.execute(stmt):
        print(row._mapping)

Record
{'id': 1, 'text': 'hide new secretions from the parental units ', 'target': '0', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
{'id': 2, 'text': 'contains no wit , only labored gags ', 'target': '0', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
{'id': 3, 'text': 'that loves its characters and communicates something rather beautiful about human nature ', 'target': '1', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
{'id': 4, 'text': 'remains utterly satisfied to remain the same throughout ', 'target': '0', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
{'id': 5, 'text': 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ', 'target': '0', 'created_at': datetime.datetime(2022, 8, 9, 22, 36, 38)}
Transform
{'id': 1, 'module_name': 'sibyl.transformations.text.insertion.sentiment_phrase', 'class_name': 'InsertPositivePhrase', 'class_args': 'null', 'class_kwargs': '{"task_name": "sentiment", "return_metadata": true}', 'c

## Transformation Wrappers

In [26]:
type(t_orig().np_random)

numpy.random._generator.Generator

In [11]:
text = ["This is a test.", "This isn't a test!"]
target = [0, 1]
batch = (text, target)

t_orig = TRANSFORMATIONS[0]

In [12]:
print("DPMLClassWrapper")

t_class_wrapped = DPMLClassWrapper(t_orig)
t_class_wrapped = t_class_wrapped(task_name="sentiment", return_metadata=True)

batch = t_class_wrapped.transform_batch(batch)

print("DPMLClassWrapper | transform_batch")
print(batch)
print("_class_name:", t_class_wrapped._class_name)
print("_class_args:", t_class_wrapped._class_args)
print("_class_kwargs:", t_class_wrapped._class_kwargs)
print("_class_rng:", t_class_wrapped._class_rng)
print("_callable_name:", t_class_wrapped._callable_name)
print("_callable_args:", t_class_wrapped._callable_args)
print("_callable_kwargs:", t_class_wrapped._callable_kwargs)
print("_callable_rng_state:", t_class_wrapped._callable_rng_state)

X, y, meta = t_class_wrapped.transform_Xy(text[1], target[1])

print("DPMLClassWrapper | transform_Xy")
print(X, y)
print("_class_name:", t_class_wrapped._class_name)
print("_class_args:", t_class_wrapped._class_args)
print("_class_kwargs:", t_class_wrapped._class_kwargs)
print("_class_rng:", t_class_wrapped._class_rng)
print("_callable_name:", t_class_wrapped._callable_name)
print("_callable_args:", t_class_wrapped._callable_args)
print("_callable_kwargs:", t_class_wrapped._callable_kwargs)
print("_callable_rng_state:", t_class_wrapped._callable_rng_state)

DPMLClassWrapper
DPMLClassWrapper | transform_batch
(['This is a test.', 'This is not a test!'], [0, 1])
_class_name: ExpandContractions
_class_args: []
_class_kwargs: {'task_name': 'sentiment', 'return_metadata': True}
_class_rng: Generator(PCG64)
_callable_name: transform_batch
_callable_args: []
_callable_kwargs: []
_callable_rng_state: {'bit_generator': 'PCG64', 'state': {'state': 129413257090554225206130458028910539494, 'inc': 16450919397810582319219321886622321693}, 'has_uint32': 0, 'uinteger': 0}
DPMLClassWrapper | transform_Xy
This is not a test! 1
_class_name: ExpandContractions
_class_args: []
_class_kwargs: {'task_name': 'sentiment', 'return_metadata': True}
_class_rng: Generator(PCG64)
_callable_name: transform_Xy
_callable_args: []
_callable_kwargs: []
_callable_rng_state: {'bit_generator': 'PCG64', 'state': {'state': 129413257090554225206130458028910539494, 'inc': 16450919397810582319219321886622321693}, 'has_uint32': 0, 'uinteger': 0}


In [75]:
t_init = t_orig(task_name="sentiment", return_metadata=True)

t_callable_wrapped = DPMLCallableWrapper(t_init.transform_batch)
batch = t_callable_wrapped(batch)

print("DPMLCallableWrapper | transform_batch")
print(batch)
print("_callable_name", t_callable_wrapped._callable_name)
print("_callable_args", t_callable_wrapped._callable_args)
print("_callable_kwargs", t_callable_wrapped._callable_kwargs)

t_callable_wrapped = DPMLCallableWrapper(t_init.transform_Xy)
X, y, meta = t_callable_wrapped(text[1], target[1])

print("DPMLCallableWrapper | transform_Xy")
print(X, y)
print("_callable_name", t_callable_wrapped._callable_name)
print("_callable_args", t_callable_wrapped._callable_args)
print("_callable_kwargs", t_callable_wrapped._callable_kwargs)

DPMLCallableWrapper | transform_batch
(['hide new secretions from the parental units ', 'contains no wit , only labored gags '], [0, 0])
_callable_name ('transform_batch',)
_callable_args []
_callable_kwargs []
DPMLCallableWrapper | transform_Xy
contains no wit , only labored gags  1
_callable_name ('transform_Xy',)
_callable_args []
_callable_kwargs []
