# DPML | Latency Replay

In this notebook, we investigate the reproducibility of transformation sequences captured by `dpml`.

## Load Dependencies

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from lineage import LeBatch
from lineage.transformation import DPMLClassWrapper, DPMLCallableWrapper
from lineage.utils import replay_all_from_csv

from sibyl import *
from datasets import concatenate_datasets, load_dataset

import os
import time
from tqdm.notebook import tqdm

## Create Datasets

In [16]:
dataset = load_dataset("glue", "sst2", split="train[:50000]")
dataset = dataset.rename_column('sentence', 'text')

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


## Replay Test

### Routine to be Tracked

In [4]:
scheduler = SibylTransformScheduler("sentiment", class_wrapper=DPMLClassWrapper)
stochastic_list = [Concept2Sentence, ConceptMix, ChangeSynonym, ChangeAntonym, ChangeHyponym, TextMix, SentMix, WordMix]

In [None]:
num_trials = 3
batch_size= 10

text, label = dataset['text'], dataset['label'] 
new_text, new_label = [], []

scheduler.num_INV = 1
scheduler.num_SIB = 1

transform_schedule = []
for i in tqdm(range(0, len(label), batch_size)):
    transforms = []
    for transform in scheduler.sample():
        if transform.wrapped_class in stochastic_list:
            continue
        transforms.append(transform)
    transform_schedule.append(transforms)

no_lineage_text, no_lineage_targets, no_lineage_times = [], [], []
replay_logging_records, replay_logging_times, replay_generation_times, num_mismatches = [], [], [], []
for trial in tqdm(range(num_trials)):
    
    # no lineage ====================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):
        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)
        for transform in t_sched:
            batch = transform.transform_batch(batch)
        no_lineage_text.extend(batch[0])
        no_lineage_targets.extend(batch[1])
    run_time = time.perf_counter() - startTime
    no_lineage_times.append(run_time)
    print('Elapsed time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))
    
    # replay logging ================================================================================================
    startTime = time.perf_counter()
    for i, t_sched in zip(range(0, len(label), batch_size), transform_schedule):
        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = (text_batch, label_batch)
        with LeBatch(original_batch=batch) as le_batch:
            for transform in t_sched:
                batch = le_batch.apply(batch, transform.transform_batch)
            replay_logging_records.extend(batch)
    run_time = time.perf_counter() - startTime
    replay_logging_times.append(run_time)
    print('Elapsed logging time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))
    
    # replay generation ==============================================================================================
    
    startTime = time.perf_counter()
    new_records = replay_all_from_csv()
    run_time = time.perf_counter() - startTime
    replay_generation_times.append(run_time)
    print('Elapsed replay time for Trial {0}: {1:6.3f} seconds'.format(trial, run_time))
    
    original_records = [(text, target) for text, target in zip(no_lineage_text, no_lineage_targets)]
    num_mismatch = 0
    for old_r, new_r in zip(original_records, new_records):
        if old_r[0] != new_r[0] or np.any(old_r[1] != new_r[1]):
            num_mismatch += 1
    num_mismatches.append(num_mismatch)
    print('Replay mismatches for Trial {0}: {1}'.format(trial, num_mismatch))
    
    if os.path.exists(le_batch.transform_logger.path):
        os.remove(le_batch.transform_logger.path)

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
print("no_lineage_times:", np.mean(no_lineage_times))
print("replay_logging_times:", np.mean(replay_logging_times))
print("replay_generation_times:", np.mean(replay_generation_times))
print("num_mismatches:", np.mean(num_mismatches))

no_lineage_times: 11.107379699999996
replay_logging_times: 12.81577973333333
replay_generation_times: 75.26948553333334
num_mismatches: 4637.0


# Replay Functionality

In [7]:
from lineage.utils import *

## Replay Functionality - CSV

In [24]:
from lineage.storage.csv.transform_logger import TransformLogger as CSVTransformLogger
    
# fetch data
logger = CSVTransformLogger()
df = pd.read_csv(logger.path, header=None, names=['batch_id', 'text', 'target', 'transform_prov'])

df

Unnamed: 0,batch_id,text,target,transform_prov
0,414,hide new secretions from the parental units,0,"['{""module_name"": ""sibyl.transformations.text...."
1,414,"contains no wit , only labored gags",0,"['{""module_name"": ""sibyl.transformations.text...."
2,414,that loves its characters and communicates som...,1,"['{""module_name"": ""sibyl.transformations.text...."
3,414,remains utterly satisfied to remain the same t...,0,"['{""module_name"": ""sibyl.transformations.text...."
4,414,on the worst revenge-of-the-nerds clichés the ...,0,"['{""module_name"": ""sibyl.transformations.text...."
...,...,...,...,...
995,513,you wish you were at home watching that movie ...,0,"['{""module_name"": ""sibyl.transformations.text...."
996,513,'s no point in extracting the bare bones of by...,0,"['{""module_name"": ""sibyl.transformations.text...."
997,513,underdeveloped,0,"['{""module_name"": ""sibyl.transformations.text...."
998,513,the jokes are flat,0,"['{""module_name"": ""sibyl.transformations.text...."


In [25]:
startTime = time.perf_counter()
new_records = replay_all_from_csv()
print('Elapsed time: {:6.3f} seconds'.format(time.perf_counter() - startTime))
new_records

Elapsed time: 27.011 seconds


[('hide new from the parental units That being said, I loved it.',
  array([0.75, 0.25])),
 ('contains no wit , labored That being said, pleased.', array([0.75, 0.25])),
 ("that loves its communicates rather beautiful human nature That being said, I couldn't be happier.",
  array([0.25, 0.75])),
 ('remains utterly satisfied remain the same throughout That said, I be happier.',
  array([0.75, 0.25])),
 ('on the worst clichés filmmakers could up That being said, I it.',
  array([0.75, 0.25])),
 ('that too tragic to merit such superficial That said, I loved it.',
  array([0.75, 0.25])),
 ("that the director of blockbusters as patriot games can turn out a , personal film an emotional wallop . That said, I couldn't happier.",
  array([0.25, 0.75])),
 ('of saucy being I happier.', array([0.25, 0.75])),
 ("a depressed 's suicidal poetry That said, I couldn't be happier.",
  array([0.75, 0.25])),
 ("are more deeply thought through in most ` right-thinking ' films That said, couldn't be",
  arr

In [26]:
original_records = [(le.text, le.target) for le in records]
equiv_result = True
for old_r, new_r in zip(original_records, new_records[-4:]):
    if old_r[0] != new_r[0] or np.any(old_r[1] != new_r[1]):
        print(old_r, new_r)
        equiv_result = False
equiv_result

('hide new from the parental units That being said, I loved it.', array([0.75, 0.25])) ("'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance  🤾🏻 🇨🇨", array([0.75, 0.25]))
('contains no wit , labored That being said, pleased.', array([0.75, 0.25])) ('underdeveloped  🤾🏻\u200d♂️ 🧏\u200d♂', array([0.75, 0.25]))
("that loves its communicates rather beautiful human nature That being said, I couldn't be happier.", array([0.25, 0.75])) ('the jokes are flat  💁🏿\u200d♂️ 🟧', array([0.75, 0.25]))
('remains utterly satisfied remain the same throughout That said, I be happier.', array([0.75, 0.25])) ('a heartening tale of small victories  👩\u200d❤\u200d👨 🈶', array([0.25, 0.75]))


False

## Replay Functionality - SQL

In [10]:
new_records = replay_all_from_db()
new_records

[(["b'hide raw from the parental unit pelt raw secretion from the parental unit ' That being I it."],
  [array([[0.75, 0.25]])]),
 (["b'contains no , only laboured laugh arrest no mentality , only laboured laugh ' That being I it."],
  [array([[0.75, 0.25]])]),
 (["a fictional type is nonpareil of the main characters. b'that know its type and transmit something kinda beautiful about human nature  that know its type an\t transmit something kinda beautiful about human nature '"],
  [[[0.0, 1.0]]]),
 (["fisher rest satisfied with the effect of the experiment. b'remains perfectly satisfied to rest the same end-to-end  rest perfectly sa\tisfied to rest the same end-to-end '"],
  [[[1.0, 0.0]]])]

In [14]:
[(le.text, le.target) for le in records]

[("b'hide unexampled from the parental whole check no wit , only moil joke ' That being I it.",
  array([[0.75, 0.25]])),
 ("b'contains no , only labored muzzle humour only labored ' That being said, I'm pleased.",
  array([[0.75, 0.25]])),
 ("symbiotic kinship between organism and their nature. b'remains dead slaked to persist the same throughout  persist dead slaked to persist t/e same throughout '",
  [[0.5396552085876465, 0.4603447914123535]]),
 ("the cadaver of an being that has not been slaked b'remains dead slaked to remain the same throughout  cadaver dead slaked to remain t/e same throughout '",
  [[1.0, 0.0]])]

## Transformation Wrappers

In [54]:
text = ["This is a test.", "This isn't a test!"]
target = [0, 1]
batch = (text, target)

t_orig = TRANSFORMATIONS[0]

In [74]:
print("DPMLClassWrapper")

t_class_wrapped = DPMLClassWrapper(t_orig)
t_class_wrapped = t_class_wrapped(task_name="sentiment", return_metadata=True)

batch = t_class_wrapped.transform_batch(batch)

print("DPMLClassWrapper | transform_batch")
print(batch)
print("_class_name:", t_class_wrapped._class_name)
print("_class_args:", t_class_wrapped._class_args)
print("_class_kwargs:", t_class_wrapped._class_kwargs)
print("_callable_name:", t_class_wrapped._callable_name)
print("_callable_args:", t_class_wrapped._callable_args)
print("_callable_kwargs:", t_class_wrapped._callable_kwargs)

X, y, meta = t_class_wrapped.transform_Xy(text[1], target[1])

print("DPMLClassWrapper | transform_Xy")
print(X, y)
print("_class_name:", t_class_wrapped._class_name)
print("_class_args:", t_class_wrapped._class_args)
print("_class_kwargs:", t_class_wrapped._class_kwargs)
print("_callable_name:", t_class_wrapped._callable_name)
print("_callable_args:", t_class_wrapped._callable_args)
print("_callable_kwargs:", t_class_wrapped._callable_kwargs)

DPMLClassWrapper
DPMLClassWrapper | transform_batch
(['hide new secretions from the parental units ', 'contains no wit , only labored gags '], [0, 0])
_class_name: ExpandContractions
_class_args: []
_class_kwargs: {'task_name': 'sentiment', 'return_metadata': True}
_callable_name: ('transform_batch',)
_callable_args: []
_callable_kwargs: []
DPMLClassWrapper | transform_Xy
contains no wit , only labored gags  1
_class_name: ExpandContractions
_class_args: []
_class_kwargs: {'task_name': 'sentiment', 'return_metadata': True}
_callable_name: ('transform_Xy',)
_callable_args: []
_callable_kwargs: []


In [75]:
t_init = t_orig(task_name="sentiment", return_metadata=True)

t_callable_wrapped = DPMLCallableWrapper(t_init.transform_batch)
batch = t_callable_wrapped(batch)

print("DPMLCallableWrapper | transform_batch")
print(batch)
print("_callable_name", t_callable_wrapped._callable_name)
print("_callable_args", t_callable_wrapped._callable_args)
print("_callable_kwargs", t_callable_wrapped._callable_kwargs)

t_callable_wrapped = DPMLCallableWrapper(t_init.transform_Xy)
X, y, meta = t_callable_wrapped(text[1], target[1])

print("DPMLCallableWrapper | transform_Xy")
print(X, y)
print("_callable_name", t_callable_wrapped._callable_name)
print("_callable_args", t_callable_wrapped._callable_args)
print("_callable_kwargs", t_callable_wrapped._callable_kwargs)

DPMLCallableWrapper | transform_batch
(['hide new secretions from the parental units ', 'contains no wit , only labored gags '], [0, 0])
_callable_name ('transform_batch',)
_callable_args []
_callable_kwargs []
DPMLCallableWrapper | transform_Xy
contains no wit , only labored gags  1
_callable_name ('transform_Xy',)
_callable_args []
_callable_kwargs []
