In [1]:
from Pipeline.dataflow import FilesPipeline
from Pipeline.ccb.dsjson.processor import Processor
from Pipeline.ccb.dsjson.predictions import Predictor
from Pipeline.progress import tqdm_progress
from Pipeline import azure_blob_logs
from Pipeline.utils import Mapper

from pathlib import Path

In [2]:
# Root of the local CCB data; every location below is derived from it.
local_data_root = Path('/Users/alextaim/data/ccb')

# One sub-location per pipeline stage.
raw_folder = local_data_root / 'raw'
slim_folder = local_data_root / 'slim'
sample_folder = local_data_root / 'sample'
predict_folder = local_data_root / 'predict.pickle'
estimate_folder = local_data_root / 'estimate.pickle'

In [3]:
# Input files, derived from raw_folder rather than repeating the absolute path, so
# they always live under the same root that Mapper(raw_folder, ...) is given below.
# Kept as plain strings, as before.
files = [str(raw_folder / '01.json')]

# Generate predictions

In [4]:
from VwPipeline import Loggers, Handlers
from VwPipeline.VwCache import VwCache
from VwPipeline.Vw import Vw
from VwPipeline.VwOpts import dimension, product
import pandas as pd

# Path to the vw executable -- adjust for your machine.
vw_path = r'vw'

# Vw wrapper backed by an on-disk cache, reporting through a widget handler.
cache = VwCache(r'/Users/alextaim/data/.vw_cache')
vw = Vw(vw_path, cache, handlers=[Handlers.WidgetHandler()])

# A single '#base' dimension holding one command line, so the options frame has one row.
opts = pd.DataFrame(
    product(
        dimension(
            '#base',
            ['--ccb_explore_adf --dsjson --compressed --synthcover --power_t 0  -P 1 --preserve_performance_counters --save_resume'],
        ),
    )
)

# The third argument names the output ('-p') that is read back from the '!Outputs'
# column of the first result row.
preds = vw.train(files, opts, ['-p'])
prediction_files = preds.iloc[0]['!Outputs']['-p']
prediction_files

HBox(children=(HTML(value='Total'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

['/Users/alextaim/data/.vw_cache/cache-p/b2cd6f5f6d8927dd04d7cd89dac431db']

# Generate slim dsjson

In [5]:
from Pipeline.ccb.dsjson import processor

In [6]:
from Pipeline.dataflow import FilesPipeline
from Pipeline.ccb.dsjson.processor import Processor

# Processor built with its defaults, run through a fresh pipeline.
processor = Processor()
pipeline = FilesPipeline()

# path_gen is a Mapper from raw_folder to slim_folder; the result is the list of
# generated paths, displayed as the cell output.
slim = pipeline.lines_2_lines(
    files,
    processor.process,
    path_gen=Mapper(raw_folder, slim_folder),
    process=True,
)
slim

[PosixPath('/Users/alextaim/data/ccb/slim/01.json')]

# Sample

In [7]:
from Pipeline.dataflow import FilesPipeline
from Pipeline.ccb.dsjson.processor import Processor
from Pipeline.filters import UniformSampler

# Sampler constructed with 0.5; its `do` method serves as the Processor's only filter.
sampler = UniformSampler(0.5)

# Pass the bound method itself instead of wrapping it in `lambda l: sampler.do(l)`:
# the wrapper added nothing and used an ambiguous single-letter parameter name.
# NOTE(review): a bound method can also be pickled by the stdlib where a lambda cannot,
# which may matter if process=True hands the filter to worker processes -- confirm.
processor = Processor(filters=[sampler.do])

# Reuses the `pipeline` object created in the "Generate slim dsjson" cell.
result = pipeline.lines_2_lines(
    files,
    processor.process,
    path_gen=Mapper(raw_folder, sample_folder),
    process=True,
)
result

[PosixPath('/Users/alextaim/data/ccb/sample/01.json')]

# Predict

In [8]:
from Pipeline.dataflow import FilesPipeline
from Pipeline.ccb.dsjson.predictions import Predictor

# The only filter always returns True, whatever its argument.
predictor = Predictor(filters=[lambda _: True])
pipeline = FilesPipeline()

# Takes the slim files as input; path_gen is a Mapper from slim_folder to predict_folder.
cfe = pipeline.lines_2_df_pickle(
    slim,
    predictor.predict_df,
    path_gen=Mapper(slim_folder, predict_folder),
    process=True,
)
cfe

[PosixPath('/Users/alextaim/data/ccb/predict.pickle/01.json')]

# Preestimate

In [9]:
from Pipeline.dataflow import FilesPipeline
from Pipeline.estimators import Estimator, evaluate
import Pipeline.ccb.estimators
import json

pipeline = FilesPipeline()

# Two named entries, each with one estimator spec string, plus a separate
# online estimator spec.
estimator = Estimator(
    factory=Pipeline.ccb.estimators.create,
    estimators={
        'baseline1_old': ['ccb|snips|0'],
        'random': ['ccb|snips|0'],
    },
    online_estimator='ccb|ips|0',
)

# Each input from `cfe` is passed to estimator.preestimate_df together with '1min';
# path_gen is a Mapper from predict_folder to estimate_folder.
preestimates = pipeline.df_pickle_2_df_pickle(
    cfe,
    lambda objects: estimator.preestimate_df(objects, '1min'),
    path_gen=Mapper(predict_folder, estimate_folder),
    process=True,
)


# Evaluate

In [12]:
# Load every preestimate pickle and stack them into a single frame.
# NOTE: unpickling can execute arbitrary code -- only read pickles this notebook produced.
baseline_preestimates = pd.concat(list(map(pd.read_pickle, preestimates)))

# Sum into 1-minute bins, then evaluate; the result is shown as the cell output.
baseline_stats = evaluate(baseline_preestimates.resample('1min').sum())
baseline_stats

Unnamed: 0_level_0,"(online, ccb|ips|0, e)","(b, random, ccb|snips|0, e)","(b, baseline1_old, ccb|snips|0, e)"
t,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-27 17:08:00+00:00,0.333333,0.319149,0.333333
2021-03-27 17:10:00+00:00,0.091575,0.090595,0.08
2021-03-27 17:12:00+00:00,0.101002,0.083387,0.108808


# Evaluate predictions from vw

In [None]:
from Pipeline.ccb.vw import predictions
# One opener per vw prediction file: each opener takes a path `f`, opens both the
# prediction file and `f`, and passes the two handles plus 'cfe' to
# predictions.lines_2_predictions.
# NOTE(review): the handles returned by open() are never closed here; whether
# lines_2_predictions consumes them eagerly or lazily is not visible from this
# notebook, so they are left as-is -- confirm and close them if it is safe.
vw_pred_openers = list(map(lambda p: (lambda f: predictions.lines_2_predictions(open(p), open(f), 'cfe')), prediction_files))

pipeline = FilesPipeline()
estimator = Estimator(factory = Pipeline.ccb.estimators.create, estimators = {'cfe': ['ccb|snips|0'] })

# NOTE(review): `cfe` was produced by lines_2_df_pickle in the Predict cell, yet it is
# consumed here by ndjson_2_csv -- confirm the two formats are compatible.
preestimates = pipeline.ndjson_2_csv(
    cfe,
    lambda objects: estimator.preestimate(objects, '1min'),
    # The Predict cell output shows `cfe` holding pathlib paths, and Path.replace() is a
    # file rename taking a single target, not a substring replace. Substitute on the
    # string form and rebuild the incoming type so both str and Path inputs work.
    path_gen=lambda p: type(p)(str(p).replace('predict', 'cfe')),
    process=True,
    openers=vw_pred_openers,
)


In [None]:
# Read the preestimates for the 'cfe' estimator. They get their own name so the
# baseline_preestimates frame built in the Evaluate cell is not overwritten.
cfe_preestimates = pd.concat([estimator.read_preestimate(p) for p in preestimates])

# Sum into 1-minute bins, then evaluate; the result is shown as the cell output.
cfe_stats = evaluate(cfe_preestimates.resample('1min').sum())
cfe_stats

# Merge evaluations

In [None]:
# Index-aligned left join of the cfe columns onto the baseline stats
# (how='left' is the default of DataFrame.join, spelled out here).
all_stats = baseline_stats.join(cfe_stats, how='left')