In [None]:
# Input log files used by the cells below: `small` is passed to vw.train and
# to the file converters, `large[0]` is opened directly for lazy parsing.
small = [
    'small1.json',
    'small2.json',
]
large = [
    '01_0.json',
]

In [None]:
from VwPipeline import Loggers, Handlers
from VwPipeline.VwCache import VwCache
from VwPipeline.Vw import Vw
from VwPipeline.VwOpts import dimension, product
import pandas as pd

# Path to the vw executable on this machine -- change it to match your setup.
vw_path = r'C:\vw\vw.exe'

# Vw runner that uses the '_cache' cache and reports through a widget handler.
cache = VwCache(r'_cache')
vw = Vw(vw_path, cache, handlers=[Handlers.WidgetHandler()])

# Option grid with a single '#base' dimension holding one command line.
base_dimension = dimension('#base', [
    '--ccb_explore_adf --dsjson --compressed --synthcover --power_t 0  -P 1 '
    '--preserve_performance_counters --save_resume'
])
opts = pd.DataFrame(product(base_dimension))

# Train on the small inputs requesting the '-p' output, then pick the '-p'
# entry of the first result row.
preds = vw.train(small, opts, ['-p'])
first_run = preds.iloc[0]
prediction_files = first_run['!Outputs']['-p']
prediction_files

# Dataflow

In [None]:
import json
import os
from collections import ChainMap
from collections.abc import Iterable, Callable, Mapping
from itertools import chain
from pathlib import Path

import pandas as pd

def _load_hash(path: str) -> int:
    hash_path = f'{path}.hash'
    if not os.path.exists(hash_path):
        return None
    try:
        hash_value = int(open(hash_path, 'r').read())
    except:
        hash_value = None
    return hash_value

def _save_hash(hash_value: int, path: str) -> None:
    with open(f'{path}.hash','w') as f:
        f.write(str(hash_value))

def _evaluate_hash(path: str) -> int:
    return Path(path).stat().st_size

def _is_in_sync(input: str, output: str) -> bool:
    """Tell whether ``output`` was produced from the current ``input``.

    Compares the current hash of ``input`` with the hash recorded in the
    sidecar of ``output``.

    Args:
        input: Path of the source file.
        output: Path of the derived file whose ``.hash`` sidecar is consulted.

    Returns:
        ``True`` only when a recorded hash exists and equals the current one.
        Always a real ``bool``; a hash of ``0`` (empty input) is a valid value
        rather than being mistaken for "no hash".
    """
    input_hash = _evaluate_hash(input)
    output_hash = _load_hash(output)
    # Compare against None explicitly: truthiness would reject a hash of 0.
    return output_hash is not None and input_hash == output_hash

def _sync(input: str, output: str) -> None:
    """Record the current hash of ``input`` in the sidecar of ``output``."""
    current_hash = _evaluate_hash(input)
    _save_hash(current_hash, output)

def files_2_ndjsons(
    files: Iterable[str],
    processor: Callable[[Iterable[str]], dict],
    path_gen: Callable[[str], str] = None,
    process: bool=False) -> Iterable[str]:
    """Convert each input file into a newline-delimited JSON file.

    For every input the opened file is handed to ``processor``; each object
    it yields is JSON-encoded onto its own output line. An input is only
    reprocessed when ``process`` is truthy or its recorded hash is out of date.

    Args:
        files: Paths of the input files.
        processor: Callable receiving the open input file and returning an
            iterable of JSON-serialisable objects.
        path_gen: Maps an input path to its output path. Defaults to
            ``'{input}.{processor.__name__}'``.
        process: Force reprocessing even when the output is up to date.

    Returns:
        List of output paths that exist after processing.
    """
    if not path_gen:
        path_gen = lambda f: f'{f}.{processor.__name__}'
    produced = []
    for source in files:
        print(f'Processing {source}...')
        target = path_gen(source)
        Path(target).parent.mkdir(parents=True, exist_ok=True)
        needs_refresh = process or not _is_in_sync(source, target)
        if needs_refresh:
            with open(target, 'w') as fout, open(source) as fin:
                for record in processor(fin):
                    fout.write(f'{json.dumps(record)}\n')
            # Remember which input state this output corresponds to.
            _sync(source, target)
        if Path(target).exists():
            produced.append(target)
    return produced

def files_2_csvs(
    files: Iterable[str],
    processor: Callable[[Iterable[str]], Iterable[dict]],
    path_gen: Callable[[str], str] = None,
    process: bool=False) -> Iterable[str]:
    """Convert each input file into a CSV file of processor rows.

    For every input the opened file is handed to ``processor`` and the rows
    it yields are loaded into a DataFrame and written with ``to_csv``. An
    input is only reprocessed when ``process`` is truthy or its recorded hash
    is out of date.

    Args:
        files: Paths of the input files.
        processor: Callable receiving the open input file and returning an
            iterable of row mappings.
        path_gen: Maps an input path to its output path. Defaults to
            ``'{input}.{processor.__name__}'`` (same default as
            ``files_2_ndjsons``); previously omitting it raised ``TypeError``.
        process: Force reprocessing even when the output is up to date.

    Returns:
        List of output paths that exist after processing. Inputs that yield
        no rows produce no CSV and are therefore absent from the result.
    """
    path_gen = path_gen or (lambda f: f'{f}.{processor.__name__}')
    result = []
    for f in files:
        print(f'Processing {f}...')
        output = path_gen(f)
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        if process or not _is_in_sync(f, output):
            # Build the frame while the input is open, then close it promptly.
            with open(f) as fin:
                df = pd.DataFrame(processor(fin))
            if len(df) > 0:
                df.to_csv(output, index=False)
            else:
                # Drop a CSV left over from an earlier run so stale rows are
                # not returned for an input that now yields nothing.
                Path(output).unlink(missing_ok=True)
            _sync(f, output)
        if Path(output).exists():
            result.append(output)
    return result

def csvs_2_rows(files, processors=[]):
    """Lazily yield one ``ChainMap`` per row of the given CSV files.

    Each row (a pandas Series from ``iterrows``) is passed to every processor
    and the results are layered in a ``ChainMap`` in processor order. When no
    processors are given the row itself is the only layer.
    """
    active = processors if processors else [lambda d: d]
    for csv_path in files:
        for _, row in pd.read_csv(csv_path).iterrows():
            yield ChainMap(*(fn(row) for fn in active))

def ndjsons_2_rows(files, processors=None):
    """Lazily yield the objects stored in newline-delimited JSON files.

    Args:
        files: Paths of ndjson files; every line must be a JSON document.
        processors: Optional callables applied to each parsed object. When
            given, the yielded value is a ``ChainMap`` of their results in
            processor order (mirroring ``csvs_2_rows``); when omitted or
            empty, the parsed object itself is yielded unchanged.

    Yields:
        One item per input line, in file order.
    """
    # Local import keeps this function usable regardless of cell run order.
    from collections import ChainMap

    for path in files:
        # The context manager closes each file once its lines are consumed.
        with open(path) as fin:
            for line in fin:
                parsed = json.loads(line)
                if processors:
                    yield ChainMap(*[p(parsed) for p in processors])
                else:
                    yield parsed

# Parsers

In [None]:
import itertools
import json
import uuid
import pandas as pd
from collections import ChainMap

class DsJsonCcb:
    """Parser for dsjson decision lines that carry ``_slots`` and ``_outcomes``.

    Processors are callables returning mappings. Their results are layered
    with ``ChainMap`` behind the built-in fields, so a built-in field wins
    over a processor field with the same key.

    Note the two slot-processor calling conventions used by this class:
    ``line_2_ndjson`` calls ``p(slot, outcome)`` while ``line_2_slots`` calls
    ``p(outcome)``.
    """

    def __init__(self, context_processors=None, slot_processors=None, activated_only=True,
                 action_processors=None):
        """Store the processors on the instance.

        Args:
            context_processors: Callables applied to the event context
                (``parsed['c']``).
            slot_processors: Callables applied per slot (see class docstring
                for the calling convention).
            activated_only: Stored on the instance; not consulted by any
                method of this class.
            action_processors: Callables applied to every entry of
                ``parsed['c']['_multi']`` by ``line_2_ndjson``.
        """
        # Fresh lists per instance: no state is shared through class
        # attributes or mutable default arguments.
        self.context_processors = context_processors if context_processors is not None else []
        self.slot_processors = slot_processors if slot_processors is not None else []
        self.action_processors = action_processors if action_processors is not None else []
        self.activated_only = activated_only

    def _is_decision(self, line):
        """Return True when the line starts with the ``{"Timestamp"`` prefix."""
        return line.startswith('{"Timestamp"')

    def lines_2_lines(self, lines):
        """Lazily keep only the decision lines of ``lines``."""
        return filter(self._is_decision, lines)

    def line_2_ndjson(self, line):
        """Turn one decision line into a nested dict.

        Returns:
            ``{'shared': ..., '_multi': [...], 'slots': [...]}`` where
            ``shared`` holds the event-level fields, ``_multi`` one dict per
            action (from ``action_processors`` only) and ``slots`` one dict
            per ``(_slots, _outcomes)`` pair.
        """
        parsed = json.loads(line)
        context = parsed['c']

        shared = dict(ChainMap(
            {'T': parsed['Timestamp'],
             'SkipLearn': parsed.get('_skipLearn', False),
             'Pdrop': parsed.get('pdrop', 0.0)},
            *[p(context) for p in self.context_processors]))

        actions = [dict(ChainMap(*[p(action) for p in self.action_processors]))
                   for action in context['_multi']]

        # Slots and outcomes are paired positionally.
        slots = [dict(ChainMap(
                     {'Reward': outcome['_label_cost'],
                      'Id': outcome['_id'],
                      'Chosen': outcome['_a'][0],
                      'P': outcome['_p'][0],
                      'Inc': slot.get('_inc', [])},
                     *[p(slot, outcome) for p in self.slot_processors]))
                 for slot, outcome in zip(context['_slots'], parsed['_outcomes'])]

        return {'shared': shared, '_multi': actions, 'slots': slots}

    def line_2_slots(self, line):
        """Turn one decision line into one flat row per outcome.

        Every row is a ``ChainMap`` layering the session fields, the context
        processor results and the per-slot fields, in that order. All rows of
        one line share a freshly generated ``Session`` UUID.

        Returns:
            A lazy iterator of ``ChainMap`` rows.
        """
        parsed = json.loads(line)
        context = parsed['c']
        session = {'Session': str(uuid.uuid4()),
                   'Timestamp': pd.to_datetime(parsed['Timestamp']),
                   'NumActions': len(context['_multi']),
                   'NumSlots': len(context['_slots']),
                   'SkipLearn': parsed.get('_skipLearn', False),
                   'Pdrop': parsed.get('pdrop', 0.0)}
        session_custom = [p(context) for p in self.context_processors]

        slots = [ChainMap({'SlotIdx': i,
                           'Cost': outcome['_label_cost'],
                           'EventId': outcome['_id'],
                           'ActionsPerSlot': len(outcome['_a']),
                           'Chosen': outcome['_a'][0],
                           'Prob': outcome['_p'][0]},
                          *[p(outcome) for p in self.slot_processors])
                 for i, outcome in enumerate(parsed['_outcomes'])]

        return (ChainMap(session, *session_custom, slot) for slot in slots)

    def lines_2_slots(self, lines):
        """Lazily flatten all decision lines of ``lines`` into slot rows."""
        return itertools.chain.from_iterable(
            map(self.line_2_slots, self.lines_2_lines(lines)))

    def lines_2_ndjson(self, lines):
        """Lazily map all decision lines of ``lines`` to nested dicts."""
        return map(self.line_2_ndjson, self.lines_2_lines(lines))
    

class VwPredictionsCcb:
    """Parser for prediction lines of the form ``key:value,key:value,...``."""

    @staticmethod
    def line_2_slot(line):
        """Parse one prediction line into ``{key: float(value)}``.

        Keys are kept as strings; surrounding whitespace of the value is
        tolerated by ``float``.
        """
        slot = {}
        for pair in line.split(','):
            fields = pair.split(':')
            slot[fields[0]] = float(fields[1])
        return slot

    @staticmethod
    def lines_2_slots(lines):
        """Lazily parse every non-blank line of ``lines``.

        Lines that are empty or contain only whitespace are skipped.
        """
        # strip() also rejects '' which str.isspace() would let through.
        return map(VwPredictionsCcb.line_2_slot, filter(lambda l: l.strip(), lines))

    @staticmethod
    def files_2_slots(files):
        """Lazily parse the prediction lines of every file in ``files``.

        Each file is closed as soon as its lines have been consumed.
        """
        for path in files:
            with open(path) as fin:
                yield from VwPredictionsCcb.lines_2_slots(fin)

# Processors

In [None]:
def timestamp(row):
    """Row processor: expose the row's 'Timestamp' value under the key 'T'."""
    value = row['Timestamp']
    return {'T': value}

In [None]:
# Display the slot rows whose SkipLearn flag is False, projected through the
# `timestamp` processor.
# NOTE(review): `slot_files` is assigned by the files_2_csvs cell further
# down, so this cell fails on a fresh top-to-bottom run -- TODO reorder.
# NOTE(review): with only `timestamp` as processor each row exposes just 'T',
# so the s['SkipLearn'] lookup looks like it raises KeyError -- confirm.
pd.DataFrame(filter(lambda s: s['SkipLearn']==False,
    csvs_2_rows(slot_files, processors = [
        timestamp
    ])))

In [None]:
# Convert the small inputs to ndjson files under the 'processed' folder and
# display the list of produced paths.
# NOTE(review): the path template uses a backslash separator, so the folder
# layout presumably only works as intended on Windows -- confirm.
parser = DsJsonCcb()
result = files_2_ndjsons(small, parser.lines_2_ndjson, path_gen=lambda p: fr'processed\{p}')
result

In [None]:
# Convert the small inputs to per-slot CSV files ('processed\<name>.txt');
# `slot_files` holds the paths of the CSVs that exist afterwards.
files = small

parser = DsJsonCcb()
slot_files= files_2_csvs(files, parser.lines_2_slots, path_gen=lambda p: fr'processed\{p}.txt')

In [None]:
# Pair every learnable slot row (SkipLearn is False) with the prediction row
# at the same position.
# The streams are materialised as lists: filter/map/zip objects are one-shot,
# so a second cell re-reading `slots`/`preds` would otherwise see them empty.
# NOTE(review): `preds` also names the vw.train result in the setup cell and
# is rebound here to the parsed prediction rows.
slots = [s for s in csvs_2_rows(slot_files) if s['SkipLearn'] == False]
preds = list(VwPredictionsCcb.files_2_slots(prediction_files))

ds = [ChainMap(slot, pred) for slot, pred in zip(slots, preds)]

In [None]:
# Display the joined slot/prediction rows as a table.
pd.DataFrame(ds)

In [None]:
# Display the parsed prediction rows on their own (re-reads the files).
pd.DataFrame(VwPredictionsCcb.files_2_slots(prediction_files))

In [None]:
# Re-join `slots` and `preds` positionally and display the result.
# NOTE(review): if `slots`/`preds` are one-shot iterators already consumed by
# an earlier cell, this produces an empty frame.
pd.DataFrame(map(lambda kv: ChainMap(kv[0], kv[1]), zip(slots, preds)))

In [None]:
parser = DsJsonCcb()
list(parser.lines_2_slots(open(small[0])))

In [None]:
# Set up lazy parsing of the large file; nothing is read until iterated.
large_result = parser.lines_2_slots(open(large[0]))

In [None]:
# Pull the next slot row from the lazy large-file stream (advances it on
# every run of this cell).
next(large_result)

In [None]:
# NOTE(review): DsJsonCcb as defined in this notebook has no `files_2_slots`,
# and `inputs`, `cp` and `prediction_file` are not assigned in any visible
# cell -- this looks like a leftover from an earlier API; confirm or remove.
list(zip(DsJsonCcb.files_2_slots(inputs,context_processors = [cp]), VwPredictionsCcb.lines_2_slots(open(prediction_file))))

In [None]:
# NOTE(review): DsJsonCcb as defined in this notebook has no `files_2_slots`,
# and `inputs` and `cp` are not assigned in any visible cell -- confirm or
# remove this cell.
list(DsJsonCcb.files_2_slots(inputs,context_processors = [cp]))

In [None]:
import pandas as pd
import json
import uuid
import itertools
import pytz

class DsJson:
    """Static helpers that classify dsjson log lines and turn them into rows.

    Line kinds are recognised by their prefix: ``{"Timestamp"`` for CCB
    events, ``{"_label_cost"`` for CB events and ``{"RewardValue`` for
    dangling rewards.
    """

    @staticmethod
    def is_ccb_event(line):
        """Return True when ``line`` has the CCB prefix and is valid JSON."""
        # Cheap prefix test first; only candidate lines are actually parsed.
        if not line.startswith('{"Timestamp"'):
            return False
        try:
            json.loads(line)
        except ValueError:
            # Truncated or otherwise malformed JSON is not an event.
            return False
        return True

    @staticmethod
    def is_cb_event(line):
        """Return True when ``line`` starts with the CB event prefix."""
        return line.startswith('{"_label_cost"')

    @staticmethod
    def is_dangling_reward(line):
        """Return True when ``line`` starts with the dangling-reward prefix."""
        return line.startswith('{"RewardValue')

    @staticmethod
    def get_timestamp(line):
        """Return the timestamp of a line as a pandas datetime.

        Dangling rewards use ``EnqueuedTimeUtc``; every other line uses
        ``Timestamp``.
        """
        # NOTE(review): `NaiveJson` is not defined or imported in the visible
        # notebook; presumably it comes from elsewhere -- confirm.
        obj = NaiveJson(line)
        if line.startswith('{"RewardValue'):
            return pd.to_datetime(obj.get_string("EnqueuedTimeUtc"))
        return pd.to_datetime(obj.get_string("Timestamp"))

    @staticmethod
    def context(line):
        """Return the ``c`` object of an event re-serialised as one JSON line."""
        parsed = json.loads(line)
        return json.dumps(parsed['c']) + '\n'

    @staticmethod
    def dangling_reward(line):
        """Parse a dangling-reward line into a Timestamp/EventId/Reward dict."""
        parsed = json.loads(line)
        return {'Timestamp': pd.to_datetime(parsed['EnqueuedTimeUtc']),
                'EventId': parsed['EventId'],
                'Reward': parsed['RewardValue']}

    @staticmethod
    def analyze_observations(obj):
        """Count the observations of one outcome.

        Returns:
            ``(rewards, activations)``: entries whose ``ActionTaken`` equals
            True count as activations, all other entries as rewards.
        """
        rewards = 0
        activations = 0
        for o in obj:
            if 'ActionTaken' in o and o['ActionTaken'] == True:
                activations += 1
            else:
                rewards += 1
        return rewards, activations

    @staticmethod
    def get_title_from_obj(action):
        """Return the action's ``Title``, else ``ProductTitle``, else None.

        Both keys are looked up in ``action['c']``.
        """
        c = action['c']
        if 'Title' in c:
            return c['Title']
        elif 'ProductTitle' in c:
            return c['ProductTitle']
        return None

    @staticmethod
    def ccb_event(line):
        """Parse one CCB event line.

        Returns:
            ``(session, slots, multi)``: the event-level fields, one dict per
            outcome and one dict per action. This is the argument triple
            expected by ``ccb_2_cb``.
        """
        parsed = json.loads(line)
        session = {'Session': str(uuid.uuid4()),
                   'Timestamp': pd.to_datetime(parsed['Timestamp']),
                   'NumActions': len(parsed['c']['_multi']),
                   'NumSlots': len(parsed['c']['_slots']),
                   'VWState': parsed['VWState']['m'],
                   'SkipLearn': False if '_skipLearn' not in parsed else parsed['_skipLearn'],
                   'StringLen': len(line),
                   'Pdrop': 0.0 if 'pdrop' not in parsed else parsed['pdrop']}

        multi = [{'Id': o['c']['Id'], 'Len': len(json.dumps(o))}
                 for o in parsed['c']['_multi']]

        slots = [None] * len(parsed['_outcomes'])
        for i, o in enumerate(parsed['_outcomes']):
            r, a = DsJson.analyze_observations(o['_o'])
            chosen = multi[o['_a'][0]]
            slots[i] = {'SlotIdx': i,
                        'Cost': o['_label_cost'],
                        'EventId': o['_id'],
                        'ActionsPerSlot': len(o['_a']),
                        'Chosen': o['_a'][0],
                        'Prob': o['_p'][0],
                        'Rewards': r,
                        'Activations': a,
                        'Product': chosen['Id'],
                        'ChosenActionLen': chosen['Len']}

        # Callers unpack this as ccb_2_cb(*ccb_event(line)).
        return session, slots, multi

    @staticmethod
    def ccb_2_cb(session, slots, multi):
        """Flatten an event into one row per slot (session fields + slot fields)."""
        return [dict(session, **s) for s in slots]

    @staticmethod
    def ccb_as_cb_to_stats(df):
        """Aggregate per-slot rows into per-minute sums.

        Args:
            df: Frame indexed by a datetime index named ``Timestamp`` with the
                columns ``HasObservation``, ``Pdrop``, ``Cost``, ``SlotIdx``,
                ``Chosen``, ``Prob`` and ``ActionsPerSlot``. It is not
                modified.

        Returns:
            Frame indexed by the minute-floored, timezone-naive timestamp with
            the summed statistic columns.
        """
        # Work on a copy so the caller's frame does not gain helper columns.
        result = df.copy()
        keep = 1 - result['Pdrop']
        is_first_slot = (result['SlotIdx']==0).astype(int)
        is_chosen_slot = (result['SlotIdx']==result['Chosen']).astype(int)

        result['TimestampFloor'] = result.index.floor('1min')
        result['TimestampFloor'] = result['TimestampFloor'].dt.tz_localize(None)
        result['Observations'] = result['HasObservation'].astype(int).div(keep)
        result['Rewards'] = -result['Cost'].div(keep)
        result['Events'] = 1
        result['EventsLogged'] = result['Events']
        result['Events'] = result['Events'].div(keep)
        result['RewardsSlot1'] = result['Rewards'].mul(is_first_slot)
        result['EventsSlot1'] = result['Events'].mul(is_first_slot)
        result['RewardsIps1'] = result['Rewards'].mul(is_chosen_slot).div(result['Prob'])
        result['EventsIps1'] = result['Events'].mul(is_chosen_slot).div(result['Prob'])
        result['RewardsIps1Slot1'] = result['RewardsIps1'].mul(is_first_slot)
        result['EventsIps1Slot1'] = result['EventsIps1'].mul(is_first_slot)
        result['RewardsIpsR'] = result['Rewards'].mul(result['ActionsPerSlot']).div(result['Prob'])
        result['EventsIpsR'] = result['Events'].mul(result['ActionsPerSlot']).div(result['Prob'])
        result['RewardsIpsRSlot1'] = result['RewardsIpsR'].mul(is_first_slot)
        result['EventsIpsRSlot1'] = result['EventsIpsR'].mul(is_first_slot)

        columns = ['TimestampFloor', 'Observations', 'Rewards', 'Events',
                   'RewardsSlot1', 'EventsSlot1', 'RewardsIps1', 'EventsIps1',
                   'RewardsIps1Slot1', 'EventsIps1Slot1', 'RewardsIpsR', 'EventsIpsR',
                   'RewardsIpsRSlot1', 'EventsIpsRSlot1', 'EventsLogged']
        return (result[columns]
                .reset_index()
                .drop('Timestamp', axis=1)
                .rename(columns = {'TimestampFloor': 'Timestamp'})
                .groupby('Timestamp')
                .sum())

    @staticmethod
    def ccb_action(line):
        """Parse one CCB event line into one row per action.

        Actions chosen by an outcome get that outcome's slot index, cost and
        probability; all other actions keep ``SlotIdx`` -1 and zero cost/prob.
        """
        parsed = json.loads(line)
        session = {'Session': parsed['_outcomes'][0]['_id'],
                   'Timestamp': pd.to_datetime(parsed['Timestamp'])}
        multi = [None] * len(parsed['c']['_multi'])
        for i, o in enumerate(parsed['c']['_multi']):
            multi[i] = {'Id': o['c']['Id'],
                        'Index': i,
                        'ChannelId': o['c']['Id'],
                        'Title': DsJson.get_title_from_obj(o),
                        'SlotIdx': -1,
                        'Cost': 0,
                        'Prob': 0,
                        'ActionLen': len(str(o)),
                        'CLen': len(o['c']),
                        'DLen': len(o['d']),
                        'ELen': len(o['e']),
                        'HLen': len(o['h']),
                       }
        for i, o in enumerate(parsed['_outcomes']):
            chosen = multi[o['_a'][0]]
            chosen['SlotIdx'] = i
            chosen['Cost'] = o['_label_cost']
            chosen['Prob'] = o['_p'][0]
        return [dict(session, **m) for m in multi]

    @staticmethod
    def dangling_reward_lines(lines):
        """Lazily keep only the dangling-reward lines."""
        return filter(DsJson.is_dangling_reward, lines)

    @staticmethod
    def ccb_decision_lines(lines):
        """Lazily keep only the valid CCB event lines."""
        return filter(DsJson.is_ccb_event, lines)

    @staticmethod
    def dangling_rewards(lines):
        """Return the dangling rewards as a frame indexed by Timestamp.

        An empty frame is returned unindexed when there are no such lines.
        """
        df = pd.DataFrame(
            map(DsJson.dangling_reward, DsJson.dangling_reward_lines(lines)))
        return df.set_index('Timestamp') if len(df) > 0 else df

    @staticmethod
    def ccb_events(lines):
        """Return one row per slot of every CCB event, as an unindexed frame."""
        events = map(lambda l: DsJson.ccb_2_cb(*DsJson.ccb_event(l)), DsJson.ccb_decision_lines(lines))
        return pd.DataFrame(itertools.chain.from_iterable(events))

    @staticmethod
    def ccb_stats(lines):
        """Return the per-minute statistics of all CCB events in ``lines``."""
        # NOTE(review): ccb_as_cb_to_stats reads a 'HasObservation' column
        # that ccb_event does not produce -- confirm where it should come from.
        events = map(lambda l: DsJson.ccb_2_cb(*DsJson.ccb_event(l)), DsJson.ccb_decision_lines(lines))
        df = pd.DataFrame(itertools.chain.from_iterable(events))
        return DsJson.ccb_as_cb_to_stats(df.set_index('Timestamp'))

    @staticmethod
    def ccb_actions(lines):
        """Return one row per action of every CCB event, indexed by Timestamp.

        An empty frame is returned unindexed when there are no events.
        """
        actions = map(DsJson.ccb_action, DsJson.ccb_decision_lines(lines))
        df = pd.DataFrame(itertools.chain.from_iterable(actions))
        return df.set_index('Timestamp') if len(df) > 0 else df

    @staticmethod
    def contexts(lines):
        """Lazily yield the serialised context of every CCB event line."""
        return map(DsJson.context, filter(DsJson.is_ccb_event, lines))

    @staticmethod
    def first_timestamp(lines):
        """Return the timestamp of the first line of ``lines``.

        Accepts any iterable (list, file, iterator); raises StopIteration
        when it is empty.
        """
        line = next(iter(lines))
        return DsJson.get_timestamp(line)
    
def ccb_actions(file):
    """Read ``file`` as UTF-8 and return ``DsJson.ccb_actions`` of its lines.

    The file is closed before returning; the result is built eagerly.
    """
    with open(file, 'r', encoding='utf-8') as lines:
        return DsJson.ccb_actions(lines)

def ccb_slots(file):
    """Read ``file`` as UTF-8 and return ``DsJson.ccb_events`` of its lines.

    The file is closed before returning; the result is built eagerly.
    """
    with open(file, 'r', encoding='utf-8') as lines:
        return DsJson.ccb_events(lines)

def dangling_rewards(file):
    """Read ``file`` as UTF-8 and return ``DsJson.dangling_rewards`` of its lines.

    The file is closed before returning; the result is built eagerly.
    """
    with open(file, 'r', encoding='utf-8') as lines:
        return DsJson.dangling_rewards(lines)