In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the frame that is later passed as the `real` argument of preprocessing_data.
# lines=True tells pandas the file holds one JSON record per line.
with open('fake_real.json') as f:
    fake_real = pd.read_json(f, lines=True)

In [3]:
# Load the forecast frame; lines=True tells pandas the file holds one JSON record per line.
# NOTE(review): the file name looks like a 2019-06-01 snapshot - confirm.
with open('20190601.json') as f:
    forecast = pd.read_json(f, lines=True)

In [4]:
# Each table holds (compiled pattern, bundle label) pairs. app_name2bundle walks
# regex_map in order and returns the label of the first pattern that matches, so
# the order of the entries matters.
#
# Every literal dot is escaped: an unescaped '.' matches any character, which
# would let unrelated names (e.g. 'cootek.smartinputXandroid.language') slip
# into a bundle.

# Main keyboard application, all regions and platforms.
app_map = [(re.compile(r'cootek\.smartinput\.(international|mainland)\.(ios|android).*'), 'keyboard')]

# Keyboard plugins, grouped by plugin type.
plugin_map = [(re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.skin\..*'), 'skin'),
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.language.*'), 'language'),
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.font.*'), 'font'),
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.emoji.*'), 'emoji'),
              (re.compile(r'cootek\.smartinput\.android.*touchpal\.emoji.*'), 'emoji'),
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.sticker.*'), 'sticker'),
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.celldict.*'), 'celldict'),
              (re.compile(r'com\.cootek\.smartinputv5\.boomtextv2.*'), 'boomtext')]

# Themes of the colorphone app collapse onto their common package prefix.
matrix_plugin_map = [
    (re.compile(r'com\.color\.call\.flash\.colorphone\.theme\..*'), 'com.color.call.flash.colorphone.theme')]

regex_map = app_map + plugin_map + matrix_plugin_map


def app_name2bundle(app_name):
    """Collapse a raw app name onto its bundle label.

    Args:
        app_name: Raw application name. Anything that is not a non-empty
            string (None, '', float NaN, numbers, ...) is returned unchanged.

    Returns:
        The label paired with the first pattern in ``regex_map`` that matches
        ``app_name``, or ``app_name`` itself when no pattern matches.
    """
    # Missing values may arrive as float NaN, which is truthy and would make
    # the regex search raise TypeError, so only real strings are matched.
    if not isinstance(app_name, str) or not app_name:
        return app_name
    for pattern, bundle in regex_map:
        if pattern.search(app_name):
            return bundle
    return app_name

In [6]:
def preprocessing_data(forecast, real, target, attributions):
    """Aggregate forecast and real frames onto a common attribute key.

    Both frames go through the same pipeline: ``app_name`` is mapped with
    ``app_name2bundle``, ``target`` is summed per combination of
    ``attributions``, rows containing 'none', '' or NaN are dropped, and
    ``tu`` (when it is one of the attributions) is cast to int.

    The returned "real" frame is then rebuilt as the per-key minimum of
    ``target`` over the aggregated forecast and aggregated real rows, so a key
    present in only one of the two frames keeps that frame's value.

    The input DataFrames are not modified.

    Args:
        forecast: DataFrame with an ``app_name`` column, the ``attributions``
            columns and the ``target`` column.
        real: DataFrame with the same columns as ``forecast``.
        target: Name of the numeric column to aggregate (e.g. 'impression').
        attributions: Column names forming the grouping key.

    Returns:
        Tuple ``(forecast, real)`` of aggregated DataFrames whose columns are
        ``attributions`` followed by ``target``.
    """
    keys = list(attributions)

    def _aggregate(frame):
        # Copy first so that the caller's DataFrame keeps its raw app names.
        frame = frame.copy()
        frame['app_name'] = frame['app_name'].map(app_name2bundle)
        frame = frame.groupby(keys)[target].sum().reset_index()
        # Treat the placeholder strings as missing and drop incomplete rows.
        frame = frame.replace(['none', ''], np.nan).dropna().reset_index(drop=True)
        if 'tu' in frame.columns:
            frame['tu'] = frame['tu'].astype(int)
        return frame

    forecast = _aggregate(forecast)
    real = _aggregate(real)

    # Per key, keep the smaller of the forecast and real values.
    combined = pd.concat([forecast, real])
    real = combined.groupby(keys)[target].min().reset_index()
    return forecast, real

In [7]:
# a: aggregated forecast, b: aggregated "real" counterpart.
a, b = preprocessing_data(
    forecast,
    fake_real,
    target='impression',
    attributions=['app_name', 'country', 'id_type', 'platform', 'tu'],
)

In [None]:
# Per-platform impression gap between the raw forecast and raw fake_real frames.
(
    forecast.groupby('platform')['impression'].sum()
    - fake_real.groupby('platform')['impression'].sum()
)

In [None]:
# Same per-platform gap, computed on the preprocessed frames.
(
    a.groupby('platform')['impression'].sum()
    - b.groupby('platform')['impression'].sum()
)

In [None]:
# NOTE(review): this subtraction pairs rows by index label, not by the
# (app_name, country, id_type, platform, tu) key - confirm that is intended.
sum(a['impression'].sub(b['impression']))

In [None]:
# Total impression gap over keys that exist in both frames.
key_cols = ['app_name', 'country', 'id_type', 'platform', 'tu']
keyed_gap = a.set_index(key_cols) - b.set_index(key_cols)
keyed_gap.dropna(axis=0).impression.sum()

In [None]:
# Keep every platform except 'sniper' and 'flurry'. The masks are defined
# before they are used so the cell runs on a fresh kernel.
mask_a = ~a['platform'].isin(['sniper', 'flurry'])
mask_b = ~b['platform'].isin(['sniper', 'flurry'])

# Sum of squared per-key impression differences over the remaining rows.
key_cols = ['app_name', 'country', 'id_type', 'platform', 'tu']
((a[mask_a].set_index(key_cols) - b[mask_b].set_index(key_cols)).dropna(axis=0)).impression.apply(lambda x: x**2).sum()

In [8]:
# Candidate root cause: one list of accepted values per attribute.
# An empty list places no constraint on that attribute.
root_cause = {column: [] for column in ('app_name', 'country', 'id_type', 'platform', 'tu')}
root_cause['app_name'].append('com.qrcorde.scan.barcode.reader.generator')

In [9]:
import copy

In [10]:
def get_scores(root_cause, forecast, real):
    """Score how much of the forecast/real gap a candidate root cause explains.

    The forecast rows selected by ``root_cause`` are rescaled so that their
    total equals the total of the selected ``real`` rows (a proportional
    "ripple effect"). The score is ``1 - sqrt(residual / total)``, where
    ``total`` is the sum of squared per-key differences between ``forecast``
    and ``real`` and ``residual`` is the same quantity after the rescaling.

    Args:
        root_cause: Mapping of column name to a list of accepted values. A row
            is selected when it satisfies every non-empty list; empty lists
            place no constraint.
        forecast: DataFrame with columns app_name, country, id_type, platform,
            tu and impression. It is not modified.
        real: DataFrame with the same columns as ``forecast``.

    Returns:
        0 when the selected forecast and real totals are equal, otherwise the
        score clipped below at 0.

    Raises:
        ValueError: If forecast and real do not differ on any shared key.
    """
    keys = ['app_name', 'country', 'id_type', 'platform', 'tu']

    def _squared_gap(left, right):
        # Rows are paired on the attribute key; keys found on one side only
        # turn into NaN in the subtraction and are dropped.
        diff = (left.set_index(keys) - right.set_index(keys)).dropna(axis=0)
        return (diff['impression'] ** 2).sum()

    def _selection(frame):
        # Positional boolean mask, so the result does not depend on the
        # frame's index labels.
        selected = np.ones(len(frame), dtype=bool)
        for column, values in root_cause.items():
            if len(values) > 0:
                selected &= frame[column].isin(values).to_numpy()
        return selected

    total_h = _squared_gap(forecast, real)
    if total_h == 0:
        raise ValueError("Total KPI does not change!")

    f = forecast.copy()
    # The rescaled values are fractional; make the column float up front
    # instead of assigning floats into an integer column.
    f['impression'] = f['impression'].astype(float)

    mask_f = _selection(f)
    f_sum = f.loc[mask_f, 'impression'].sum()
    r_sum = real.loc[_selection(real), 'impression'].sum()

    # Modified ripple effect: spread the deviation h over the selected
    # forecast rows in proportion to their current value.
    h = f_sum - r_sum
    print(h)  # deviation attributed to this root cause
    if h == 0:  # under the given root cause, kpi does not change
        return 0
    # NOTE(review): when f_sum is 0 but h is not, the factor below is infinite;
    # confirm how an all-zero forecast selection should be scored.
    f.loc[mask_f, 'impression'] *= (1 - h / f_sum)

    temp = _squared_gap(f, real)
    ps = 1 - np.sqrt(temp / total_h)
    return max(ps, 0)

In [11]:
# Score the candidate root cause against the preprocessed frames.
get_scores(root_cause=root_cause, forecast=a, real=b)

6859356.5


0.009984934567780601

In [None]:
# Inspect the app names present in the aggregated "real" frame.
b['app_name']