In [1]:
import pandas as pd
import numpy as np
import re
import copy

In [2]:
# Read the line-delimited JSON export '20190609.json' into the `real` frame.
with open('20190609.json') as real_file:
    real = pd.read_json(real_file, lines=True)

In [3]:
# Read the line-delimited JSON export '20190601.json' into the `forecast` frame.
with open('20190601.json') as forecast_file:
    forecast = pd.read_json(forecast_file, lines=True)

In [4]:
# Each map is an ordered list of (compiled pattern, bundle label) pairs.
# app_name2bundle() walks the concatenated list and returns the label of the
# first pattern that matches, so order matters.
app_map = [(re.compile(r'cootek\.smartinput\.(international|mainland)\.(ios|android).*'), 'keyboard')]
plugin_map = [(re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.skin\..*'), 'skin'),
              # The dot before "android" is escaped here so this pattern uses
              # the same literal prefix as its sibling patterns.
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.language.*'), 'language'),
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.font.*'), 'font'),
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.emoji.*'), 'emoji'),
              # NOTE(review): the dots in the next pattern are unescaped and
              # therefore match any character; left unchanged -- confirm
              # whether that looseness is intended.
              (re.compile(r'cootek.smartinput.android.*touchpal.emoji.*'), 'emoji'),
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.sticker.*'), 'sticker'),
              (re.compile(r'(cootek\.smartinput\.android|com\.cootek\.smartinputv5)\.celldict.*'), 'celldict'),
              # NOTE(review): unescaped dots here as well; left unchanged.
              (re.compile(r'com.cootek.smartinputv5.boomtextv2.*'), 'boomtext')]
matrix_plugin_map = [
    (re.compile(r'com\.color\.call\.flash\.colorphone\.theme\..*'), 'com.color.call.flash.colorphone.theme')]
regex_map = app_map + plugin_map + matrix_plugin_map


def app_name2bundle(app_name):
    """Map a raw app name to its bundle label.

    Parameters
    ----------
    app_name : object
        Raw app name. Anything that is not a non-empty string (None, '',
        NaN, numbers, ...) is returned unchanged, so the function can be
        passed to ``Series.map`` on a column that contains missing values.

    Returns
    -------
    object
        The label of the first pattern in ``regex_map`` found anywhere in
        ``app_name`` (``search``, not ``match``), or ``app_name`` itself
        when no pattern matches.
    """
    if not isinstance(app_name, str) or not app_name:
        return app_name
    for pattern, bundle in regex_map:
        if pattern.search(app_name):
            return bundle
    return app_name

In [5]:
def preprocessing_data(forecast, real, target, attributions):
    """Aggregate the forecast and real tables on the attribution columns.

    Neither input frame is modified; both are processed on copies.

    Parameters
    ----------
    forecast, real : pandas.DataFrame
        Raw tables containing an ``app_name`` column, every column named in
        ``attributions`` and the ``target`` column.
    target : str
        Name of the KPI column to aggregate (e.g. ``'impression'``).
    attributions : sequence of str
        Columns that identify one aggregated row.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``forecast``: ``target`` summed per attribution key, with rows whose
        key contains ``'none'``, ``''`` or NaN removed and ``tu`` (when
        present) cast to int.
        ``real``: for every key found in either aggregated table, the
        minimum of the aggregated forecast and real ``target`` values (a key
        present in only one table keeps that table's value).
    """
    keys = list(attributions)

    def _aggregate(frame):
        # assign() returns a new frame, so the caller's data stays untouched.
        frame = frame.assign(app_name=frame['app_name'].map(app_name2bundle))
        totals = frame.groupby(keys)[target].sum().reset_index()
        # Treat the placeholders 'none' / '' as missing and drop those rows.
        totals = totals.replace(['none', ''], np.nan).dropna().reset_index(drop=True)
        if 'tu' in totals.columns:
            totals['tu'] = totals['tu'].astype(int)
        return totals

    forecast = _aggregate(forecast)
    real = _aggregate(real)
    # Stack both tables and keep the smaller value per key.
    combined = pd.concat([forecast, real])
    real = combined.groupby(keys)[target].min().reset_index()
    return forecast, real

In [44]:
# `a` receives the aggregated forecast table and `b` the second frame
# returned by preprocessing_data (the "real" table).
a, b = preprocessing_data(
    forecast,
    real,
    'impression',
    ['app_name', 'country', 'id_type', 'platform', 'tu'],
)

In [None]:
# Impression gap per app_name (`a` minus `b`), largest gap first.
(
    a['impression'].groupby(a['app_name']).sum().to_frame()
    - b['impression'].groupby(b['app_name']).sum().to_frame()
).reset_index().sort_values(by='impression', ascending=False)

In [17]:
# Number of rows in `a`.
a.shape[0]

251831

In [19]:
# Key-aligned impression gap between the first 50000 rows of `a` and of `b`.
# Keys present in only one slice produce NaN, which sum() skips.
(
    a[0:50000].set_index(['app_name', 'country', 'id_type', 'platform', 'tu'])
    - b[0:50000].set_index(['app_name', 'country', 'id_type', 'platform', 'tu'])
)['impression'].sum()
# Variant kept for reference:
#.dropna(axis=0)['impression'].apply(lambda x:x**2).sum()

1398669375.0

In [21]:
# Same 50000-row slices, but subtracted by row label instead of by
# attribution key.
a[0:50000]['impression'].sub(b[0:50000]['impression']).sum()

11194508.0

In [None]:
def get_scores(root_cause, forecast, real, target='impression',
               attributions=('app_name', 'country', 'id_type', 'platform', 'tu')):
    """Score how well a candidate root cause explains the forecast/real gap.

    The rows of ``forecast`` selected by ``root_cause`` are rescaled so that
    their total equals the total of the selected ``real`` rows. The score is
    ``1 - sqrt(residual / total)``, where both terms are sums of squared
    per-key differences between forecast and real (after and before the
    rescaling, respectively), floored at 0.

    Parameters
    ----------
    root_cause : dict
        Maps a column name to the list of values that make up the candidate
        root cause. An empty list leaves that column unconstrained.
    forecast, real : pandas.DataFrame
        Tables holding the ``attributions`` columns and the ``target``
        column. Neither frame is modified.
    target : str, optional
        KPI column, ``'impression'`` by default.
    attributions : sequence of str, optional
        Key columns used to align ``forecast`` and ``real``.

    Returns
    -------
    number
        0 when the selection shows no change, or when the selected forecast
        total is 0 and therefore cannot be rescaled; otherwise a value in
        [0, 1].

    Raises
    ------
    ValueError
        If forecast and real agree on every shared key.
    """
    keys = list(attributions)

    def _squared_error(left, right):
        # Sum of squared per-key differences; keys missing on either side
        # give NaN and are dropped.
        diff = left.set_index(keys)[target] - right.set_index(keys)[target]
        return (diff.dropna() ** 2).sum()

    def _selection(frame):
        # Build the mask on the frame's own index so it stays aligned even
        # when the index is not a fresh RangeIndex.
        mask = pd.Series(True, index=frame.index, dtype=bool)
        for column, values in root_cause.items():
            if len(values) > 0:
                mask &= frame[column].isin(values)
        return mask

    total_h = _squared_error(forecast, real)
    if total_h == 0:
        raise ValueError("Total KPI does not change!")

    mask_f = _selection(forecast)
    f_sum = forecast.loc[mask_f, target].sum()
    r_sum = real.loc[_selection(real), target].sum()

    # Modified ripple effect.
    h = f_sum - r_sum
    if h == 0:  # under the given root cause, kpi does not change
        return 0
    if f_sum == 0:
        # A zero forecast total cannot be rescaled; the unguarded division
        # would propagate inf/NaN into the score.
        return 0

    adjusted = forecast.copy()
    # Cast first so the scaled (float) values fit the column dtype.
    adjusted[target] = adjusted[target].astype(float)
    # 1 - h / f_sum == r_sum / f_sum: the selection now sums to r_sum.
    adjusted.loc[mask_f, target] = adjusted.loc[mask_f, target] * (1 - h / f_sum)

    residual = _squared_error(adjusted, real)
    ps = 1 - np.sqrt(residual / total_h)
    return max(ps, 0)

In [None]:
# Candidate root cause passed to get_scores: one list of values per
# attribute; an empty list leaves that attribute unconstrained.
root_cause = dict(
    app_name=[],
    country=[],
    id_type=[],
    platform=['admob', 'sniper', 'mopub', 'adx'],
    tu=[],
)

In [None]:
# Score the candidate root cause on the grouped tables. `c` and `d` are
# assigned by the groupToOthers(...) call cell further down in this file,
# so that cell has to run first.
get_scores(root_cause=root_cause, forecast=c, real=d)

In [None]:
# Step-by-step replay of the first half of get_scores(root_cause, c, d):
# `c` plays the forecast role and `d` the real role throughout.
f = copy.deepcopy(c)
total_h = (c.set_index(['app_name', 'country', 'id_type', 'platform', 'tu'])\
           - d.set_index(['app_name', 'country', 'id_type', 'platform', 'tu'])).dropna(axis=0)\
           ['impression'].apply(lambda x:x**2).sum()
if total_h == 0:
    raise Exception("Total KPI does not change!")
# Masks are built on each frame's own index so boolean selection aligns.
mask_f = pd.Series(True, index=f.index, dtype=bool)
mask_r = pd.Series(True, index=d.index, dtype=bool)
for cause in root_cause.items():
    if cause[1] != []:
        mask_f = mask_f & f[cause[0]].isin(cause[1])
        # The real-side mask must be computed from `d`, not the raw `real`.
        mask_r = mask_r & d[cause[0]].isin(cause[1])
f_sum = f.loc[mask_f]['impression'].sum()
# Sum the real side from `d`; summing `c` would repeat the forecast total.
r_sum = d.loc[mask_r]['impression'].sum()

In [None]:
# List the app_name values of the first ten rows of `testdata`.
# NOTE(review): `testdata` is not assigned in any cell visible in this
# file -- confirm where it comes from.
list(testdata[0:10]['app_name'])

In [None]:
# Rows of `a` whose app_name is NOT among the first ten `testdata` app
# names. The list holds app names, so it is compared with a.app_name
# (comparing it with a.country would select nearly every row).
mask = ~a.app_name.isin(list(testdata[0:10].app_name))

In [None]:
# Relabel app_name as 'other' on every row selected by `mask`.
# This writes into `a` in place.
a.loc[mask, 'app_name'] = 'other'

In [None]:
# Display `a` to inspect the relabelled rows.
a

In [None]:
# Share of all impressions that falls under the 'other' app_name label.
# The `a.loc[mask, 'app_name'] = 'other'` step writes that label into
# app_name, so app_name is the column to filter on.
a[a.app_name == 'other'].impression.sum() / a.impression.sum()

In [None]:
# For every attribute keep the 10 values with the largest impression gap
# (`a` minus `b`) and fold all remaining values of `a` into one placeholder.
# This rewrites `a` in place.
for attr in ['app_name', 'country', 'id_type', 'platform', 'tu']:
    rank = (a['impression'].groupby(a[attr]).sum().reset_index().set_index(attr) -\
                b['impression'].groupby(b[attr]).sum().reset_index().set_index(attr)) \
                .reset_index().sort_values(by='impression',ascending = False)
    mask = ~a[attr].isin(list(rank[0:10][attr]))
    # `tu` is cast to int in preprocessing_data; use 0 as its placeholder
    # (as groupToOthers does) instead of writing a string into that column.
    a.loc[mask, attr] = 0 if attr == 'tu' else 'other'

In [None]:
# Display `a` to inspect the collapsed attribute values.
a

In [45]:
def groupToOthers(forecast, real, topK=5):
    """Keep the ``topK`` values of each attribute and merge the rest.

    For every attribute column the values are ranked by their summed
    forecast impressions. Rows (in both tables) whose value is outside the
    top ``topK`` get a placeholder: ``'other <attr>'`` for the string-like
    attributes and ``0`` for ``tu``. Both tables are then re-aggregated by
    summing over the five attribute columns.

    The input frames are left untouched; the work is done on copies.

    Parameters
    ----------
    forecast, real : pandas.DataFrame
        Tables with the columns ``app_name``, ``country``, ``id_type``,
        ``platform``, ``tu`` and ``impression``.
    topK : int, optional
        Number of distinct values to keep per attribute (default 5).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The regrouped forecast and real tables.
    """
    keys = ['app_name', 'country', 'id_type', 'platform', 'tu']
    # Copy so the caller's frames are not relabelled as a side effect.
    forecast = forecast.copy()
    real = real.copy()
    for attr in keys:
        totals = forecast.groupby(attr)['impression'].sum().sort_values(ascending=False)
        kept_values = list(totals.index[:topK])
        placeholder = 0 if attr == 'tu' else 'other ' + attr
        forecast.loc[~forecast[attr].isin(kept_values), attr] = placeholder
        real.loc[~real[attr].isin(kept_values), attr] = placeholder
    forecast = forecast.groupby(by=keys).sum().reset_index()
    real = real.groupby(by=keys).sum().reset_index()
    return forecast, real

In [46]:
# `c` / `d`: forecast and real tables reduced to the top 5 values per attribute.
c, d = groupToOthers(forecast=a, real=b, topK=5)

In [38]:
# Key-aligned impression gap between `a` and `b`, summed over the keys the
# two tables share (keys found in only one table give NaN and are skipped).
(
    a.set_index(['app_name', 'country', 'id_type', 'platform', 'tu'])
    - b.set_index(['app_name', 'country', 'id_type', 'platform', 'tu'])
)['impression'].sum()
# Variant kept for reference:
#.dropna(axis=0)['impression'].apply(lambda x:x**2).sum()

74690202.0

In [56]:
# Difference between the grand impression totals of `a` and `b`.
a['impression'].sum() - b['impression'].sum()

139448073

In [63]:
# Impressions of `a` on the admob/adx platforms for tu 2338. Both conditions
# are combined into one mask: chaining a second boolean key built from the
# unfiltered frame makes pandas reindex it and emit a UserWarning.
a.loc[a.platform.isin(['admob', 'adx']) & (a.tu == 2338), 'impression'].sum()

  """Entry point for launching an IPython kernel.


88828232

In [64]:
# Ratio of two totals copied from the outputs printed in this file:
# 88828232 (impressions of `a` on admob/adx with tu 2338) over
# 139448073 (a.impression.sum() - b.impression.sum()).
88828232 / 139448073

0.6369986338929187