# Impact of the k-identical-reports threshold on reporting

## Prologue

Below is the pseudocode we used to pull the data underlying the blog post Criteo published about k-thresholding.

The raw data we base our analysis upon is stored in one Hadoop table (`logs.displays`) with the following structure:

| field name | data type | description |
|------------|---------- |-------------|
| day | DATE | Date on which the displays were printed |
| environment | STRING | environment of the display (web, app, etc.) |
| advertiser | STRING | Advertiser name (representing the interest group in this context) |
| publisher | STRING | Publisher name |
| displays | BIGINT | Number of displays |


## Setup

In [None]:
# import querying_package
import os
import pandas as pd
import math
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import warnings

# Register a blanket "ignore" filter so that no warning of any category is
# printed in the notebook output from here on.
warnings.simplefilter("ignore")

## Publisher Side Reporting

Pseudo query:

In [None]:
# Publisher-side extract: one row per (advertiser, publisher) pair for the
# four selected publishers, web environment only, over 2020-06-11..2020-06-17.
# nb_display is the total number of displays divided by the number of days
# in that window (1 + 6 = 7), i.e. a per-day average.
# NOTE(review): in Presto, dividing one BIGINT by another truncates, so
# nb_display is presumably a whole number of displays per day -- confirm
# that this truncation is intended.
query = """
    SELECT
        d.advertiser AS advertiser
        , d.publisher AS publisher
        , SUM(d.displays)/(1 + day(date '2020-06-17' - date '2020-06-11')) as nb_display
    FROM
        logs.displays d
    WHERE
        d.day BETWEEN '2020-06-11' AND '2020-06-17'
        AND d.environment = 'web'
        AND d.publisher IN ('my_major_international_publisher', 'my_medium_publisher', 'my_small_publisher', 'my_extra_small_publisher')
    GROUP BY
        d.advertiser
        , d.publisher
"""

We pull the data

In [None]:
data_path = "./data/20200630-publisher-side-obfuscated.csv"

# Reuse the CSV extract when it is already on disk; otherwise run the query
# and ask the querying helper to write its output to data_path.
# NOTE(review): querying_package only appears in a commented-out import at the
# top of the notebook, so the fallback branch presumably needs that import
# restored -- confirm before relying on it.
csv_exists = os.path.isfile(data_path)
df = (
    pd.read_csv(data_path)
    if csv_exists
    else querying_package.run_presto_query(query, output=data_path)
)

We cast the display counts to float; the plots below use a log scale on the x axis for better visualization.

In [None]:
# Make sure the display counts are stored as floating-point numbers.
df['nb_display'] = df['nb_display'].astype('float64')

### Share of unreported advertisers

Aggregate the data

In [None]:
# For each publisher, count the advertisers observed at every distinct
# nb_display value; the (publisher, nb_display) index is sorted so that the
# running totals below accumulate in ascending nb_display order.
tmp = (
    df.groupby(['publisher', 'nb_display'])[['advertiser']]
    .count()
    .sort_index()
)

# Running number of advertisers within each publisher.
tmp['n_advertisers_cum_sum'] = (
    tmp.groupby(level='publisher')['advertiser'].cumsum()
)

# Running total divided by the publisher's final running total, giving the
# cumulative share of advertisers at or below each nb_display value.
tmp['n_advertisers_running_pct'] = (
    tmp.groupby(level='publisher')['n_advertisers_cum_sum']
    .transform(lambda running: running / running.iloc[-1])
)

# Turn the index levels back into regular columns for plotting.
tmp = tmp.reset_index()

Plot the data

In [None]:
# Plot, for each publisher, the cumulative share of advertisers
# (n_advertisers_running_pct) against nb_display on a log-scaled x axis, mark
# a few k thresholds with vertical lines, and save the figure as a PNG.
publisher_order = ['Major International Publisher', 'Medium Publisher', 'Small Publisher', 'Extra-Small Publisher']

f, ax = plt.subplots(figsize=(18, 9))
ax.grid(False)
sns.lineplot(
    data=tmp,
    x='nb_display',
    y='n_advertisers_running_pct',
    hue='publisher',
    hue_order=publisher_order,
    linewidth=3,
    ax=ax,
)
ax.set_xlabel('k threshold', fontsize=16)
ax.set_xscale('log')
ax.set_ylabel('Share of unreported advertisers', fontsize=16)
ax.set_ylim(-0.1, 1.1)
ax.set_title('Share of unreported advertisers in daily reporting with k-thresholding', fontsize=20)
ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))

# Keep only the legend entries that correspond to an actual publisher.
# Slicing off the first entry is only right when seaborn prepends a
# placeholder entry carrying the hue variable name; on seaborn releases that
# use a real legend title instead, it would drop the first publisher.
handles, labels = ax.get_legend_handles_labels()
legend_entries = [
    (handle, label)
    for handle, label in zip(handles, labels)
    if label in publisher_order
]
ax.legend(
    handles=[handle for handle, _ in legend_entries],
    labels=[label for _, label in legend_entries],
    prop={'size': 16},
)

# Threshold markers as (line x, label x-offset, label y, k). Each line sits at
# k - 1 -- presumably because counts up to k - 1 are the ones hidden by a
# threshold of k; confirm.
threshold_markers = (
    (9, 0.5, 0.8, 10),
    (99, 5, 0.8, 100),
    (999, 50, 0.8, 1000),
    (9999, 500, 0.8, 10000),
)
for line_x, label_offset, label_y, k in threshold_markers:
    ax.axvline(line_x, linestyle='--', color='c', linewidth=1)
    ax.text(
        line_x + label_offset,
        label_y,
        f"k = {k}",
        bbox={'facecolor': 'c', 'edgecolor': 'c', 'pad': 4, 'alpha': 0.7},
        zorder=12,
        fontsize=16,
    )

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./output', exist_ok=True)
f.savefig('./output/20200630-publisher-report-nadvertisers.png', bbox_inches='tight')

### Share of unreported displays

Aggregate the data

In [None]:
# Duplicate nb_display under a second name so that one copy can serve as the
# grouping key while the other is summed in the aggregation that follows.
df['nb_display_dim'] = df['nb_display'].copy()

In [None]:
# For each publisher, sum the displays at every distinct nb_display_dim
# value; the (publisher, nb_display_dim) index is sorted so that the running
# totals below accumulate in ascending order of display count.
tmp = (
    df.groupby(['publisher', 'nb_display_dim'])[['nb_display']]
    .sum()
    .sort_index()
)

# Running number of displays within each publisher.
tmp['n_displays_cum_sum'] = (
    tmp.groupby(level='publisher')['nb_display'].cumsum()
)

# Running total divided by the publisher's final running total, giving the
# cumulative share of displays at or below each nb_display_dim value.
tmp['n_displays_running_pct'] = (
    tmp.groupby(level='publisher')['n_displays_cum_sum']
    .transform(lambda running: running / running.iloc[-1])
)

# Turn the index levels back into regular columns for plotting.
tmp = tmp.reset_index()

Plot the data

In [None]:
# Plot, for each publisher, the cumulative share of displays
# (n_displays_running_pct) against nb_display_dim on a log-scaled x axis, mark
# a few k thresholds with vertical lines, and save the figure as a PNG.
publisher_order = ['Major International Publisher', 'Medium Publisher', 'Small Publisher', 'Extra-Small Publisher']

f, ax = plt.subplots(figsize=(18, 9))
ax.grid(False)
sns.lineplot(
    data=tmp,
    x='nb_display_dim',
    y='n_displays_running_pct',
    hue='publisher',
    hue_order=publisher_order,
    linewidth=3,
    ax=ax,
)
ax.set_xlabel('k threshold', fontsize=16)
ax.set_xscale('log')
ax.set_ylabel('Share of unreported displays', fontsize=16)
ax.set_ylim(-0.1, 1.1)
ax.set_title('Share of unreported displays in daily reporting with k-thresholding', fontsize=20)
ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))

# Keep only the legend entries that correspond to an actual publisher.
# Slicing off the first entry is only right when seaborn prepends a
# placeholder entry carrying the hue variable name; on seaborn releases that
# use a real legend title instead, it would drop the first publisher.
handles, labels = ax.get_legend_handles_labels()
legend_entries = [
    (handle, label)
    for handle, label in zip(handles, labels)
    if label in publisher_order
]
ax.legend(
    handles=[handle for handle, _ in legend_entries],
    labels=[label for _, label in legend_entries],
    prop={'size': 16},
)

# Threshold markers as (line x, label x-offset, label y, k). Each line sits at
# k - 1 -- presumably because counts up to k - 1 are the ones hidden by a
# threshold of k; confirm.
threshold_markers = (
    (9, 0.5, 0.8, 10),
    (99, 5, 0.8, 100),
    (999, 50, 0.8, 1000),
    (9999, 500, 0.8, 10000),
)
for line_x, label_offset, label_y, k in threshold_markers:
    ax.axvline(line_x, linestyle='--', color='c', linewidth=1)
    ax.text(
        line_x + label_offset,
        label_y,
        f"k = {k}",
        bbox={'facecolor': 'c', 'edgecolor': 'c', 'pad': 4, 'alpha': 0.7},
        zorder=12,
        fontsize=16,
    )

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./output', exist_ok=True)
f.savefig('./output/20200630-publisher-report-ndisplays.png', bbox_inches='tight')

## Advertiser Side Reporting

In [None]:
# Advertiser-side extract: one row per (advertiser, publisher) pair for the
# three selected advertisers, web environment only, over
# 2020-06-11..2020-06-17. nb_display is the total number of displays divided
# by the number of days in that window (1 + 6 = 7), i.e. a per-day average.
# The filter uses d.advertiser: the documented logs.displays schema has no
# partner_id column, and advertiser is the column selected and grouped on.
# NOTE(review): in Presto, dividing one BIGINT by another truncates, so
# nb_display is presumably a whole number of displays per day -- confirm
# that this truncation is intended.
query = """
    SELECT
        d.advertiser AS advertiser
        , d.publisher AS publisher
        , SUM(d.displays)/(1 + day(date '2020-06-17' - date '2020-06-11')) as nb_display
    FROM
        logs.displays d
    WHERE
        d.day BETWEEN '2020-06-11' AND '2020-06-17'
        AND d.environment = 'web'
        AND d.advertiser IN ('my_big_advertiser', 'my_medium_advertiser', 'my_small_advertiser')
    GROUP BY
        d.advertiser
        , d.publisher
"""

In [None]:
data_path = "./data/20200630-advertiser-side-obfuscated.csv"

# Reuse the CSV extract when it is already on disk; otherwise run the query
# and ask the querying helper to write its output to data_path.
# NOTE(review): querying_package only appears in a commented-out import at the
# top of the notebook, so the fallback branch presumably needs that import
# restored -- confirm before relying on it.
csv_exists = os.path.isfile(data_path)
df = (
    pd.read_csv(data_path)
    if csv_exists
    else querying_package.run_presto_query(query, output=data_path)
)

In [None]:
# Make sure the display counts are stored as floating-point numbers.
df['nb_display'] = df['nb_display'].astype('float64')

### Share of unreported publishers

Aggregate the data

In [None]:
# For each advertiser, count the publishers observed at every distinct
# nb_display value; the (advertiser, nb_display) index is sorted so that the
# running totals below accumulate in ascending nb_display order.
tmp = (
    df.groupby(['advertiser', 'nb_display'])[['publisher']]
    .count()
    .sort_index()
)

# Running number of publishers within each advertiser.
tmp['n_publishers_cum_sum'] = (
    tmp.groupby(level='advertiser')['publisher'].cumsum()
)

# Running total divided by the advertiser's final running total, giving the
# cumulative share of publishers at or below each nb_display value.
tmp['n_publishers_running_pct'] = (
    tmp.groupby(level='advertiser')['n_publishers_cum_sum']
    .transform(lambda running: running / running.iloc[-1])
)

# Turn the index levels back into regular columns for plotting.
tmp = tmp.reset_index()

Plot the data

In [None]:
# Plot, for each advertiser, the cumulative share of publishers
# (n_publishers_running_pct) against nb_display on a log-scaled x axis, mark
# a few k thresholds with vertical lines, and save the figure as a PNG.
advertiser_order = ['Big Advertiser', 'Medium Advertiser', 'Small Advertiser']

f, ax = plt.subplots(figsize=(18, 9))
ax.grid(False)
sns.lineplot(
    data=tmp,
    x='nb_display',
    y='n_publishers_running_pct',
    hue='advertiser',
    hue_order=advertiser_order,
    linewidth=3,
    ax=ax,
)
ax.set_xlabel('k threshold', fontsize=16)
ax.set_xscale('log')
ax.set_ylabel('Share of unreported publishers', fontsize=16)
ax.set_ylim(-0.1, 1.1)
ax.set_title('Share of unreported publishers in daily reporting with k-thresholding', fontsize=20)
ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))

# Keep only the legend entries that correspond to an actual advertiser.
# Slicing off the first entry is only right when seaborn prepends a
# placeholder entry carrying the hue variable name; on seaborn releases that
# use a real legend title instead, it would drop the first advertiser.
handles, labels = ax.get_legend_handles_labels()
legend_entries = [
    (handle, label)
    for handle, label in zip(handles, labels)
    if label in advertiser_order
]
ax.legend(
    handles=[handle for handle, _ in legend_entries],
    labels=[label for _, label in legend_entries],
    prop={'size': 16},
)

# Threshold markers as (line x, label x-offset, label y, k). Each line sits at
# k - 1 -- presumably because counts up to k - 1 are the ones hidden by a
# threshold of k; confirm. The two lowest labels are drawn lower on the plot
# (y = 0.2) than the others (y = 0.8).
threshold_markers = (
    (1, 0.05, 0.2, 2),
    (9, 0.5, 0.2, 10),
    (99, 5, 0.8, 100),
    (999, 50, 0.8, 1000),
    (9999, 500, 0.8, 10000),
)
for line_x, label_offset, label_y, k in threshold_markers:
    ax.axvline(line_x, linestyle='--', color='c', linewidth=1)
    ax.text(
        line_x + label_offset,
        label_y,
        f"k = {k}",
        bbox={'facecolor': 'c', 'edgecolor': 'c', 'pad': 4, 'alpha': 0.7},
        zorder=12,
        fontsize=16,
    )

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./output', exist_ok=True)
f.savefig('./output/20200630-advertiser-report-npublishers.png', bbox_inches='tight')

### Share of unreported displays

Aggregate the data

In [None]:
# Duplicate nb_display under a second name so that one copy can serve as the
# grouping key while the other is summed in the aggregation that follows.
df['nb_display_dim'] = df['nb_display'].copy()

In [None]:
# For each advertiser, sum the displays at every distinct nb_display_dim
# value; the (advertiser, nb_display_dim) index is sorted so that the running
# totals below accumulate in ascending order of display count.
tmp = (
    df.groupby(['advertiser', 'nb_display_dim'])[['nb_display']]
    .sum()
    .sort_index()
)

# Running number of displays within each advertiser.
tmp['n_displays_cum_sum'] = (
    tmp.groupby(level='advertiser')['nb_display'].cumsum()
)

# Running total divided by the advertiser's final running total, giving the
# cumulative share of displays at or below each nb_display_dim value.
tmp['n_displays_running_pct'] = (
    tmp.groupby(level='advertiser')['n_displays_cum_sum']
    .transform(lambda running: running / running.iloc[-1])
)

# Turn the index levels back into regular columns for plotting.
tmp = tmp.reset_index()

Plot the data

In [None]:
# Plot, for each advertiser, the cumulative share of displays
# (n_displays_running_pct) against nb_display_dim on a log-scaled x axis, mark
# a few k thresholds with vertical lines, and save the figure as a PNG.
advertiser_order = ['Big Advertiser', 'Medium Advertiser', 'Small Advertiser']

f, ax = plt.subplots(figsize=(18, 9))
ax.grid(False)
sns.lineplot(
    data=tmp,
    x='nb_display_dim',
    y='n_displays_running_pct',
    hue='advertiser',
    hue_order=advertiser_order,
    linewidth=3,
    ax=ax,
)
ax.set_xlabel('k threshold', fontsize=16)
ax.set_xscale('log')
ax.set_ylabel('Share of unreported displays', fontsize=16)
ax.set_ylim(-0.1, 1.1)
# The title names displays, matching the y axis and the output file name.
ax.set_title('Share of unreported displays in daily reporting with k-thresholding', fontsize=20)
ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))

# Keep only the legend entries that correspond to an actual advertiser.
# Slicing off the first entry is only right when seaborn prepends a
# placeholder entry carrying the hue variable name; on seaborn releases that
# use a real legend title instead, it would drop the first advertiser.
handles, labels = ax.get_legend_handles_labels()
legend_entries = [
    (handle, label)
    for handle, label in zip(handles, labels)
    if label in advertiser_order
]
ax.legend(
    handles=[handle for handle, _ in legend_entries],
    labels=[label for _, label in legend_entries],
    prop={'size': 16},
)

# Threshold markers as (line x, label x-offset, label y, k). Each line sits at
# k - 1 -- presumably because counts up to k - 1 are the ones hidden by a
# threshold of k; confirm.
threshold_markers = (
    (9, 0.5, 0.8, 10),
    (99, 5, 0.8, 100),
    (999, 50, 0.8, 1000),
    (9999, 500, 0.8, 10000),
)
for line_x, label_offset, label_y, k in threshold_markers:
    ax.axvline(line_x, linestyle='--', color='c', linewidth=1)
    ax.text(
        line_x + label_offset,
        label_y,
        f"k = {k}",
        bbox={'facecolor': 'c', 'edgecolor': 'c', 'pad': 4, 'alpha': 0.7},
        zorder=12,
        fontsize=16,
    )

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('./output', exist_ok=True)
f.savefig('./output/20200630-advertiser-report-ndisplays.png', bbox_inches='tight')