<font size="3"> Анализ эксперимента HQ vs Antifraud </font>
<br> <a href="https://docs.google.com/spreadsheets/d/14yqejqrCe4H1WuczPXyxnCKLIPP5MdQy/edit?gid=667110603#gid=667110603">Отчет</a> 

#### Подключение к базе

In [1]:
import os
import pandas as pd
import numpy as np
import sqlalchemy as sql
from sqlalchemy import create_engine

In [2]:
db_config = {
'host': host,
'port' : 3306,
'db' : 'db',
'user' : os.getenv('MYSQL_USER'),
'pass' : os.getenv('MYSQL_PASSWORD'),
}
 
# connection_string = "mysql+pymysql://<dbuser>:<pwd>@<server_ip>/<db_instance>"
connection_string = 'mysql+pymysql://{}:{}@{}/{}'.format(
    db_config['user'],
    db_config['pass'],
    db_config['host'],
    db_config['db'],
)

In [3]:
engine = create_engine(connection_string)

In [4]:
def select(sql):
  return pd.read_sql(sql, con=engine)

#### Sql-запросы

##### Для списка кампаний, учавствовавших в эксперименте

In [9]:
sql = '''
select id
FROM (SELECT DISTINCT campaigns.id,
an.`name`,
category_groups.name as category_group
FROM campaigns
JOIN campaign_statistics cs ON campaigns.id = cs.campaign_id
LEFT JOIN category_group_targets on campaigns.id = category_group_targets.targetable_id AND category_group_targets.targetable_type LIKE '%%Campaign'
LEFT JOIN category_group_category_group_target on category_group_targets.id = category_group_category_group_target.category_group_target_id
LEFT JOIN category_groups on category_groups.id = category_group_category_group_target.category_group_id
LEFT JOIN users u ON campaigns.user_id = u.id
LEFT JOIN ad_networks an ON u.ad_network_id = an.id
WHERE category_group_targets.deleted_at IS NULL
AND campaigns.ad_format = 'popunder'
AND campaigns.created_at >= '2025-04-28'
AND date(cs.date) >= '2025-04-28'
AND u.id NOT IN (SELECT DISTINCT user_id
FROM networks)
AND u.email NOT LIKE '%%onlinesup%%'
AND campaigns.sale_type != 'direct sale'
AND campaigns.id NOT IN (SELECT campaign_id
          FROM antifraud_targets
          WHERE enabled = 1
            AND deleted_at IS NULL
            AND created_at >= '2025-04-28'
      )
HAVING category_group = 'Adult'
AND NAME = 'Clickadilla'
AND campaigns.id NOT IN (SELECT campaign_id
          FROM antifraud_targets
          WHERE enabled = 1
            AND deleted_at IS NULL
            AND created_at >= '2025-04-28'
      )) AS t
      '''

In [12]:
df_id = select(sql)

In [114]:
import clickhouse_connect

client = clickhouse_connect.get_client(
    host='host',
    port=8123,
    username='name',
    password='pass'
)

##### Подключение к Кликхаусу

In [13]:
id_list = df_id['id'].tolist()

In [142]:
# функция для обработки списка из 18000 айди и фильтра для другого запроса

In [16]:
# Чанки по 1000 ID
chunk_size = 1000
id_chunks = [id_list[i:i+chunk_size] for i in range(0, len(id_list), chunk_size)]

results = []

# Запросы по чанкам
for chunk in id_chunks:
    ids_str = ','.join(str(i) for i in chunk)
    query = f"""
    select clickadilla_campaign_id,
count() as shows,
sum(clickadilla_price/10000) as spend,
(uniq(ip) / count()) * 100 as uniqs,
if(countIf(browser_family in ('chrome','spartan','opera','','yandex'))>1000,(countIf(is_sec=0 and browser_family in ('chrome','spartan','opera','','yandex'))/countIf(browser_family in ('chrome','spartan','opera','','yandex')))*100 , 0) as bots,
(countIf(usage_type in ('CDN', 'Commercial', 'Datacentre/Hosting')) / count()) * 100 as bad_ip
from visitstats.tds_out
prewhere clickadilla_campaign_id in ({ids_str})
where stats_day >= '2025-04-28'
and is_external_ssp != 1
group by clickadilla_campaign_id
    """
    df_part = client.query_df(query)
    results.append(df_part)

# Объединить все части
final_df = pd.concat(results, ignore_index=True)


In [19]:
final_df['shows'] = final_df['shows'].astype('int64')

In [22]:
final_df['spend_true'] = final_df['spend'] * 10

In [24]:
final_df['spend_true']

78175.40822943501

##### Запрос для Онклика

In [39]:
query_on = '''
select id
FROM (SELECT DISTINCT campaigns.id,
an.`name`,
category_groups.name as category_group
FROM campaigns
JOIN campaign_statistics cs ON campaigns.id = cs.campaign_id
LEFT JOIN category_group_targets on campaigns.id = category_group_targets.targetable_id AND category_group_targets.targetable_type LIKE '%%Campaign'
LEFT JOIN category_group_category_group_target on category_group_targets.id = category_group_category_group_target.category_group_target_id
LEFT JOIN category_groups on category_groups.id = category_group_category_group_target.category_group_id
LEFT JOIN users u ON campaigns.user_id = u.id
LEFT JOIN ad_networks an ON u.ad_network_id = an.id
WHERE category_group_targets.deleted_at IS NULL
AND campaigns.ad_format = 'popunder'
AND campaigns.created_at >= '2025-04-28'
AND date(cs.date) >= '2025-04-28'
AND u.id NOT IN (SELECT DISTINCT user_id
FROM networks)
AND u.email NOT LIKE '%%onlinesup%%'
AND campaigns.sale_type != 'direct sale'
AND campaigns.id IN (SELECT campaign_id
          FROM antifraud_targets
          WHERE enabled = 1
            AND deleted_at IS NULL
            AND created_at >= '2025-04-28'
      )
HAVING category_group = 'Adult' AND an.name = 'Onclicka') AS t
'''

In [40]:
df_on = select(query_on)

In [41]:
id_list2 = df_on['id'].tolist()

In [42]:
# Чанки по 1000 ID
chunk_size = 1000
id_chunks = [id_list2[i:i+chunk_size] for i in range(0, len(id_list2), chunk_size)]

results = []

# Запросы по чанкам
for chunk in id_chunks:
    ids_str = ','.join(str(i) for i in chunk)
    query = f"""
    select clickadilla_campaign_id,
count() as shows,
sum(clickadilla_price/1000) as spend,
(uniq(ip) / count()) * 100 as uniqs,
if(countIf(browser_family in ('chrome','spartan','opera','','yandex'))>1000,(countIf(is_sec=0 and browser_family in ('chrome','spartan','opera','','yandex'))/countIf(browser_family in ('chrome','spartan','opera','','yandex')))*100 , 0) as bots,
(countIf(usage_type in ('CDN', 'Commercial', 'Datacentre/Hosting')) / count()) * 100 as bad_ip
from visitstats.tds_out
prewhere clickadilla_campaign_id in ({ids_str})
where stats_day >= '2025-04-28'
and is_external_ssp != 1
group by clickadilla_campaign_id
    """
    df_part = client.query_df(query)
    results.append(df_part)

# Объединить все части
df_on_final = pd.concat(results, ignore_index=True)

In [43]:
df_on_final

Unnamed: 0,clickadilla_campaign_id,shows,spend,uniqs,bots,bad_ip
0,392614,4304,0.60256,95.655204,7.363876,1.881970
1,393640,5408,0.75712,93.047337,10.243902,2.496302
2,391158,30226,3.02260,41.087144,10.606974,4.635082
3,392546,5513,0.77182,97.206603,7.402549,1.251587
4,392295,4638,0.64932,95.709357,8.121374,0.927124
...,...,...,...,...,...,...
3729,405846,4043,0.36387,76.057383,14.061592,6.826614
3730,404649,7030,0.70300,92.574680,15.650768,9.274538
3731,405460,2028,0.18252,96.992110,14.716981,5.571992
3732,404958,2030,0.18270,95.320197,18.950749,7.389163


#### Скоринг качества трафика

In [None]:
# функция для определения качества трафика

In [137]:
def calculate_quality_score(df, thresholds=None):
    """
    Расчёт quality_score_2 с метриками:
    - 'uniqs': more is better
    - 'bad_ip': less is better
    - 'bots': less is better (идеально — 0)
    Используется сглаженное отклонение от порогов:
    - score = 0.5 → на пороге
    - >0.5 → лучше нормы
    - <0.5 → хуже нормы
    """

    if thresholds is None:
        thresholds = {'uniqs': 80, 'bad_ip': 60, 'bots': 5}

    def normalize_metric(value, threshold, direction):
        if value == 0:
            value = 1e-6  # защита от деления на 0

        if direction == 'less_is_better':
            ratio = threshold / value
        elif direction == 'more_is_better':
            ratio = value / threshold
        else:
            return None

        score = (ratio ** 0.5) / 2
        return min(score, 1.0)

    def compute_score(row):
        scores = [
            normalize_metric(row['uniqs'], thresholds['uniqs'], 'more_is_better'),
            normalize_metric(row['bad_ip'], thresholds['bad_ip'], 'less_is_better'),
            1.0 if row['bots'] == 0 else normalize_metric(row['bots'], thresholds['bots'], 'less_is_better')
        ]
        avg_score = sum(scores) / len(scores)
        spend = row['spend']
        good_spend = avg_score * spend
        bad_spend = spend - good_spend
        return pd.Series({
            'quality_score': avg_score,
            'good_spend': good_spend,
            'bad_spend': bad_spend
        })

    scores = df.apply(compute_score, axis=1)
    return pd.concat([df, scores], axis=1)


In [47]:
scored_df_on = calculate_quality_score(df_on_final)

29842.74395081965
11042.104763377492
18800.639187442153


In [134]:
scored_ca = calculate_quality_score(final_df)

# Получаем три новых колонки:
# - quality_score
# - good_spend
# - bad_spend

# Сводка:
print(scored_ca['spend'].sum())
print(scored_ca['good_spend'].sum())
print(scored_ca['bad_spend'].sum())

In [135]:
scored_ca['quality_score'].mean()

0.782882352474988

In [138]:
scored_on = calculate_quality_score(df_on_final)

In [139]:
scored_on['quality_score'].mean()

0.6459969001741136

#### Сравнение средних

In [59]:
from scipy.stats import ttest_ind

In [140]:
# T-test
t_stat, p_val = ttest_ind(scored_ca['quality_score'], scored_on['quality_score'], equal_var=False)

print(f"Group A (CA) mean: {scored_ca['quality_score'].mean():.4f}")
print(f"Group B (ON) mean: {scored_on['quality_score'].mean():.4f}")
print(f"P-value: {p_val:.4f}")
if p_val < 0.05:
    print("Разница статистически значима (p < 0.05)")
else:
    print("Разница НЕ статистически значима")


Group A (CA) mean: 0.7829
Group B (ON) mean: 0.6460
P-value: 0.0000
Разница статистически значима (p < 0.05)


##### Доверительные интервалы

In [68]:
import numpy as np
import scipy.stats as stats

def confidence_interval(series, confidence=0.95):
    n = len(series)
    mean = np.mean(series)
    stderr = stats.sem(series)  # стандартная ошибка среднего
    margin = stderr * stats.t.ppf((1 + confidence) / 2., n - 1)
    return mean, mean - margin, mean + margin

In [141]:
mean_a, ci_low_a, ci_high_a = confidence_interval(scored_ca['quality_score'])
mean_b, ci_low_b, ci_high_b = confidence_interval(scored_on['quality_score'])

print(f"Group A: mean={mean_a:.4f}, 95% CI=({ci_low_a:.4f}, {ci_high_a:.4f})")
print(f"Group B: mean={mean_b:.4f}, 95% CI=({ci_low_b:.4f}, {ci_high_b:.4f})")


Group A: mean=0.7829, 95% CI=(0.7814, 0.7844)
Group B: mean=0.6460, 95% CI=(0.6437, 0.6483)


**Интерпретация:**
Интервалы не пересекаются:

Верхняя граница Group B: 0.3496

Нижняя граница Group A: 0.5303

Разница между группами статистически значима

Среднее значение в Group A существенно выше, чем в Group B

Интервалы узкие → значит, выборки большие и/или разброс значений невелик
Это делает сравнение надёжным и устойчивым

**Вывод:**
Трафик в группе A статистически и практически качественнее, чем в группе B по метрике quality_score

Это не просто «на глаз», это подтверждено:

строгим разделением интервалов,

узкими границами (низкая ошибка),

достаточным числом кампаний.