In [1]:
import numpy as np
import pandas as pd
import miceforest as mf
import datetime
from pyathena import connect

from sklearn.datasets import fetch_california_housing

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset

from evidently.test_suite import TestSuite
from evidently.test_preset import DataQualityTestPreset, DataStabilityTestPreset
from evidently.tests import *

from ml_lib.feature_store import configure_offline_feature_store
from ml_lib.feature_store.offline.client import FeatureStoreOfflineClient

configure_offline_feature_store(workgroup="primary")

In [2]:
from queries import distribution_dataset_query, distribution_dataset_query_v02, distribution_dataset_query_v03

In [3]:
# parameters
company = "century-games-ncmgu"
project = "idle-mafia-ecbqb"
offer_type = "popup"

# constants
correlation_limit = 0.95

In [4]:
# agg_data = True -> event data will be aggregated over given time period
# agg_data = False -> raw event data will be used
agg_data = True
# selection_method = 0 -> no data selection will be applied
# selection_method = 1 -> stratified sample will be done, where sampling ratio has to be specified
# selection_method = 2 -> data filtering, where only defined views per every buy will be left
selection_method = 0
# number of days to include in the data
number_days = 15
rolling_window = 7
params = {
    "number_of_days": number_days,
    "meta_company": company,
    "meta_project": project,
    "offer_type": offer_type,
    "sum_window": rolling_window,
}

In [5]:
data = fetch_california_housing(as_frame=True)
housing_data = data.frame

In [21]:
housing_data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [22]:
housing_data.rename(columns={'MedHouseVal': 'target'}, inplace=True)
housing_data['prediction'] = housing_data['target'].values + np.random.normal(0, 5, housing_data.shape[0])

In [23]:
housing_data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target,prediction
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526,2.551777
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585,3.7572
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521,2.089179
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,17.665064
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,-1.259625


In [24]:
reference = housing_data.sample(n=5000, replace=False)
current = housing_data.sample(n=5000, replace=False)

In [25]:
#drift_report = Report(metrics=[DataDriftPreset(), TargetDriftPreset()])
drift_report = Report(metrics=[DataDriftPreset()])
 
drift_report.run(reference_data=reference, current_data=current)
drift_report.save_html("test_report.html")

# Test on IM data

In [6]:
dataset = FeatureStoreOfflineClient.run_athena_query_pandas(distribution_dataset_query_v02, params)

Missing STATSD_HOST and/or STATSD_PORT environment variables
DummyStatsClient._send unknown.athena_query_runs_total:1|c
DummyStatsClient._send unknown.athena_query_queue_ms:203.000000|ms
DummyStatsClient._send unknown.athena_query_execution_ms:17857.000000|ms
DummyStatsClient._send unknown.athena_query_scanned_bytes:1346489997|c


In [7]:
dataset['ispaiduser'] = dataset['ispaiduser'].astype("category")

In [8]:
pd.set_option('display.max_columns', None)
#pd.set_option("display.max_rows", None)
dataset

Unnamed: 0,account_id,test_group,cohort_group,meta_date,lifetime_days,streetid,screen_width,screen_height,device_model,country_code,manufacturer,network_type,lib,device,os,level,ispaiduser,app_version,lang,os_version,time_zone,n_logins,n_ad_reward_claims,sum_n_ad_reward_claims,n_ad_reward_fails,sum_n_ad_reward_fails,n_ads_watched,sum_n_ads_watched,n_battlepass_lvls_finished,sum_n_battlepass_lvls_finished,sum_capo_cards_collected,sum_sum_capo_cards_collected,n_times_capo_cards_collected,sum_n_times_capo_cards_collected,sum_capo_cards_spent,sum_sum_capo_cards_spent,n_times_capo_cards_spent,sum_n_times_capo_cards_spent,n_capo_rankup,sum_n_capo_rankup,n_capo_resurect_buys,sum_n_capo_resurect_buys,n_capo_upgrades,sum_n_capo_upgrades,key1_get_total,sum_key1_get_total,key2_get_total,sum_key2_get_total,key3_get_total,sum_key3_get_total,diamond_get_total,sum_diamond_get_total,key1_spent,sum_key1_spent,key2_spent,sum_key2_spent,key3_spent,sum_key3_spent,diamond_spent,sum_diamond_spent,n_dungeon_milestones_achieved,sum_n_dungeon_milestones_achieved,n_instant_awards_claims,sum_n_instant_awards_claims,n_extra_challenge_buys,sum_n_extra_challenge_buys,n_boss_fights,sum_n_boss_fights,sum_boss_damage,sum_sum_boss_damage,n_instant_cigar_buys,sum_n_instant_cigar_buys,sum_instant_cigar_buys,sum_sum_instant_cigar_buys,n_jailbreak_fight_ends,sum_n_jailbreak_fight_ends,n_package_info_offers_viewed,sum_n_package_info_offers_viewed,sum_payments_package_key,sum_sum_payments_package_key,n_payments_package_key,sum_n_payments_package_key,n_package_tips_offers_viewed,sum_n_package_tips_offers_viewed,n_sessions_ended,sum_n_sessions_ended,total_session_duration,sum_total_session_duration,max_session_end_player_level,sum_max_session_end_player_level,n_sessions_started,sum_n_sessions_started,min_session_start_player_level,sum_min_session_start_player_level
0,185967330,personalized,19000101-20220605,2022-10-31,424,10008,768.0,1024.0,"iPad6,11",GB,Apple,WIFI,iOS,"iPad6,11",iOS,88.0,True,6.1.0,en-GB,14.7.1,+00:00,5,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0
1,138075822,control,19000101-20220605,2022-10-31,424,81,390.0,844.0,"iPhone14,2",NL,Apple,WIFI,iOS,"iPhone14,2",iOS,81.0,True,6.1.0,nl-NL,16.0,+01:00,10,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0
2,178345290,control,19000101-20220605,2022-10-31,449,10008,375.0,667.0,"iPhone10,4",FR,Apple,4G,iOS,"iPhone10,4",iOS,81.0,False,6.1.0,fr-FR,15.3.1,+01:00,2,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0
3,271265012,personalized,20221021-99991231,2022-10-31,8,16,428.0,926.0,"iPhone14,3",US,Apple,WIFI,iOS,"iPhone14,3",iOS,16.0,False,6.1.0,en-US,16.1,-04:00,2,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0
4,319570560,personalized,20221021-99991231,2022-10-31,8,10,428.0,926.0,"iPhone13,4",US,Apple,WIFI,iOS,"iPhone13,4",iOS,9.0,False,6.1.0,en-US,15.7.1,-05:00,5,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89787,204589850,personalized,19000101-20220605,2022-10-31,363,117,428.0,926.0,"iPhone13,4",JP,Apple,WIFI,iOS,"iPhone13,4",iOS,117.0,True,6.1.0,ja-JP,15.6.1,+09:00,5,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0
89788,217168510,personalized,19000101-20220605,2022-10-31,313,10008,1080.0,2400.0,M2007J20CG,TH,Xiaomi,WIFI,Android,M2007J20CG,android,37.0,False,6.1.0,th,12,+07:00,2,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0
89789,120960742,personalized,19000101-20220605,2022-10-31,491,10008,390.0,844.0,"iPhone13,2",JP,Apple,5G,iOS,"iPhone13,2",iOS,52.0,True,6.1.0,ja-JP,15.6.1,+09:00,10,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0
89790,59715182,control,19000101-20220605,2022-10-31,906,131,1200.0,1920.0,SM-T830,DE,samsung,WIFI,Android,SM-T830,android,131.0,False,6.1.0,de,10,+01:00,4,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0


In [9]:
dataset_clean = dataset.dropna()
dataset_clean = dataset_clean.iloc[:5000,]
int64_cols = list(dataset_clean.select_dtypes(include='Int64'))
dataset_clean[int64_cols] = dataset_clean[int64_cols].astype('int32')

In [10]:
dataset_clean.groupby(['test_group', 'cohort_group']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,account_id,meta_date,lifetime_days,streetid,screen_width,screen_height,device_model,country_code,manufacturer,network_type,lib,device,os,level,ispaiduser,app_version,lang,os_version,time_zone,n_logins,n_ad_reward_claims,sum_n_ad_reward_claims,n_ad_reward_fails,sum_n_ad_reward_fails,n_ads_watched,sum_n_ads_watched,n_battlepass_lvls_finished,sum_n_battlepass_lvls_finished,sum_capo_cards_collected,sum_sum_capo_cards_collected,n_times_capo_cards_collected,sum_n_times_capo_cards_collected,sum_capo_cards_spent,sum_sum_capo_cards_spent,n_times_capo_cards_spent,sum_n_times_capo_cards_spent,n_capo_rankup,sum_n_capo_rankup,n_capo_resurect_buys,sum_n_capo_resurect_buys,n_capo_upgrades,sum_n_capo_upgrades,key1_get_total,sum_key1_get_total,key2_get_total,sum_key2_get_total,key3_get_total,sum_key3_get_total,diamond_get_total,sum_diamond_get_total,key1_spent,sum_key1_spent,key2_spent,sum_key2_spent,key3_spent,sum_key3_spent,diamond_spent,sum_diamond_spent,n_dungeon_milestones_achieved,sum_n_dungeon_milestones_achieved,n_instant_awards_claims,sum_n_instant_awards_claims,n_extra_challenge_buys,sum_n_extra_challenge_buys,n_boss_fights,sum_n_boss_fights,sum_boss_damage,sum_sum_boss_damage,n_instant_cigar_buys,sum_n_instant_cigar_buys,sum_instant_cigar_buys,sum_sum_instant_cigar_buys,n_jailbreak_fight_ends,sum_n_jailbreak_fight_ends,n_package_info_offers_viewed,sum_n_package_info_offers_viewed,sum_payments_package_key,sum_sum_payments_package_key,n_payments_package_key,sum_n_payments_package_key,n_package_tips_offers_viewed,sum_n_package_tips_offers_viewed,n_sessions_ended,sum_n_sessions_ended,total_session_duration,sum_total_session_duration,max_session_end_player_level,sum_max_session_end_player_level,n_sessions_started,sum_n_sessions_started,min_session_start_player_level,sum_min_session_start_player_level
test_group,cohort_group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1
control,19000101-20220605,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939,939
control,20220606-20221020,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370,370
control,20221021-99991231,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242,242
personalized,19000101-20220605,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962,1962
personalized,20220606-20221020,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895,895
personalized,20221021-99991231,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592,592


### Test personalized vs control

In [11]:
# Cohorts till 5.6.
control_group = dataset_clean.query('test_group == "control" and cohort_group == "19000101-20220605"').iloc[:,4:]
personalized_group = dataset_clean.query('test_group == "personalized" and cohort_group == "19000101-20220605"').iloc[:,4:]

In [12]:
# Cohorts from 6.6.-20.10.2022
control_group = dataset_clean.query('test_group == "control" and cohort_group == "20220606-20221020"').iloc[:,4:]
personalized_group = dataset_clean.query('test_group == "personalized" and cohort_group == "20220606-20221020"').iloc[:,4:]

In [13]:
# experiment with one feature - will it detect data drift?
n_rows = personalized_group['sum_n_package_tips_offers_viewed'].count()
personalized_group['sum_n_package_tips_offers_viewed'] = personalized_group['sum_n_package_tips_offers_viewed'] * np.random.normal(scale = 2.5, size = n_rows)

In [14]:
#drift_report = Report(metrics=[DataDriftPreset(), TargetDriftPreset()])
drift_report = Report(metrics=[DataDriftPreset()])

drift_report.run(reference_data = control_group, current_data = personalized_group)
drift_report.save_html("ab_distributions_report_v02.html")

  terms = (f_obs_float - f_exp)**2 / f_exp
  terms = (f_obs_float - f_exp)**2 / f_exp


FileNotFoundError: [Errno 2] No such file or directory: '/Users/PeterNovak/Desktop/ab-testing/evidently/nbextension/static/index.js'

# Nn

In [15]:
dataset = FeatureStoreOfflineClient.run_athena_query_pandas(distribution_dataset_query_v03, params)

DummyStatsClient._send unknown.athena_query_runs_total:1|c


InvalidArgumentValue: Exception parsing query. Root error message: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 78:5: mismatched input 'FROM'. Expecting: '*', <expression>, <identifier>

In [None]:
dataset