In [None]:
## Note that the following install takes quite a long time.
## It emits warnings about dependency issues, but these can be ignored when running on JupyterHub.

# %pip install -qq SyntheticControlMethods

In [None]:
import pydata_google_auth
import pandas_gbq
import pandas as pd
!pip install SyntheticControlMethods --quiet 

In [None]:
# Project passed to the BigQuery reader as the first-choice project.
PROJECT_ID = 'uw-data-platform-prod'

# Obtain user credentials restricted to the BigQuery OAuth scope.
auth = pydata_google_auth.get_user_credentials(
    scopes=["https://www.googleapis.com/auth/bigquery"],
)

In [None]:
# Query text selecting every distinct partner_position_id from the daily master-data table.
# NOTE(review): ids_q is not referenced by any other cell visible in this notebook — confirm it is still needed.
ids_q = """SELECT DISTINCT partner_position_id FROM 
`uw-partner-prod.metrics_analysis.partner_position_master_data_daily` 
"""

In [None]:
# Feature columns requested from the master-data table.
observed_features = [
    'jtc_applications_last_3m',
    'articles_unique_days_viewed_last_1m',
    'prospects_unique_days_viewed_last_1m',
    'learning_plans_started_last_1m',
    'days_logged_into_portal_last_3m',
    'customers_gathered_last_1m',
]

# Columns selected ahead of the features; the query orders by the first two.
_id = [
    'partner_position_id',
    'snapshot_date',
    'avg_monthly_ri_earned_last_1y',
]

In [None]:
## The experiment population file; a copy can be downloaded from
## https://drive.google.com/file/d/1VWyqF_lzN3ABcsyrkTF-HayiUO3F_Gsr/view?usp=sharing
population = pd.read_csv('../dashboard_experiment.csv')

# Plain Python list of the IDs, used to build the SQL filter.
population_ids = population['partner_position_id'].tolist()

In [None]:
# Assemble the BigQuery feature query for the experiment population.
# NOTE(review): the IDs are interpolated into the SQL text unescaped; this assumes
# the CSV holds plain string IDs without quote characters — confirm.
_id_list = "', '".join(population_ids)
# str.join never returns None, so the "no filter" case has to be keyed on whether
# any IDs were supplied; with no IDs the query is left unfiltered.
ids_pattern = f"WHERE partner_position_id IN ('{_id_list}')" if population_ids else ""
q_pattern = "SELECT {} FROM `uw-partner-prod.metrics_analysis.partner_position_master_data_daily` {} ORDER BY 1, 2"
# Selected columns are the _id columns followed by the observed features, so
# "ORDER BY 1, 2" sorts on the first two _id columns.
features_query = q_pattern.format(' ,'.join(_id + observed_features), ids_pattern)


In [None]:
# Download the feature rows from BigQuery. pandas_gbq.read_gbq is called directly
# because pandas.read_gbq is only a deprecated wrapper around it.
# On ValueError the same query is retried against a fallback project.
# NOTE(review): which failure surfaces as ValueError is not visible in this notebook — confirm.
try:
    resp_df = pandas_gbq.read_gbq(features_query, PROJECT_ID, credentials=auth, use_bqstorage_api=True)
except ValueError:
    resp_df = pandas_gbq.read_gbq(features_query, 'uw-data-warehouse-prod', credentials=auth, use_bqstorage_api=True)

In [None]:
# Attach the is_test flag to every feature row (inner join on partner_position_id),
# then average all columns per (is_test, snapshot_date).
grouped_df = (
    resp_df
    .set_index('partner_position_id')
    .join(population.set_index('partner_position_id')['is_test'], how='inner')
    .groupby(['is_test', 'snapshot_date'])
    .mean()
)

In [None]:
# Label each population row with its experiment arm. Rows start as 'None'; the
# flags are applied in order, so a row flagged as both ends up as 'Control'.
population['experiment_group'] = 'None'
for flag_column, label in (('is_test', 'Test'), ('is_control', 'Control')):
    population.loc[population[flag_column], 'experiment_group'] = label

In [None]:
# Feature rows joined (inner, on partner_position_id) with the experiment flags
# and the experiment_group label held on the population frame.
data_df = (
    resp_df
    .set_index('partner_position_id')
    .join(
        population.set_index('partner_position_id')[['is_test', 'is_control', 'experiment_group']],
        how='inner',
    )
)

In [None]:
# Flag rows where at least one customer was gathered in the last month.
data_df['has_gathered_last_1m'] = data_df['customers_gathered_last_1m'].gt(0)

In [None]:
# Columns carried into the control/treatment aggregates: the date, the earnings
# average, the observed features and the derived has_gathered flag.
columns = [
    'snapshot_date',
    'avg_monthly_ri_earned_last_1y',
    'jtc_applications_last_3m',
    'articles_unique_days_viewed_last_1m',
    'prospects_unique_days_viewed_last_1m',
    'learning_plans_started_last_1m',
    'days_logged_into_portal_last_3m',
    'customers_gathered_last_1m',
    'has_gathered_last_1m',
]

In [None]:
# control_ids.pkl can be downloaded from
# https://drive.google.com/file/d/10ly8E4VUwillgW-HamvElkfVv4g1pKuP/view?usp=sharing
# NOTE: unpickling can execute arbitrary code embedded in the file; only load a
# copy obtained from a trusted source.

import pickle

with open('control_ids.pkl', mode='rb') as pickle_file:
    control_ids = pickle.load(pickle_file)

In [None]:
# Build N_controls pseudo-units ("CONTROL0", "CONTROL1", ...) by averaging the
# control rows over consecutive slices of controls_size IDs from control_ids.
N_controls = 10
controls_size = 40
RS = 118  # only referenced by the commented-out sampling code below

# NOTE(review): the float cast of avg_monthly_ri_earned_last_1y happens in a later
# cell, after this slice is taken — confirm the column aggregates as intended here.
control_group = data_df.loc[data_df.is_control, columns].reset_index()

# Presumably how the pickled control_ids were originally drawn — verify:
# control_ids = (control_group
#                .loc[control_group.snapshot_date == '2021-03-09', 'partner_position_id']
#                .drop_duplicates()
#                .sample(controls_size * N_controls, random_state=RS)
#                .tolist())

# Consecutive, non-overlapping slices of control_ids, one per pseudo-unit.
small_controls = [
    control_ids[k * controls_size: (k + 1) * controls_size]
    for k in range(N_controls)
]

# For each slice keep only its members' rows and relabel them with the pseudo-unit name.
controls = [
    control_group[control_group.partner_position_id.isin(members)]
    .reset_index(drop=True)
    .assign(partner_position_id=f'CONTROL{idx}')
    .set_index('partner_position_id')
    for idx, members in enumerate(small_controls)
]

# Average every column per pseudo-unit and snapshot date.
control_group = (
    pd.concat(controls)
    .groupby(['partner_position_id', 'snapshot_date'])
    .mean()
    .reset_index()
)

In [None]:
# Cast the earnings column to float (this overwrites the column on data_df itself).
data_df['avg_monthly_ri_earned_last_1y'] = data_df['avg_monthly_ri_earned_last_1y'].astype(float)

# Collapse all test rows into one averaged unit per snapshot date, labelled 'TREATMENT'.
treatment_group = (
    data_df.loc[data_df.is_test, columns]
    .groupby('snapshot_date')
    .mean()
    .reset_index()
    .assign(partner_position_id='TREATMENT')
)

In [None]:
# Stack the treatment unit on top of the control pseudo-units, with
# partner_position_id restored as an ordinary column afterwards.
data_prepped = pd.concat(
    [group.set_index('partner_position_id') for group in (treatment_group, control_group)]
).reset_index()

In [None]:
# Re-express snapshot_date as whole days relative to 2021-03-11.
# NOTE: the column is overwritten in place, so this cell is not safe to run twice
# (the second run would apply datetime arithmetic to the integer day offsets).
data_prepped['snapshot_date'] = (
    data_prepped['snapshot_date']
    .sub(pd.to_datetime('2021-03-11'))
    .dt.days
)

In [None]:
import SyntheticControlMethods as scm

In [None]:
import matplotlib.pyplot as plt

# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the
# old name no longer resolves on current releases; use the new name when this
# matplotlib provides it and fall back to the old one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

_df = data_prepped.set_index('snapshot_date')

# Only the last five unit IDs in sorted order are drawn; computed once because
# the selection does not depend on the feature being plotted.
_plot_ids = sorted(_df.partner_position_id.unique())[-5:]

# One figure per column, skipping the first column and the last two columns of _df.
# NOTE(review): presumably that skips partner_position_id and the two
# gathered-customer columns — verify against the actual column order.
for f in _df.columns[1:-2]:
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.set_title(f)
    for pid in _plot_ids:
        _df.loc[_df.partner_position_id == pid, f].plot(ax=ax, label=pid)
    ax.legend()

In [None]:
# Synthetic control for prospects_unique_days_viewed_last_1m.
# NOTE(review): the positional arguments are presumably (data, outcome column,
# unit-id column, time column, treatment period, treated unit) — verify against
# the SyntheticControlMethods documentation.
sc = scm.Synth(
    data_prepped,
    "prospects_unique_days_viewed_last_1m",
    "partner_position_id",
    "snapshot_date",
    1,
    "TREATMENT",
    pen=0,
)
sc.plot(
    ["original", "pointwise"],
    treated_label="Test Group",
    synth_label="Synthetic Control",
    treatment_label="Intervention",
)

In [None]:
# Synthetic control for articles_unique_days_viewed_last_1m.
# NOTE(review): the positional arguments are presumably (data, outcome column,
# unit-id column, time column, treatment period, treated unit) — verify against
# the SyntheticControlMethods documentation.
sc = scm.Synth(
    data_prepped,
    "articles_unique_days_viewed_last_1m",
    "partner_position_id",
    "snapshot_date",
    1,
    "TREATMENT",
    pen=0,
)
sc.plot(
    ["original", "pointwise"],
    treated_label="Test Group",
    synth_label="Synthetic Control",
    treatment_label="Intervention",
)

In [None]:
# Synthetic control for customers_gathered_last_1m.
# NOTE(review): the positional arguments are presumably (data, outcome column,
# unit-id column, time column, treatment period, treated unit) — verify against
# the SyntheticControlMethods documentation.
sc = scm.Synth(
    data_prepped,
    "customers_gathered_last_1m",
    "partner_position_id",
    "snapshot_date",
    1,
    "TREATMENT",
    pen=0,
)
sc.plot(
    ["original", "pointwise"],
    treated_label="Test Group",
    synth_label="Synthetic Control",
    treatment_label="Intervention",
)

In [None]:
# Synthetic control for has_gathered_last_1m (the group-averaged "gathered at
# least one customer" flag).
# NOTE(review): the positional arguments are presumably (data, outcome column,
# unit-id column, time column, treatment period, treated unit) — verify against
# the SyntheticControlMethods documentation.
sc = scm.Synth(
    data_prepped,
    "has_gathered_last_1m",
    "partner_position_id",
    "snapshot_date",
    1,
    "TREATMENT",
    pen=0,
)
sc.plot(
    ["original", "pointwise"],
    treated_label="Test Group",
    synth_label="Synthetic Control",
    treatment_label="Intervention",
)