In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from scipy.stats import ttest_ind

In [None]:
def normalize(pre_df, post_df):
    """Merge pre/post download tables and rescale each row by its early mean.

    The two frames are inner-joined on ``'Model'``. Every column after the
    first one of the merged frame is treated as a week of downloads, and each
    row is divided by the mean of that row's first three such columns. The
    result is then reshaped to long format.

    Parameters
    ----------
    pre_df, post_df : pandas.DataFrame
        Wide tables that both contain a ``'Model'`` column. ``'Model'`` is
        assumed to be the first column of the merged frame (the week columns
        are selected positionally), and the remaining column labels must be
        parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with the columns ``'Model'``, ``'week'`` (datetime)
        and ``'Normalized Downloads'``.

    Notes
    -----
    A row whose first three week columns average to zero is divided by zero,
    so its normalized values come out as ``inf``/``NaN``.
    """
    merged_df = pd.merge(pre_df, post_df, on='Model')

    week_cols = merged_df.columns.to_list()[1:]

    # Per-row baseline: mean of the first three week columns.
    baseline = merged_df.iloc[:, 1:4].mean(axis=1)

    # Replace the week columns instead of writing into them with
    # ``iloc[:, 1:] = ...``: integer download counts cannot hold the float
    # ratios, and in-place setting with an incompatible dtype is deprecated
    # in pandas >= 2.1 and slated to become an error.
    merged_df[week_cols] = merged_df[week_cols].div(baseline, axis=0)

    df = pd.melt(
        merged_df,
        id_vars=['Model'],
        value_vars=week_cols,
        var_name='week',
        value_name='Normalized Downloads'
    )

    df['week'] = pd.to_datetime(df['week'])

    return df


def two_bin(df, MERGE_DATE, on='Model'):
    """Compare before/after download changes between treatment and control.

    For every unique value of ``df[on]`` the change in mean
    ``'Normalized Downloads'`` is computed separately for the rows labelled
    ``'Treatment'`` and ``'Control'``: the mean over weeks on or after
    ``MERGE_DATE`` minus the mean over weeks before it. The two sets of
    changes are compared with a one-sided Welch t-test
    (``alternative='greater'``, ``equal_var=False``), shown as a box plot,
    and summarised on stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with the columns ``'Treatment'``, ``'week'``,
        ``'Normalized Downloads'`` and the column named by ``on``.
    MERGE_DATE : comparable with the values of ``df['week']``
        Cut-off separating the "before" weeks (``<``) from the "after"
        weeks (``>=``).
    on : str, default 'Model'
        Column whose unique values define the units being compared.

    Returns
    -------
    None
        Results are printed and plotted, not returned.

    Notes
    -----
    The printed effect size is the difference of the group means divided by
    ``np.std`` of all changes pooled together (NumPy's default population
    standard deviation), not a within-group pooled standard deviation.
    """
    value_col = 'Normalized Downloads'

    def mean_change(rows):
        """Return mean after the cut-off minus mean before it (NaN if a side is empty)."""
        before = rows.loc[rows['week'] < MERGE_DATE, value_col].mean()
        after = rows.loc[rows['week'] >= MERGE_DATE, value_col].mean()
        return after - before

    df_treatment = df[df['Treatment'] == 'Treatment']
    df_control = df[df['Treatment'] == 'Control']
    delta_treatment = []
    delta_control = []

    for model in df[on].unique():
        change_treatment = mean_change(df_treatment[df_treatment[on] == model])
        change_control = mean_change(df_control[df_control[on] == model])
        has_treatment = not np.isnan(change_treatment)
        has_control = not np.isnan(change_control)

        # The two groups are checked independently. An ``if/elif`` here would
        # silently drop the control change of any key that also has a valid
        # treatment change (possible whenever ``on`` is shared across groups).
        if has_treatment:
            delta_treatment.append(change_treatment)
        if has_control:
            delta_control.append(change_control)
        if not (has_treatment or has_control):
            print(f"ERROR: Both change in downloads are NaN for {model}")

    t_stat, p_val = ttest_ind(delta_treatment, delta_control, equal_var=False, alternative='greater')

    def create_plot():
        """Draw side-by-side box plots of the per-unit changes for both groups."""
        treatment_df = pd.DataFrame({'delta': delta_treatment, 'group': 'Treatment'})
        control_df = pd.DataFrame({'delta': delta_control, 'group': 'Control'})

        combined_df = pd.concat([treatment_df, control_df], ignore_index=True)

        plt.figure(figsize=(3, 4))
        sns.set(style="whitegrid")
        ax = sns.boxplot(
            x='group',
            y='delta',
            data=combined_df,
            # Fix the category order so the palette always maps
            # maroon -> Treatment and lightgrey -> Control, even when one
            # group has no data.
            order=['Treatment', 'Control'],
            palette=['maroon', 'lightgrey'],
            width=0.5
        )

        sns.despine(left=True)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)

        ax.set_xlabel('')
        ax.set_ylabel('Weekly Downloads Change (%)')
        ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{:,.0%}'.format(x)))
        # Lay out only after the labels exist so the y-label is accounted for.
        plt.tight_layout()
        plt.show()

    create_plot()

    mean_diff = np.mean(delta_treatment) - np.mean(delta_control)
    # List concatenation: standard deviation over all changes from both groups.
    std_diff = np.std(delta_treatment + delta_control)
    effect_size = mean_diff / std_diff

    print(f"Mean change in downloads for treatment group: {np.mean(delta_treatment):.4f}")
    print(f"Mean change in downloads for control group: {np.mean(delta_control):.4f}")
    print(f"Standard deviation of change in downloads: {std_diff:.4f}")
    print(f"P-value: {p_val:.4f}")
    print(f"Effect size: {effect_size:.4f}")

    return

In [None]:
# Batch 1: load the pickled study data, normalize each arm, and stack them.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('batch1_data.pkl', 'rb') as f:
    all_data_dict = pickle.load(f)

# Treatment arm, tagged with its group label.
df_treatment = normalize(
    all_data_dict['pre_treatment']['treatment_group'],
    all_data_dict['post_treatment']['treatment_group'],
).assign(Treatment='Treatment')

# Control arm, tagged with its group label.
df_control = normalize(
    all_data_dict['pre_treatment']['control_group'],
    all_data_dict['post_treatment']['control_group'],
).assign(Treatment='Control')

# One long frame for both arms, carrying the batch's merge date on every row.
df_short_term_batch_1 = pd.concat([df_treatment, df_control], ignore_index=True)
df_short_term_batch_1['MERGE_DATE'] = all_data_dict['MERGE_DATE']

In [None]:
# Preview the first ten rows of the combined batch-1 frame.
df_short_term_batch_1.iloc[:10]

In [None]:
# Number of distinct models in the batch-1 treatment arm.
np.unique(
    df_short_term_batch_1.loc[df_short_term_batch_1['Treatment'] == 'Treatment', 'Model']
).size

In [None]:
# Output directory for the exported CSVs. The default is the original
# machine-specific location; set the HF_RCT_ANALYSIS_DIR environment variable
# to write elsewhere without editing the notebook.
dir_data = os.path.join(
    os.environ.get(
        'HF_RCT_ANALYSIS_DIR',
        '/Users/yiquntchen/Desktop/winter-2023/HF-RCT-Model-Card-Intervention-Study/analysis_YC/',
    ),
    '',  # guarantees a trailing separator, which `dir_data + <file name>` relies on
)
df_short_term_batch_1.to_csv(dir_data+'df_short_term_batch_1_norm_first_week.csv')

In [None]:
# Batch 2: same pipeline as batch 1, followed by the CSV export.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('batch2_data.pkl', 'rb') as f:
    all_data_dict = pickle.load(f)

# Treatment arm, tagged with its group label.
df_treatment = normalize(
    all_data_dict['pre_treatment']['treatment_group'],
    all_data_dict['post_treatment']['treatment_group'],
).assign(Treatment='Treatment')

# Control arm, tagged with its group label.
df_control = normalize(
    all_data_dict['pre_treatment']['control_group'],
    all_data_dict['post_treatment']['control_group'],
).assign(Treatment='Control')

# One long frame for both arms, carrying the batch's merge date on every row.
df_short_term_batch_2 = pd.concat([df_treatment, df_control], ignore_index=True)
df_short_term_batch_2['MERGE_DATE'] = all_data_dict['MERGE_DATE']

# `dir_data` is defined in the batch-1 export cell above.
df_short_term_batch_2.to_csv(dir_data + 'df_short_term_batch_2_norm_first_week.csv')

In [None]:
# Number of distinct models in the batch-2 treatment arm.
np.unique(
    df_short_term_batch_2.loc[df_short_term_batch_2['Treatment'] == 'Treatment', 'Model']
).size