"""
You might want to test whether the average age of clients engaging with the new process is the same as that of clients engaging with the old process.
"""

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as st

%matplotlib inline

In [6]:
# Load the per-step client event log, parse its timestamps, and keep only
# fully populated rows.
# NOTE(review): the path below is an absolute, machine-specific location;
# the cell only runs where that exact file exists.
file_path_clients = "C:/Users/Srnzzz/Documents/GitHub/vanguard-ab-test/client_old_new_df.csv"
df_clients = (
    pd.read_csv(file_path_clients)
    .assign(date_time=lambda frame: pd.to_datetime(frame['date_time']))
)
# dropna() with no arguments removes a row if ANY of its columns is missing.
df_clients_clean = df_clients.dropna()

In [10]:
# Display the cleaned event log (one row per logged process step); the
# notebook renders a truncated head/tail view of the frame.
df_clients_clean

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,Test
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,Test
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,Test
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,Test
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,Test
...,...,...,...,...,...,...
417650,1574008,117364417_77840596075,528720790_71583064618_169151,start,2017-05-06 23:43:27,Test
417651,2908510,814969699_90652851448,562606085_36368381773_92090,start,2017-05-10 22:57:17,Control
417652,2908510,814969699_90652851448,562606085_36368381773_92090,step_2,2017-05-10 22:56:31,Control
417653,2908510,814969699_90652851448,562606085_36368381773_92090,step_1,2017-05-10 22:56:23,Control


In [8]:
# Load the client demographics table (one row per client_id in the preview
# below, including the clnt_age column used later in this notebook).
# NOTE(review): absolute, machine-specific path -- only resolves on the
# original author's machine.
file_path_sample = "C:/Users/Srnzzz/Documents/GitHub/vanguard-ab-test/new_final_demo.csv"
df_sample = pd.read_csv(file_path_sample)
# Bare last expression: show the loaded frame as the cell output.
df_sample

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,age_groups
0,836976,6.0,73.0,60.5,U,2.0,45105.30,6.0,9.0,senior
1,2304905,7.0,94.0,58.0,U,2.0,110860.30,6.0,9.0,adult
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0,adult
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0,adult
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0,adult
...,...,...,...,...,...,...,...,...,...,...
70604,7993686,4.0,56.0,38.5,U,3.0,1411062.68,5.0,5.0,adult
70605,8981690,12.0,148.0,31.0,M,2.0,101867.07,6.0,6.0,adult
70606,333913,16.0,198.0,61.5,F,2.0,40745.00,3.0,3.0,senior
70607,1573142,21.0,255.0,68.0,M,3.0,475114.69,4.0,4.0,senior


In [12]:
# Attach each client's age to every event row. A left join keeps all event
# rows; clients absent from the demographics table get NaN in clnt_age.
df_merged_age = pd.merge(
    df_clients_clean,
    df_sample.loc[:, ['client_id', 'clnt_age']],
    how='left',
    on='client_id',
)
df_merged_age

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_age
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,Test,79.0
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,Test,79.0
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,Test,79.0
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,Test,79.0
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,Test,79.0
...,...,...,...,...,...,...,...
321304,1574008,117364417_77840596075,528720790_71583064618_169151,start,2017-05-06 23:43:27,Test,55.0
321305,2908510,814969699_90652851448,562606085_36368381773_92090,start,2017-05-10 22:57:17,Control,34.0
321306,2908510,814969699_90652851448,562606085_36368381773_92090,step_2,2017-05-10 22:56:31,Control,34.0
321307,2908510,814969699_90652851448,562606085_36368381773_92090,step_1,2017-05-10 22:56:23,Control,34.0


In [14]:
# Mean age per experiment arm, taken over the merged event rows.
# NOTE: df_merged_age has one row per logged step, so a client with many
# events contributes many times to this mean; it is not a per-client average.
average_age_by_variation = (
    df_merged_age
    .groupby('Variation')['clnt_age']
    .mean()
    .rename('Average_Age')  # label the aggregated column for clarity
    .reset_index()
)

# Show the two-row summary table
print(average_age_by_variation)

  Variation  Average_Age
0   Control    48.309193
1      Test    48.749240


In [16]:
# Every event row whose Variation is anything other than 'Test'
# (the preview below shows these are the Control rows).
df_no_test = df_merged_age.loc[df_merged_age['Variation'].ne('Test')]
df_no_test

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_age
13,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:29:03,Control,63.5
14,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:29:01,Control,63.5
15,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:28:52,Control,63.5
16,4033851,762728880_76361333336,949661017_22392791362_127391,step_3,2017-04-05 12:26:08,Control,63.5
17,4033851,762728880_76361333336,949661017_22392791362_127391,step_2,2017-04-05 12:24:43,Control,63.5
...,...,...,...,...,...,...,...
321297,433098,5505424_50616523266,773565250_67577573147_389841,start,2017-05-13 15:30:47,Control,25.0
321305,2908510,814969699_90652851448,562606085_36368381773_92090,start,2017-05-10 22:57:17,Control,34.0
321306,2908510,814969699_90652851448,562606085_36368381773_92090,step_2,2017-05-10 22:56:31,Control,34.0
321307,2908510,814969699_90652851448,562606085_36368381773_92090,step_1,2017-05-10 22:56:23,Control,34.0


In [28]:
# Event rows belonging to the 'Test' variation only.
df_test = df_merged_age.loc[df_merged_age['Variation'].eq('Test')]
df_test

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,clnt_age
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,Test,79.0
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,Test,79.0
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,Test,79.0
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,Test,79.0
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,Test,79.0
...,...,...,...,...,...,...,...
321300,1574008,117364417_77840596075,528720790_71583064618_169151,confirm,2017-05-06 23:51:05,Test,55.0
321301,1574008,117364417_77840596075,528720790_71583064618_169151,step_3,2017-05-06 23:50:40,Test,55.0
321302,1574008,117364417_77840596075,528720790_71583064618_169151,step_2,2017-05-06 23:43:47,Test,55.0
321303,1574008,117364417_77840596075,528720790_71583064618_169151,step_1,2017-05-06 23:43:34,Test,55.0


In [37]:
# Collapse the non-Test rows to a single age per client_id, so each client
# counts once regardless of how many events they logged.
df_no_test_age = (
    df_no_test
    .groupby('client_id')['clnt_age']
    .mean()
    .reset_index()
)
# Despite the "_list" suffix this is a pandas Series of per-client ages.
df_no_test_age_list = df_no_test_age.loc[:, "clnt_age"]
# Per-client mean age for this group (cell output).
df_no_test_age_list.mean()

47.498157973856976

In [34]:
# Collapse the Test rows to a single age per client_id, so each client
# counts once regardless of how many events they logged.
df_test_age = (
    df_test
    .groupby('client_id')['clnt_age']
    .mean()
    .reset_index()
)
# Despite the "_list" suffix this is a pandas Series of per-client ages.
df_test_age_list = df_test_age.loc[:, "clnt_age"]
# Per-client mean age for this group (cell output).
df_test_age_list.mean()

47.16373463664106

In [38]:
# Two-sample t-test for average client AGE (one observation per client_id)
# H0: test_average_age == control_average_age
# H1: test_average_age != control_average_age

# Row-level frames with incomplete rows removed. They are NOT inputs to the
# test below (the test runs on the per-client age Series); they are kept only
# so that any other cell referencing these names keeps working.
df_test_clean = df_test.dropna()
df_no_test_clean = df_no_test.dropna()

# Drop missing ages from the Series that are actually tested: the left merge
# can leave clnt_age as NaN, and ttest_ind's default nan_policy ("propagate")
# would then return NaN for both the statistic and the p-value.
# equal_var is left at its default (Student's t-test, pooled variance);
# pass equal_var=False for Welch's test if the group variances differ.
st.ttest_ind(
    df_test_age_list.dropna(),
    df_no_test_age_list.dropna(),
    alternative="two-sided",
)

TtestResult(statistic=-2.4161741065932425, pvalue=0.01568808321554057, df=50498.0)