In [2]:
# imports autoreload for on-the-fly modifications
from importlib import reload

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [64]:
# import functions file
import functions as f
reload(f)
%aimport functions

# import modules
import os

import scipy as sc

import pandas as pd

import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from sqlalchemy import create_engine, text, inspect, Table, Column, Integer, String, MetaData, ForeignKey

import scipy.stats as st
from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats.contingency import association

from statsmodels.stats.proportion import proportions_ztest


from dotenv import load_dotenv

from datetime import datetime

In [4]:
import warnings
warnings.filterwarnings('ignore')

# **Importing datasheets**

In [5]:
# demographic data from clients
final_demo = pd.read_csv('sources/raw/df_final_demo.txt', sep=',')

# test vs control groups
final_exp = pd.read_csv('sources/raw/df_final_experiment_clients.txt', sep=',')

# raw data from clients
data1 = pd.read_csv('sources/raw/df_final_web_data_pt_1.txt', sep=',')
data2 = pd.read_csv('sources/raw/df_final_web_data_pt_2.txt', sep=',')

In [None]:
display(final_demo.head(5))
display(final_exp.head(5))
display(data1.head(5))

# **Creating an engine**

In [7]:
# SQL credentials in .env
load_dotenv()
password = os.getenv('PASSWORD')

# Create the Database
database_name = 'project5'
# Set Up Database Connection
engine = create_engine(f'mysql+pymysql://root:{password}@localhost')
# Create Database if it Doesn't Exist
with engine.connect() as conn:
    conn.execute(text(f'CREATE DATABASE IF NOT EXISTS {database_name}'))
# Connect to the Newly Created Database
engine = create_engine(f'mysql+pymysql://root:{password}@localhost/{database_name}')

## **METADATA**
- **client_id**: Every client’s unique ID.
- **variation**: Indicates if a client was part of the experiment.
- **visitor_id**: A unique ID for each client-device combination.
- **visit_id**: A unique ID for each web visit/session.
- **process_step**: Marks each step in the digital process.
- **date_time**: Timestamp of each web activity.
- **clnt_tenure_yr**: Represents how long the client has been with Vanguard, measured in years.
- **clnt_tenure_mnth**: Further breaks down the client’s tenure with Vanguard in months.
- **clnt_age**: Indicates the age of the client.
- **gendr**: Specifies the client’s gender.
- **num_accts**: Denotes the number of accounts the client holds with Vanguard.
- **bal**: Gives the total balance spread across all accounts for a particular client.
- **calls_6_mnth**: Records the number of times the client reached out over a call in the past six months.
- **logons_6_mnth**: Reflects the frequency with which the client logged onto Vanguard’s platform over the last six months.

# **DEMOGRAPHICS**

In [None]:
# first inspection on the demographics file
display(final_demo.shape)
display(final_demo.head())
display(final_demo.isna().sum())

In [None]:
# Removing NaN rows
final_demo2 = final_demo[final_demo.isnull().sum(axis=1) < 2]
final_demo2

In [None]:
# Gender fix
final_demo2["gendr"] = final_demo2["gendr"].replace({"X":"U"})
final_demo2["gendr"].unique()

In [11]:
# Rename the columns
final_demo2 = final_demo2.rename(columns={"clnt_tenure_yr":"tenure_year", "clnt_tenure_mnth":"tenure_month", "clnt_age":"age", "gendr":"gender","num_accts":"number_accounts", "calls_6_mnth":"calls_6_months","logons_6_mnth":"logons_6_months"})

In [None]:
final_demo2["age_group"] = final_demo2["age"].apply(f.age_group)

display(final_demo2)
final_demo2.to_sql("demo2", con = engine, if_exists='replace')

# **First EDA on demography**

In [13]:
final_demo3 = final_demo2.copy()

In [None]:
check = ["tenure_month", "age", "number_accounts","bal" , "calls_6_months" , "logons_6_months"]

f.hist_box_plot(final_demo3, check)


In [None]:
f.categorical_display(final_demo3, "gender")
f.categorical_display(final_demo3, "age_group")

In [None]:
f.categorical_comparison(final_demo3, "age_group", "bal")

# bal by age group

In [17]:
# Group by 'age_group' and sum 'logons_6_months'
l1 = final_demo3.groupby(["age_group"])["logons_6_months"].sum().reset_index()

l1["order"]=l1["age_group"].map(f.order_group)
l1 = l1.sort_values(by="order")
l1 = l1.set_index("order")

In [None]:
# Create a bar plot using the aggregated counts
sns.barplot(data=l1, x='age_group', y='logons_6_months')

In [19]:
# Group by 'age_group' and sum 'logons_6_months'
l2 = final_demo3.groupby(["age_group", "gender"])["logons_6_months"].mean().reset_index()


In [None]:
l2["order"]=l2["age_group"].map(f.order_group)
l2["order"]=l2["order"]+l2["gender"].map(f.order_gender)
l2 = l2.sort_values(by="order")
l2 = l2.set_index("order")

l2

In [None]:
# Create a bar plot
sns.barplot(data=l2, x='age_group', y='logons_6_months', hue='gender')

In [None]:
# Group by 'age_group' and sum 'logons_6_months'
l3 = final_demo3.groupby(["age_group", "gender"])["tenure_month"].mean().reset_index()

l3["order"]=l3["age_group"].map(f.order_group)
l3["order"]=l3["order"]+l3["gender"].map(f.order_gender)
l3 = l3.sort_values(by="order")
l3 = l3.set_index("order")

# Create a bar plot
sns.barplot(data=l3, x='age_group', y='tenure_month', hue='gender')

# Move the legend outside the plot
plt.legend(title='gender', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
l3




##### **1. Who are the primary clients using this online process?**

From these barplots we can clearly see that the primary client (the one with most usage) is the old adult category (ages between 50-65).
Gender wise, Unspecified lead the race, with Males following close behind.
Once all data is being grouped, we see that young adults (age 18-35) with unspecified gender have the highest usage, followed by both male and female old adults.

#####  **2. Are the primary clients younger or older, new or long-standing?**

Regarding tenure, we clearly see Elder (age > 65) has the higher numbers, with an average of over 200 months for both Male and Female genders, followed by teenagers (age < 18) and old adults with around 195 months on average.


#### SECOND DATASET

In [None]:
display(final_exp.shape)
display(final_exp.head())
final_exp.isna().sum()

#output to sql
#final_exp.to_sql("experience", con = engine, if_exists='replace')

#### THIRD DATASET + FOURTH DATASET

In [None]:
display(data1.shape)
display(data1.head(3))
display(data2.shape)
display(data2.head(3))

In [25]:
# joins data1 and data2 files, sends to SQL
t_data = pd.concat([data1, data2], axis=0)
#t_data.to_sql("dataset", con = engine, if_exists='replace')

# Starting A/B construct

In [None]:
# using SQL, joins tables and retrieves the association between the datasheet and the variation table.
"""
statement = (""" """
            SELECT d.client_id, e.Variation, d.visitor_id, d.visit_id, d.process_step, d.date_time
            FROM dataset as d
            LEFT JOIN experience as e             
            ON d.client_id = e.client_id
            WHERE e.Variation IS NOT NULL
            ;
            """ """)
with engine.connect() as con:
    try:
        # Execute the query to fetch results
        answer = pd.read_sql(statement, con)

    except Exception as e:
        print(f"An error occurred: {e}")
        
answer.to_sql("dataset2", con=engine, if_exists="replace")
answer.to_csv("sources/clean/answer.csv")
answer
"""

In [None]:
answer = pd.read_csv("sources/clean/answer.csv")

## Cleanup 
display(answer.shape)
answer = answer.drop_duplicates().sort_values(by="date_time").drop("Unnamed: 0", axis=1).reset_index(drop=True)
answer = answer.drop("visitor_id", axis=1)

answer2 = answer.copy()

answer2['date_time'] = pd.to_datetime(answer2['date_time'])

answer_clean = f.clean_data(answer2, "answer_clean")
answer_clean.to_csv("sources/clean/answer_clean.csv")

answer_clean = pd.read_csv("sources/clean/answer_clean.csv", index_col=0)

#display(answer_clean.head(3))

answer_clean['start_date'] = pd.to_datetime(answer_clean['start_date'])
answer_clean['step_1_date'] = pd.to_datetime(answer_clean['step_1_date'])
answer_clean['step_2_date'] = pd.to_datetime(answer_clean['step_2_date'])
answer_clean['step_3_date'] = pd.to_datetime(answer_clean['step_3_date'])
answer_clean['confirm_date'] = pd.to_datetime(answer_clean['confirm_date'])

#display(answer_clean.dtypes)

answer_clean["SS1"] = (answer_clean['step_1_date'] - answer_clean['start_date']).dt.total_seconds()
answer_clean["S1S2"] = (answer_clean['step_2_date'] - answer_clean['step_1_date']).dt.total_seconds()
answer_clean["S2S3"] = (answer_clean['step_3_date'] - answer_clean['step_2_date']).dt.total_seconds()
answer_clean["S3C"] = (answer_clean['confirm_date'] - answer_clean['step_3_date']).dt.total_seconds()
answer_clean["SC"] = (answer_clean['confirm_date'] - answer_clean['start_date']).dt.total_seconds()

answer_clean["completed"] = answer_clean.apply(lambda row: 1 if (
    pd.notna(row["start_date"]) & 
    pd.notna(row["confirm_date"])) else 0, axis=1)

#display(answer_clean.head(3))

# splitting A
A_answer = answer_clean[answer_clean["variation"]=="Test"].reset_index(drop=True)
A_answer.to_csv("sources/clean/A_answer.csv")

# splitting B
B_answer = answer_clean[answer_clean["variation"]=="Control"].reset_index(drop=True)
B_answer.to_csv("sources/clean/B_answer.csv")


In [28]:
A_clean = pd.read_csv("sources/clean/A_answer.csv", index_col=0)
B_clean = pd.read_csv("sources/clean/B_answer.csv", index_col=0)

#### Data grouping

#### Ho -> time per step is equal for both features

In [None]:
steps = ["SS1", "S1S2" , "S2S3", "S3C","SC"]
sign = 0.05

for step in steps:
    s_stat, s_pval = st.ttest_ind(A_clean[step].dropna(),B_clean[step].dropna(), equal_var=False)
    print(f"{step}: stat = {round(s_stat,3)}, p-value = {round(s_pval,3)}")
    if s_pval > sign:
        print(f"For {step} we cannot reject H0: our samples are similar.")
    if s_pval < sign:
        print(f"For {step} we reject H0: our samples are different.")
    print(A_clean[step].mean(),B_clean[step].mean(),"\n")


In [None]:
# test
A_plot=A_clean[["SS1", "S1S2","S2S3","S3C","SC"]]
sns.lineplot(A_plot.mean(), color = "red" )

# control
B_plot=B_clean[["SS1", "S1S2","S2S3","S3C", "SC"]]
sns.lineplot(B_plot.mean(), color = "green" )

row = "SC"
mean_A = A_clean[row].mean()
mean_B = B_clean[row].mean()

display(f"A:{mean_A} / B:{mean_B}")

#### H0 -> completion/success rate from the new feature is equal to the old feature


In [None]:
## Linh's code

A_clean['completed'] = A_clean['confirm_date'].notnull().astype(int)
B_clean['completed'] = B_clean['confirm_date'].notnull().astype(int)
total_A = len(A_clean)
total_B = len(B_clean)
completed_A = A_clean['completed'].sum()
completed_B = B_clean['completed'].sum()
not_completed_A = len(A_clean) - completed_A
not_completed_B = len(B_clean) - completed_B
# Calculate completion rates
completion_rate_A = completed_A / total_A
completion_rate_B = completed_B / total_B

thresh = completion_rate_B*1.05
#display(round(float(thresh),5))



data = pd.DataFrame({
    'Group': ['Test (New Design)','Control (Old Design)'],
    'Completion Rate': [completion_rate_A, completion_rate_B]
})
# Step 4: Create bar plot to compare completion rates
plt.figure(figsize=(8, 6))
sns.barplot(x='Group', y='Completion Rate', data=data, palette='muted')
plt.axhline(y=thresh, color='red', linestyle='--', label='Fixed Y Value')

# Add labels and title
plt.title('Comparison of Completion Rates Between Old and New Design')
plt.ylabel('Completion Rate')
plt.ylim(0, 1)
plt.show()
# Create the contingency table
contingency_table = np.array([[completed_A, not_completed_A], [completed_B, not_completed_B]])
print(contingency_table)
# Step 3: Perform the chi-squared test
chi2, p_value, _, __ = chi2_contingency(contingency_table)
# Step 4: Interpret the result
alpha = 0.05  # Significance level
print(f"Chi-squared statistic: {chi2:.4f}")
print(f"P-value: {p_value:.4f}")
if p_value < alpha:
    print("Reject the null hypothesis. The variables are independent.")
else:
    print("Fail to reject the null hypothesis. The variables are dependant.")
     
 
success = [completed_A, completed_B]
tests = [total_A, total_B]

alpha = 0.05

stats, p_value = proportions_ztest(success, tests)

if p_value < alpha:
    print(f"Because p_value: {round(p_value,4)} < alpha: {alpha} we can reject the null.")
else:
    print(f"Because p_value: {round(p_value,4)} > alpha: {alpha}, we cannot reject the null.")
    
display(round(completion_rate_A,3)*100)
display(round(completion_rate_B,3)*100)



In [None]:
#Design Effectiveness
merged_df = pd.merge(final_demo3, final_exp, on='client_id', how='inner')

group_counts = merged_df['Variation'].value_counts()

avg_age_gender_group = merged_df.groupby(['gender', 'Variation'])['age'].mean().reset_index()

test_design = merged_df[merged_df['Variation'] == "Test"]
control_design = merged_df[merged_df['Variation'] == "Control"]
control_design['age'] = control_design['age'].fillna(0).astype(int)
test_design['age'] = test_design['age'].fillna(0).astype(int)
test_design['gender'] = test_design['gender'].astype('category')
control_design['gender'] = control_design['gender'].astype('category')

#Was the experiment well-structured?
def check_structured(new, old):
    if new.shape[0] == old.shape[0]:
        return "Experiment was well structured"
    else:
        return "Imbalance"
    
check_structured(test_design, control_design)

#Were clients randomly and equally divided between the old and new designs?
avg_age_test = test_design['age'].mean()
avg_age_control = control_design['age'].mean()

gender_control = control_design['gender'].value_counts(normalize=True)
gender_test = test_design['gender'].value_counts(normalize=True)

#Visualize
plt.figure(figsize=(8, 6))
sns.barplot(x=group_counts.index, y=group_counts.values, palette='muted')
plt.title('Number of Clients in Test and Control Groups')
plt.xlabel('Group')
plt.ylabel('Number of Clients')
plt.show()

In [33]:
# process flow: start -> step_1 -> step_2 -> step_3 -> confirm

In [None]:
A_answer2 = answer[answer["Variation"]=="Test"]


A_grouped = A_answer2.groupby(["process_step"])["visit_id"].count().reset_index()

A_grouped["ordered"]=A_grouped["process_step"].map(f.ordered_grouped)
A_grouped.sort_values(by= "ordered", inplace=True)
A_grouped = A_grouped.set_index("ordered")

A_grouped["percentage"]=round(A_grouped["visit_id"]/A_grouped["visit_id"][0] * 100,2)
A_grouped

In [None]:
B_answer2 = answer[answer["Variation"]=="Control"]

B_grouped = B_answer2.groupby(["process_step"])["visit_id"].count().reset_index()

B_grouped["ordered"]=B_grouped["process_step"].map(f.ordered_grouped)
B_grouped.sort_values(by= "ordered", inplace=True)
B_grouped = B_grouped.set_index("ordered")

B_grouped["percentage"]=round(B_grouped["visit_id"]/B_grouped["visit_id"][0] * 100,2)
B_grouped

In [36]:
merged_df = pd.merge(A_grouped[['process_step', 'visit_id']], 
                     B_grouped[['process_step', 'visit_id']], 
                     on='process_step', suffixes=('_A', '_B'))

completation_rate_df = merged_df[merged_df['process_step'].isin(['start', 'confirm'])]

In [None]:
#Create a contingency table from merged dataframes
contingency_table = completation_rate_df[['visit_id_A', 'visit_id_B']]
print(contingency_table)

#Chi-square test
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")

# we conclude the samples are independent.

Cramer test

In [None]:
# Computing the association between variables in 'crosstab_result' using the "cramer" method
crosstab_result = pd.crosstab(contingency_table['visit_id_A'], contingency_table ['visit_id_B'])

association(crosstab_result, method="cramer")

After doing A/B Testing, we recognized that there is a statistically significant difference in completion rate between the two groups.

## first remarks

From basic analysis of group A vs B we can see that from those starting, only 62% reached step1 vs our new-feature's 68.7%. However, step2 is almost tied, which seem to point out towards an issue with step1 that needs improving. New features on step are worsening the score, 46% vs 48% on the old system, but the overall score (so those that complete the steps and confirm) we do see an improvement, showing that the new feature is indeed more efficient (45.6% vs 36.8% on the new feature). 

completion rate -> completion vs version? 
contingency_table = pd.crosstab(df['completion'],df['variation'])

Ho they are independant
H1 they are not independant

#### Ho -> % success PER age_group is equal to the old feature

## **KPI to evaluate**

- Define the KPIs you chose to evaluate the new design’s performance.
- Compare the KPIs for the Control Group vs. the Test Group.
- Present visual aids to support the KPI analysis.

Metrics:
Work objective is comparing 2 applications (one current, one new) and analyze it's perfomance (if it's better for the end user)

- Is the program more user-friendly? (KPIs - %completion rate, %errors)
- Is the program more efficient? (KPIs - Time Spent on Each Step, cost-efficient for implementation).

---

### 4. Analyzing A/B Test Results:

1. **Collect Data:** Track and gather data on how each group interacts with the content.
2. **Statistical Analysis:** Use statistical tests (e.g., t-test) to determine if the differences observed are statistically significant.
3. **Draw Conclusions:** If Version B significantly outperforms Version A, consider implementing the change. If not, revert to the original or consider new tests.

**Note**: in this week's project, you'll be analyzing A/B Test Results, since the design and the collection of data was already done.

---

Hypothesis Testing

As part of your analysis, you’ll conduct hypothesis testing to make data-driven conclusions about the effectiveness of the redesign. See the full details below:

- Completion Rate: Given the data and KPIs you have explored discussed, one interesting hypothesis to test is related to the completion rate between the Test and Control groups. Since the new design (Test group) had a higher completion rate compared to the old design (Control group), you are required to confirm if this difference is statistically significant.

Make sure to define the proper null and an alternative hypothesis to test it. Use the provided data to test these hypotheses, and determine if you can reject the null hypothesis in favor of the alternative. Make sure to consider the significance level, p-value, the statistical test prerequisites, and other relevant statistical measures in your analysis.

- Completion Rate with a Cost-Effectiveness Threshold: The introduction of a new UI design comes with its associated costs: design, development, testing, potential training for staff, and possible short-term disruptions or adjustments for users. To justify these costs, Vanguard has determined that any new design should lead to a minimum increase in the completion rate to be deemed cost-effective.

Threshold: Vanguard has set this minimum increase in completion rate at 5%. This is the rate at which the projected benefits, in terms of increased user engagement and potential revenue, are estimated to outweigh the costs of the new design.

You are required to carry out another analysis, ensuring that the observed increase in completion rate from the A/B test meets or exceeds this 5% threshold. If the new design doesn’t lead to at least this level of improvement, it may not be justifiable from a cost perspective, regardless of its statistical significance.

- Other Hypothesis Examples
You have been given the freedom to choose another hypothesis to test. Here are some examples:

You might want to test whether the average age of clients engaging with the new process is the same as those engaging with the old process
You might want to test if the average client tenure (how long they’ve been with Vanguard) of those engaging with the new process is the same as those engaging with the old process
You might want to test if there are gender differences that affect engaging with the new or old process
Make sure to define the proper null and alternative hypothesis to test it. You are required to choose one or come up with another of your own to test.

In [39]:
# demographic data from clients
demo_exp = pd.merge(left = final_demo3, right = final_exp, on="client_id")

#demo_exp["Variation"].unique() # array([nan, 'Test', 'Control'], dtype=object)

demo_test = demo_exp[demo_exp["Variation"]=="Test"]
demo_control = demo_exp[demo_exp["Variation"]=="Control"]

In [None]:
demo_test.head(5)

## test clients
demo_test_count = demo_test["client_id"].count()
demo_test_age = demo_test.groupby("age_group")["client_id"].count().reset_index()

sns.barplot(demo_test_age, x = "age_group", y="client_id", hue="age_group")
plt.title("Test clients by age_group")
plt.ylabel("Nr. of clients")
plt.show()

## control clients
demo_control_count = demo_control["client_id"].count()
demo_control_age = demo_control.groupby("age_group")["client_id"].count().reset_index()

sns.barplot(demo_control_age, x = "age_group", y="client_id", hue="age_group")
plt.title("Control clients by age_group")
plt.ylabel("Nr. of clients")
plt.show()

## counts clients
total_demo_count = demo_test_count + demo_control_count

print(f"Our total sample has {int(total_demo_count)} clients, with {int(demo_test_count)} test clients ({round(demo_test_count/total_demo_count*100,3)}%) and {int(demo_control_count)} control clients ({round(demo_control_count/total_demo_count*100,3)}%).")


In [None]:
demo_age = pd.merge(demo_test_age, demo_control_age, on= "age_group").rename(columns={"client_id_x":"test_group","client_id_y":"control_group"})

df_long = pd.melt(demo_age, id_vars=['age_group'], value_vars=['test_group', 'control_group'],
                  var_name='group', value_name='value')

## is age_group equally distributed between test and control groups?

sns.barplot(data=df_long, x='age_group', y='value', hue='group')
plt.legend(title="Data groups", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.ylabel("Nr. of Clients")
plt.xlabel("Age group of clients")


In [None]:
demo_age

# Calculating the Pearson correlation coefficient between 'test_group' and 'control_group' columns
p_correlation = demo_age['test_group'].corr(demo_age['control_group'])
p_correlation

# Calculating the Spearman rank corr between 'test_group' and 'control_group' columns
s_correlation = demo_age['test_group'].corr(demo_age['control_group'], method='spearman')
s_correlation

print(f"Pearson's correlation coef: {round(p_correlation,5)}, Spearman's rank: {round(s_correlation,5)}")

In [None]:
#Create a contingency table from merged dataframes
ct_table = demo_age[['test_group', 'control_group']]
print(ct_table)

#Chi-square test
chi2_ct, p_value_ct, _, _ = chi2_contingency(ct_table)
print(f"Chi-square statistic: {chi2_ct}")
print(f"P-value: {p_value_ct}")

sig=0.05

#H0: samples are independant regardless of age_Group.

if p_value_ct < sig:
    print("Reject the null hypothesis.")
else:
    print("Fail to reject the null hypothesis.")

In [None]:
data_cleaned = pd.concat([A_answer, B_answer])

demo_age[demo_age["age_group"]=="teenager"]

In [45]:
data_cleaned2 = data_cleaned.copy()

In [46]:
data_cleaned2["date_time"] = 0

data_cleaned2["date_time"] = data_cleaned2.apply(lambda row: row["start_date"] if pd.notna(row["start_date"]) else row["confirm_date"], axis=1)

data_cleaned2["completed"] = data_cleaned2.apply(lambda row: 1 if (
    pd.notna(row["start_date"]) & 
    pd.notna(row["confirm_date"])) else 0, axis=1)



In [None]:
print(f"A: {total_A} total / {completed_A} completed.")
print(f"B: {total_B} total / {completed_B} completed.")


In [None]:
A_clean = data_cleaned2[data_cleaned2["variation"]=="Test"]
B_clean = data_cleaned2[data_cleaned2["variation"]=="Control"]

A_clean_c = A_clean[A_clean["completed"] == 1]
B_clean_c = B_clean[B_clean["completed"] == 1]

A_clean['date_time'] = pd.to_datetime(A_clean['date_time'])
A_clean_c['date_time'] = pd.to_datetime(A_clean_c['date_time'])

B_clean['date_time'] = pd.to_datetime(B_clean['date_time'])
B_clean_c['date_time'] = pd.to_datetime(B_clean_c['date_time'])

weekly_activity = A_clean.set_index('date_time').resample('W').size().reset_index()
weekly_completion = A_clean_c.set_index('date_time').resample('W').size().reset_index()
weekly_activity = pd.merge(left = weekly_activity, right = weekly_completion, on="date_time").rename(columns={"0_x":"total","0_y":"completed"})

weekly_activity2 = B_clean.set_index('date_time').resample('W').size().reset_index()
weekly_completion2 = B_clean_c.set_index('date_time').resample('W').size().reset_index()
weekly_activity2 = pd.merge(left = weekly_activity2, right = weekly_completion2, on="date_time").rename(columns={"0_x":"total","0_y":"completed"})

plt.figure(figsize=(10, 6))
# Reshape the data into long format
df_long = weekly_activity.melt(id_vars="date_time", value_vars=['total', 'completed'], 
                  var_name='status', value_name='count')
weekly_activity = weekly_activity.set_index('date_time')
weekly_activity["complete_rate_%"] = weekly_activity.apply(lambda x: round((x["completed"] / x["total"]),3)*100 if x["total"] > 0 else 0, axis=1)

# Reshape the data into long format
df_long2 = weekly_activity2.melt(id_vars="date_time", value_vars=['total', 'completed'], 
                  var_name='status', value_name='count')
weekly_activity2 = weekly_activity2.set_index('date_time')
weekly_activity2["complete_rate_%"] = weekly_activity2.apply(lambda x: round((x["completed"] / x["total"]),3)*100 if x["total"] > 0 else 0, axis=1)


# Plot the A data using seaborn
plt.figure(figsize=(10, 6))
sns.barplot(y='date_time', x='count', hue='status', data=df_long)
plt.title('Test user Activity Per Week During the Experiment')
plt.xlabel('Week')
plt.ylabel('Number of Visits')
plt.xticks(rotation=0)
#plt.grid(True)
plt.tight_layout()
plt.show()

# Plot the B data using seaborn
plt.figure(figsize=(10, 6))
sns.barplot(y='date_time', x='count', hue='status', data=df_long2)
plt.title('Control User Activity Per Week During the Experiment')
plt.xlabel('Week')
plt.ylabel('Number of Visits')
plt.xticks(rotation=0)
#plt.grid(True)
plt.tight_layout()
plt.show()

In [49]:
week = weekly_activity.merge(weekly_activity2, on = "date_time")

In [None]:
## merge phase
demo_answer = answer_clean.merge(final_demo3, how = "left", on="client_id")

# cleans absent clients from demographics
demo_answer = demo_answer[~demo_answer["age_group"].isna()]

demo_answer

In [51]:
demo_answer.to_csv("sources/clean/demo_answer.csv")

In [None]:
dd = demo_answer.groupby("age_group")["client_id"].count().reset_index()

dd["order"] = dd["age_group"]

dd["order"] = dd["order"].replace({"teenager":0, "young_adult":1, "adult":2, "old_adult":3, "elder":4})
dd = dd.sort_values(by="order")
dd = dd.set_index("order")

sns.barplot(dd, x="age_group", y="client_id")

### **Statistics**

In [None]:
# Descriptive statistics for numerical columns in Test and Control groups
test_group = demo_answer[demo_answer['variation'] == 'Test']
control_group = demo_answer[demo_answer['variation'] == 'Control']

# Summary statistics for numerical columns
numerical_columns = ['age', 'tenure_year', 'number_accounts', 'bal', 'calls_6_months', 'logons_6_months']
test_summary_mean = test_group[numerical_columns].mean().round(3).reset_index()
test_summary_std = test_group[numerical_columns].std().round(3).reset_index()
test_summary_var = test_group[numerical_columns].var().round(3).reset_index()

control_summary_mean = control_group[numerical_columns].mean().round(3).reset_index()
control_summary_std = control_group[numerical_columns].std().round(3).reset_index()
control_summary_var = control_group[numerical_columns].var().round(3).reset_index()

mean_summary = pd.merge(test_summary_mean, control_summary_mean, on="index").rename(columns={"0_x":"test","0_y":"control","index":"mean"})
std_summary = pd.merge(test_summary_std, control_summary_std, on="index").rename(columns={"0_x":"test","0_y":"control","index":"std"})
var_summary = pd.merge(test_summary_var, control_summary_var, on="index").rename(columns={"0_x":"test","0_y":"control","index":"var"})


# Check the distribution of categorical features
categorical_columns = ['gender', 'age_group']
test_categorical = test_group[categorical_columns].value_counts(normalize=True).round(3).reset_index()
control_categorical = control_group[categorical_columns].value_counts(normalize=True).round(3).reset_index()

test_categorical["order"] = test_categorical["age_group"].map(f.order_group) + test_categorical["gender"].map(f.order_gender)
test_categorical = test_categorical.sort_values(by="order").set_index("order", drop=True)
control_categorical["order"] = control_categorical["age_group"].map(f.order_group) + control_categorical["gender"].map(f.order_gender)
control_categorical = control_categorical.sort_values(by="order").set_index("order", drop=True)

var_summary = pd.merge(test_categorical, control_categorical, on="order").rename(columns={"proportion_x":"test","proportion_y":"control", "gender_x":"gender", "age_group_x":"age_group"}).drop("gender_y", axis=1).drop("age_group_y", axis=1)

#display(mean_summary, std_summary, var_summary)
#display(test_categorical.reset_index(), control_categorical.reset_index())
var_summary

In [None]:
corr_grp = demo_answer[numerical_columns].corr()

plt.figure(figsize=(5, 5))
sns.heatmap(corr_grp, annot=True, cmap="coolwarm")
plt.title("Corr between our data")
plt.show()


In [None]:
display(demo_answer["step_reversions"].mean())

In [None]:
test_group = demo_answer[demo_answer['variation'] == 'Test']
control_group = demo_answer[demo_answer['variation'] == 'Control']

completed_test = test_group["completed"].sum()
total_test = test_group["completed"].count()
uncompleted_test = test_group["completed"].count() - completed_test

completed_control = control_group["completed"].sum()
total_control = control_group["completed"].count() 
uncompleted_control = control_group["completed"].count() - completed_control


print("Test: ", completed_test, uncompleted_test)
print("Control: ",  completed_control, uncompleted_control)

cont_table = np.array([[completed_test, uncompleted_test], [completed_control, uncompleted_control]])
print(cont_table)

# Step 3: Perform the chi-squared test
chi2, p_value, _, _ = chi2_contingency(cont_table)
# Step 4: Interpret the result
alpha = 0.05  # Significance level
print("")
print(f"Chi-squared statistic: {round(chi2,4)}")
print(f"P-value: {round(p_value,4)}")

if p_value < alpha:
    print("\nReject the null hypothesis. Difference is statistically significant.")
else:
    print("\nFail to reject the null hypothesis. We cannot reject the difference to be just statistical variability.")
     
print("")

success = [completed_test, completed_control]
tests = [total_test, total_control]

alpha = 0.05

stats, p_val = proportions_ztest(success, tests)
print(f"Z-test statistic: {round(stats,4)}")
print(f"P-value: {round(p_val,4)}")

if p_val < alpha:
    print(f"\nBecause p_value: {round(p_val,4)} < alpha: {alpha} we can reject the null and the difference in completion rates is statistically significant.")
else:
    print(f"\nBecause p_value: {round(p_val,4)} > alpha: {alpha}, we cannot reject the null, so we can't discard that the difference is not statistically significant.")


In [None]:
# “Is the Error rate of the Test Group lesser than the Control Group?“
demo_answer2 = demo_answer.copy()

demo_answer2["error2"] = demo_answer2.apply(lambda x : 1 if x["error"] > 0 else 0, axis=1)

test_group = demo_answer2[demo_answer2['variation'] == 'Test']
control_group = demo_answer2[demo_answer2['variation'] == 'Control']

completed_test = test_group["error2"].sum()
total_test = test_group["error2"].count()
uncompleted_test = test_group["error2"].count() - completed_test

completed_control = control_group["error2"].sum()
total_control = control_group["error2"].count() 
uncompleted_control = control_group["error2"].count() - completed_control

print("Test: ", completed_test, uncompleted_test)
print("Control: ",  completed_control, uncompleted_control)

cont_table = np.array([[completed_test, uncompleted_test], [completed_control, uncompleted_control]])
print(cont_table)

# Step 3: Perform the chi-squared test
chi2, p_value, _, _ = chi2_contingency(cont_table)
# Step 4: Interpret the result
alpha = 0.05  # Significance level
print("")
print(f"Chi-squared statistic: {round(chi2,4)}")
print(f"P-value: {round(p_value,4)}")

if p_value < alpha:
    print("\nReject the null hypothesis. Difference is statistically significant.")
else:
    print("\nFail to reject the null hypothesis. We cannot reject the difference to be just statistical variability.")
     
print("")

success = [completed_test, completed_control]
tests = [total_test, total_control]

# H0 : Test group's error rate is less than or equal to the Control group's error rate.
alpha = 0.05  # Significance level

stats, p_val = proportions_ztest(success, tests, alternative="smaller")

print(f"Z-test statistic: {round(stats, 4)}")
print(f"P-value: {round(p_val, 4)}")

if p_val < alpha:
    print(f"\nBecause p_value: {round(p_val, 4)} < alpha: {alpha}, we reject H0. The Test group has a significantly smaller error rate.")
else:
    print(f"\nBecause p_value: {round(p_val, 4)} > alpha: {alpha}, we cannot reject H0. We can't conclude the Test group has a smaller error rate.")

In [None]:
# “Test takes less time to complete than Control Group?“

# Filter out rows with null 'SC' (Completion times)
demo_answer2 = demo_answer.dropna(subset=['SC'])

# Split into Test and Control groups
test_group = demo_answer2[demo_answer2['variation'] == 'Test']['SC']
control_group = demo_answer2[demo_answer2['variation'] == 'Control']['SC']

# Perform the test
stat, p_value = st.kstest(test_group, 'norm', args=(test_group.mean(), test_group.std()))
print(f"Kolmogorov-Smirnov test statistic: {stat}")
print(f"P-value: {p_value}")

if p_value > 0.05:
    print("The data is normally distributed (fail to reject H0).")
else:
    print("The data is not normally distributed (reject H0).")

# (H0): The Test group takes either the same or longer time to complete than the Control group.

# Perform a one-tailed Mann-Whitney U test (testing if Test group takes less time)
u_stat, p_value = st.mannwhitneyu(test_group, control_group, alternative='less')

print(f"T-test statistic: {u_stat}")
print(f"P-value: {p_value}")

if p_value > 0.05:
    print("Cant reject HO: The Test group takes either the same or longer time to complete than the Control group.")
else:
    print("Reject H0: The Test group takes less time to complete than the Control group.")