<a href="https://colab.research.google.com/github/SaloniRepo/New99/blob/master/A_B_testing_on_digital_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from itertools import combinations
import random


In [None]:
import scipy.stats as stats
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.proportion import confint_proportions_2indep
from statsmodels.stats.power import NormalIndPower
from scipy.stats import chi2_contingency
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multitest import multipletests
from statsmodels.formula.api import logit
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from matplotlib.patches import Patch


In [None]:
# Read the marketing A/B export, using the CSV's first column as the row index.
DATA_PATH = '/content/marketing_AB.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
df.head()

Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
0,1069124,ad,False,130,Monday,20
1,1119715,ad,False,93,Tuesday,22
2,1144181,ad,False,21,Tuesday,18
3,1435133,ad,False,355,Tuesday,10
4,1015700,ad,False,276,Friday,14


In [None]:
# Distinct values of the experiment-arm column.
pd.unique(df['test group'])

array(['ad', 'psa'], dtype=object)

In [None]:
# Distinct values of the weekday column.
pd.unique(df['most ads day'])

array(['Monday', 'Tuesday', 'Friday', 'Saturday', 'Wednesday', 'Sunday',
       'Thursday'], dtype=object)

In [None]:
# Distinct hour values in ascending order (np.unique returns sorted unique values).
np.unique(df['most ads hour'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
user id,0
test group,0
converted,0
total ads,0
most ads day,0
most ads hour,0


In [None]:
# Outlier fences for 'total ads' using the 1.5 * IQR rule.
ads_seen = df['total ads']
Q1, Q3 = ads_seen.quantile(0.25), ads_seen.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print('Upper bound:', upper_bound)
print('Lower bound:', lower_bound)


Upper bound: 61.5
Lower bound: -30.5


In [None]:
# Rows whose 'total ads' falls strictly outside the IQR fences.
above_fence = df['total ads'] > upper_bound
below_fence = df['total ads'] < lower_bound
outlier_df = df[above_fence | below_fence]
print(outlier_df)

        user id test group  converted  total ads most ads day  most ads hour
0       1069124         ad      False        130       Monday             20
1       1119715         ad      False         93      Tuesday             22
3       1435133         ad      False        355      Tuesday             10
4       1015700         ad      False        276       Friday             14
5       1137664         ad      False        734     Saturday             10
...         ...        ...        ...        ...          ...            ...
507287   907513        psa      False         70     Saturday             13
507413   906267        psa      False         65      Tuesday             19
508366   911712        psa       True         70     Thursday             14
508648  1551249         ad      False         64      Tuesday             11
510703   902420        psa      False         69    Wednesday             18

[52057 rows x 6 columns]


In [None]:
# How many rows were flagged as outliers, in absolute terms and as a share of all rows.
total_records = df.shape[0]
n_outliers = outlier_df.shape[0]
outlier_percentage = (n_outliers / total_records) * 100

for summary_line in (
    f"Total Number of records: {total_records}",
    f"Number of outliers: {n_outliers}",
    f"Percentage of outliers: {outlier_percentage:.2f}%",
):
    print(summary_line)

Total Number of records: 588101
Number of outliers: 52057
Percentage of outliers: 8.85%


In [None]:
# Keep only the rows whose index label is not among the flagged outliers.
is_outlier_row = df.index.isin(outlier_df.index)
no_outliers_df = df.loc[~is_outlier_row]

# Mean number of ads seen, with and without the outlier rows.
mean_ads_overall = df['total ads'].mean()
mean_ads_no_outliers = no_outliers_df['total ads'].mean()
print(f'Avg. ads seen overall: {mean_ads_overall:.0f} ads')
print(f'Avg. ads seen without outliers: {mean_ads_no_outliers:.0f} ads')

Avg. ads seen overall: 25 ads
Avg. ads seen without outliers: 15 ads


In [None]:
# Display the original frame; `df` itself has not been filtered, so outlier rows are still present.
df

Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
0,1069124,ad,False,130,Monday,20
1,1119715,ad,False,93,Tuesday,22
2,1144181,ad,False,21,Tuesday,18
3,1435133,ad,False,355,Tuesday,10
4,1015700,ad,False,276,Friday,14
...,...,...,...,...,...,...
588096,1278437,ad,False,1,Tuesday,23
588097,1327975,ad,False,1,Tuesday,23
588098,1038442,ad,False,3,Tuesday,23
588099,1496395,ad,False,1,Tuesday,23


In [None]:
# Share of converted users (in %), on all rows and on the outlier-free rows.
conversion_rate = df['converted'].eq(True).mean() * 100
conversion_rate_without_outliers = no_outliers_df['converted'].eq(True).mean() * 100

# Relative change (in %) of the conversion rate once outliers are excluded.
rel_diff_conversions = (conversion_rate - conversion_rate_without_outliers) / conversion_rate * 100

print(f'Overall conversion rate is: {conversion_rate:.2f}%')
print(f'Conversion rate without outlier is: {conversion_rate_without_outliers:.2f}%')
print(f'Relative difference in conversion rate: {rel_diff_conversions:.2f}%')


Overall conversion rate is: 2.52%
Conversion rate without outlier is: 1.33%
Relative difference in conversion rate: 47.41%


In [None]:
# Conversion split among the outlier rows; normalize=True gives fractions, scaled here to percent.
outlier_check = outlier_df['converted'].value_counts(normalize=True).mul(100)
print(outlier_check)

converted
False    85.154734
True     14.845266
Name: proportion, dtype: float64


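Only about 15% of the outlier rows converted, so we remove the outliers. (Note: this is still far above the 2.52% overall conversion rate, which is why removing them lowers the measured conversion rate to 1.33%.)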

In [None]:

# Remove outliers: keep only the rows where 'total ads' lies between
# lower_bound and upper_bound (inclusive).
# The kept rows get their own name instead of being assigned to `outlier_df`:
# `outlier_df` holds the *outlier* rows, and rebinding it to the opposite set
# would make the earlier outlier-count cells give wrong results if re-run.
within_bounds_df = df[(df['total ads'] <= upper_bound) & (df['total ads'] >= lower_bound)]

# Renumber rows 0..n-1 so the cleaned frame has a contiguous index.
removed_outlier_df = within_bounds_df.reset_index(drop=True)
removed_outlier_df




Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
0,1144181,ad,False,21,Tuesday,18
1,1496843,ad,False,17,Sunday,18
2,1448851,ad,False,21,Tuesday,19
3,1637531,ad,False,47,Wednesday,13
4,1081965,ad,False,61,Tuesday,20
...,...,...,...,...,...,...
536039,1278437,ad,False,1,Tuesday,23
536040,1327975,ad,False,1,Tuesday,23
536041,1038442,ad,False,3,Tuesday,23
536042,1496395,ad,False,1,Tuesday,23


In [None]:
# Split the outlier-free data into the two experiment arms.
arm_label = removed_outlier_df['test group']
ad_grp = removed_outlier_df.loc[arm_label == 'ad']
psa_grp = removed_outlier_df.loc[arm_label == 'psa']
print(ad_grp.shape, psa_grp.shape, sep='\n')

(514716, 6)
(21328, 6)


In [None]:
# Conversion rate per arm: fraction of rows with converted == True.
conversion_ad = (ad_grp['converted'] == True).mean()
print(f'conversion_ad Conversion rate: {conversion_ad:.2%}')

conversion_psa = (psa_grp['converted'] == True).mean()
print(f'conversion_psa Conversion rate: {conversion_psa:.2%}')

# Absolute lift: ad rate minus PSA rate. Despite its name, `rel_diff_grp` holds
# an absolute difference (the name is kept because the confidence-interval cell
# reads it), so it is labelled as such here.
rel_diff_grp = conversion_ad - conversion_psa
print(f'Absolute difference in conversion rate: {rel_diff_grp:.2%}')

# Relative lift: absolute difference expressed as a percentage of the PSA rate.
rel_diff_percentag = (conversion_ad - conversion_psa) / conversion_psa * 100
print(f'Relative difference in conversion rate: {rel_diff_percentag:.2f}%')


conversion_ad Conversion rate: 1.34%
conversion_psa Conversion rate: 1.06%
Relative difference in conversion rate: 0.28%
Relative difference in conversion rate: 26.31%


On average, the ad group converts 26.3% better than the PSA group in relative terms (this should be considered alongside the low base conversion rate).


Null Hypothesis (H₀): The conversion rate in the ad group is equal to the conversion rate in the PSA group (p_ad - p_psa = 0).


Alternative Hypothesis (H₁): The conversion rate in the ad group is higher than in the PSA group (p_ad - p_psa > 0).


In [None]:
# One-sided two-proportion z-test of the ad arm against the PSA arm.

# Conversions per arm (ad first, PSA second). Series.sum() counts the True
# values in a vectorised way instead of iterating row by row with built-in sum().
successes = [
    (ad_grp['converted'] == True).sum(),    # number of conversions in ad group
    (psa_grp['converted'] == True).sum(),   # number of conversions in psa group
]

# Sample sizes, in the same order as `successes`.
nobs = [
    len(ad_grp),    # total number in ad group
    len(psa_grp),   # total number in psa group
]

# alternative='larger' tests whether the first proportion (ad) exceeds the second (PSA).
z_stat, p_value = proportions_ztest(successes, nobs=nobs, alternative='larger')

print(z_stat, p_value)

3.4859086419622503 0.000245234238022522


A positive z-statistic means the observed conversion rate in the ad group is higher than in the PSA group.

In [None]:
# Decide at the 0.05 significance threshold and report the outcome.
is_significant = p_value < 0.05
decision_message = (
    'Reject null hypothesis, The conversion rate in the ad group is higher than in the PSA group, The ad campaign shows statistically significant improvement'
    if is_significant
    else 'Fail to reject null hypothesis'
)
print(decision_message)


Reject null hypothesis, The conversion rate in the ad group is higher than in the PSA group, The ad campaign shows statistically significant improvement


At the 5% significance level (p ≈ 0.0002), the 'ad' group has a significantly higher conversion rate than the 'psa' group.



In [None]:
# Confidence interval for the difference in conversion proportions (ad minus PSA).
# Argument order: count1, nobs1 (ad / treatment), count2, nobs2 (PSA / control);
# alpha=0.05 requests a 95% interval.
ad_conversions, psa_conversions = successes[0], successes[1]
ad_total, psa_total = nobs[0], nobs[1]
ci = confint_proportions_2indep(ad_conversions, ad_total, psa_conversions, psa_total, alpha=0.05)
ci_low, ci_high = ci[0], ci[1]

print(f'95% CI for difference in proportions: ({ci_low:.2%} - {ci_high:.2%})')
print(f'\nCurrent Conversion Rate difference: {rel_diff_grp:.2%}')

95% CI for difference in proportions: (0.13% - 0.41%)

Current Conversion Rate difference: 0.28%


The observed difference of 0.28 percentage points lies inside the `95%` confidence interval (0.13% - 0.41%). Because the whole interval is above zero, it indicates a consistent positive effect of the ad campaign.

Though we observed a 26.3% relative improvement and can be 95% confident the true absolute effect lies between 0.13 and 0.41 percentage points, the small absolute difference suggests limited practical impact despite statistical reliability.

In [None]:


# Preview the first five rows of the original (unfiltered) frame.
df.head(n=5)

Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
0,1069124,ad,False,130,Monday,20
1,1119715,ad,False,93,Tuesday,22
2,1144181,ad,False,21,Tuesday,18
3,1435133,ad,False,355,Tuesday,10
4,1015700,ad,False,276,Friday,14


To estimate revenue impact, we'll calculate the additional revenue gained per 1,000 users after running an ad campaign. Assume the avg. revenue per conversion is $55.50.

In [None]:
# Estimate revenue per 1,000 users for each arm and the uplift from running ads.
# Assumed average revenue per conversion: $55.50, matching the assumption stated
# in the notebook text.
avg_revenue_per_conversion = 55.50

# Baseline: revenue per 1,000 users at the PSA (control) conversion rate.
base_revenue = conversion_psa * avg_revenue_per_conversion * 1000
print(f'Base revenue: ${base_revenue:.2f}')

# Expected: revenue per 1,000 users at the ad (treatment) conversion rate.
expected_revenue = conversion_ad * avg_revenue_per_conversion * 1000
print(f'Expected revenue: ${expected_revenue:.2f}')

# Uplift attributable to showing ads instead of PSAs.
additional_revenue = expected_revenue - base_revenue
additional_reveue = additional_revenue  # misspelled legacy name kept for any cell that still reads it
print(f'Additional revenue: ${additional_revenue:.2f}')

Base revenue: $582.80
Expected revenue: $736.12
Additional revenue: $153.32


Based on the A/B testing analysis, implementing the ad campaign generates additional revenue of around $155 per 1,000 users compared to showing PSAs.