# A/B Testing

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu,chi2_contingency

### Reading data 

In [2]:
###read data.csv
df = pd.read_csv('data.csv')
df

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
0,708746,17/08/2017,17/08/2017,916,103916,30-34,M,15,17,17,7350.000000,1,1.43,2.0,1.0
1,708749,17/08/2017,17/08/2017,916,103917,30-34,M,16,19,21,17861.000000,2,1.82,2.0,0.0
2,708771,17/08/2017,17/08/2017,916,103920,30-34,M,20,25,22,693.000000,0,0.00,1.0,0.0
3,708815,30/08/2017,30/08/2017,916,103928,30-34,M,28,32,32,4259.000000,1,1.25,1.0,0.0
4,708818,17/08/2017,17/08/2017,916,103928,30-34,M,28,33,32,4133.000000,1,1.29,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,1314410,19/08/2017,19/08/2017,45-49,F,109,111,114,1129773,252,358.189997,13,2.00,,
1139,1314411,19/08/2017,19/08/2017,45-49,F,110,111,116,637549,120,173.880003,3,0.00,,
1140,1314412,19/08/2017,19/08/2017,45-49,F,111,113,117,151531,28,40.289999,2,0.00,,
1141,1314414,17/08/2017,17/08/2017,45-49,F,113,114,117,790253,135,198.710001,8,2.00,,


In [3]:
df.columns

Index(['ad_id', 'reporting_start', 'reporting_end', 'campaign_id',
       'fb_campaign_id', 'age', 'gender', 'interest1', 'interest2',
       'interest3', 'impressions', 'clicks', 'spent', 'total_conversion',
       'approved_conversion'],
      dtype='object')

In [4]:
import numpy as np

In [5]:
import scipy.stats as stats

In [6]:
df["campaign_id"].unique()

array(['916', '936', '1178', '45-49', '30-34', '35-39', '40-44'],
      dtype=object)

In [7]:
###count the number of unique campaign_id and count of top 3 campaigns
ncampaigns = df["campaign_id"].nunique()

df["campaign_id"].value_counts()

campaign_id
936      464
1178     243
45-49    144
30-34     99
40-44     71
35-39     68
916       54
Name: count, dtype: int64

### Populating the campaign data target class

In [8]:
df["campaign_id"] = df["campaign_id"].replace("45-49","916")
df["campaign_id"] = df["campaign_id"].replace("40-44","916")
df["campaign_id"] = df["campaign_id"].replace("30-34","1178")
df["campaign_id"] = df["campaign_id"].replace("35-39","916")

In [9]:
df["campaign_id"].value_counts()

campaign_id
936     464
1178    342
916     337
Name: count, dtype: int64

In [10]:
df["campaign_id"].value_counts().to_list()

[464, 342, 337]

In [11]:
### show approved conversions of campaign 916
df[df["campaign_id"] == "916"]["approved_conversion"].value_counts()

approved_conversion
0.0    30
1.0    24
Name: count, dtype: int64

In [12]:
###count nan values in approved_conversion
df["approved_conversion"].isna().sum()

382

In [13]:
df.isna().sum()

ad_id                    0
reporting_start          0
reporting_end            0
campaign_id              0
fb_campaign_id           0
age                      0
gender                   0
interest1                0
interest2                0
interest3                0
impressions              0
clicks                   0
spent                    0
total_conversion       382
approved_conversion    382
dtype: int64

### fill up nan values in approved_conversion with median of 10 values before and after


In [14]:
def fill_na_with_median(series, window_size=10):
    non_nan_values = series.dropna()
    if len(non_nan_values) >= window_size:
        median_value = non_nan_values[-window_size:].median()
        return int(median_value)
    else:
        return np.nan

### Apply the function to fill NaN values in the DataFrame
df['approved_conversion'] = df['approved_conversion'].fillna(df['approved_conversion'].apply(lambda x: fill_na_with_median(df['approved_conversion'])))
df['approved_conversion'].isna().sum() ###Checking if there are any NaN values left

0

### approved conversions for each campaign

In [15]:
observed_counts = df.groupby("campaign_id")["approved_conversion"].sum().astype(int)
observed_counts

campaign_id
1178    477
916     307
936     183
Name: approved_conversion, dtype: int32

### Expected counts under the assumption of no difference (uniform distribution)

In [16]:
expected_counts = np.full_like(observed_counts, np.mean(observed_counts))
expected_counts = pd.Series(expected_counts, index=observed_counts.index)
expected_counts

campaign_id
1178    322
916     322
936     322
dtype: int32

### Perform Statistical tests to determine if there's a significant difference in the distribution

### Chi-squared test

In [17]:
chi2_stat, p_value, dof, expected = chi2_contingency([observed_counts])
alpha = 0.05
if p_value <= alpha:
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")

Fail to reject the null hypothesis


### Mann-Whitney U test

In [18]:
group_a = df[df['campaign_id'] == "916"]['approved_conversion']
group_b = df[df['campaign_id'] == "936"]['approved_conversion']

In [19]:
statistic, p_value = mannwhitneyu(group_a, group_b)
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference between the two groups.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference between the two groups.")

Reject the null hypothesis: There is a significant difference between the two groups.
