In [1]:
import numpy as np, pandas as pd
import seaborn as sns, matplotlib.pyplot as plt
import scipy.stats as ss

In [2]:
df = pd.read_csv("marketing_AB.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
0,0,1069124,ad,False,130,Monday,20
1,1,1119715,ad,False,93,Tuesday,22
2,2,1144181,ad,False,21,Tuesday,18
3,3,1435133,ad,False,355,Tuesday,10
4,4,1015700,ad,False,276,Friday,14


In [4]:
df['test group'].unique()

array(['ad', 'psa'], dtype=object)

### There are 2 groups:<br>AD: People sees direct advertisement<br>PSA: People sees only through Public Service Announcement

In [5]:
#Checking for null values
df.isnull().sum()

Unnamed: 0       0
user id          0
test group       0
converted        0
total ads        0
most ads day     0
most ads hour    0
dtype: int64

In [6]:
# count the numbers of different groups of the categorical column
df['test group'].value_counts()

ad     564577
psa     23524
Name: test group, dtype: int64

In [7]:
# count the True or False of buying products by grouping test_group.
df.groupby('test group')['converted'].value_counts()

test group  converted
ad          False        550154
            True          14423
psa         False         23104
            True            420
Name: converted, dtype: int64

# H0: There is no significant difference among conversion rates
###### ( ad group has no higher conversion rate than PSA group)

# H1: There is significant difference among conversion rates
###### ( ad group has higher conversion rate than PSA group)

In [8]:
# Creating subset using main dataframe
ad=df[df['test group']=='ad']
psa=df[df['test group']=='psa']

In [9]:
#Converted declares whether the person buys the product or not
ad[ad['converted']==True].head()

Unnamed: 0.1,Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
15,15,1461774,ad,True,9,Wednesday,18
44,44,1355531,ad,True,265,Tuesday,12
107,107,1389878,ad,True,1328,Saturday,19
121,121,1475989,ad,True,323,Saturday,20
135,135,1241733,ad,True,246,Friday,20


In [10]:
psa[psa['converted']==True].head()

Unnamed: 0.1,Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
957,957,915026,psa,True,88,Sunday,19
2531,2531,904434,psa,True,7,Tuesday,20
2744,2744,920737,psa,True,77,Monday,13
2751,2751,904131,psa,True,111,Monday,12
2979,2979,901749,psa,True,364,Friday,21


In [11]:
#find the number of ads that are converted
ad_converted=ad['total ads']
psa_converted=psa['total ads']


In [12]:
#Since indexes are in random, reset it to concat easily
ad_converted.reset_index(drop=True, inplace=True)

In [13]:
psa_converted.reset_index(drop=True, inplace=True)

In [14]:
ad_converted

0         130
1          93
2          21
3         355
4         276
         ... 
564572      1
564573      1
564574      3
564575      1
564576      1
Name: total ads, Length: 564577, dtype: int64

In [15]:
#concat the groups into new dataframe
data = pd.DataFrame({'ad_converted': ad_converted, 'psa_converted': psa_converted})

In [16]:
data_new = data[:1000]

In [17]:
data_new.head()

Unnamed: 0,ad_converted,psa_converted
0,130,248.0
1,93,27.0
2,21,13.0
3,355,32.0
4,276,105.0


## T-test

In [18]:
t_stat, p_val= ss.ttest_ind(data_new.ad_converted,data_new.psa_converted)
t_stat , p_val

(8.074816570957674, 1.1557133614433257e-15)

Since, **P-Value** is almost equals to 0, we have strong evidence of a **significant difference in the conversion rates**, supporting the conclusion that the **ad group has a higher conversion rate than the psa group.**