In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import math
from statsmodels.stats.proportion import proportions_ztest


In [2]:

dataset = pd.read_csv("/content/Social_Network_Ads.csv")
print(dataset)

      User ID  Gender  Age  EstimatedSalary  Purchased
0    15624510    Male   19            19000          0
1    15810944    Male   35            20000          0
2    15668575  Female   26            43000          0
3    15603246  Female   27            57000          0
4    15804002    Male   19            76000          0
..        ...     ...  ...              ...        ...
395  15691863  Female   46            41000          1
396  15706071    Male   51            23000          1
397  15654296  Female   50            20000          1
398  15755018    Male   36            33000          0
399  15594041  Female   49            36000          1

[400 rows x 5 columns]


In [3]:
x1 = dataset.iloc[:,1]
x2 = dataset.iloc[:,2]
x3 = dataset.iloc[:,3]
x4 = dataset.iloc[:,4]


# **Population Mean of Male and Female**

In [4]:
PopulationMean = dataset.groupby('Gender', as_index=False).Age.mean()
print(PopulationMean)

   Gender        Age
0  Female  38.411765
1    Male  36.867347


# **Population Variance of Male and Female**

In [5]:
PopulationVariance = dataset.groupby('Gender')
for gender, value in PopulationVariance['Age']:
    print((gender, value.var(ddof=0)))

('Female', 117.34025374855825)
('Male', 100.35995418575592)


# **Ads Purchased of Male and Female**

In [6]:
Ads = dataset.groupby(["Gender", "Purchased"]).size()
print(Ads)

Gender  Purchased
Female  0            127
        1             77
Male    0            130
        1             66
dtype: int64


# **Male Proportion (Q)**

In [7]:
pm = (66/400)
print(pm)
qm = (1-pm)
print(qm)


0.165
0.835


# **Female Proportion (Q)**


In [8]:
pf = (77/400)
print(pf)
qp = (1-pf)
print(qp)

0.1925
0.8075


# **Random Sample of 30**

In [9]:
randomsample = dataset.groupby('Gender', as_index=False).sample(30)
print(randomsample)

      User ID  Gender  Age  EstimatedSalary  Purchased
191  15662067  Female   19            26000          0
102  15584545  Female   32            86000          0
297  15705298  Female   43           112000          1
10   15570769  Female   26            80000          0
224  15575002  Female   35            60000          0
324  15575247  Female   48           131000          1
218  15666675  Female   46            96000          0
153  15619087  Female   36            50000          0
150  15679651  Female   26            15000          0
255  15750056  Female   52            90000          1
52   15744919  Female   29            83000          0
181  15774727  Female   31            71000          0
162  15599533  Female   37            33000          0
258  15569641  Female   58            95000          1
21   15736760  Female   47            49000          1
56   15775562  Female   23            48000          0
182  15694288  Female   32           117000          1
242  15780

In [10]:
pSample = randomsample.groupby(["Gender", "Purchased"]).size()
print(pSample)

Gender  Purchased
Female  0            19
        1            11
Male    0            23
        1             7
dtype: int64


# **Proportion of Males and females whos ads were Purchased**

In [12]:
significance = 0.05

sample_success_a, sample_size_a = (11, 30)
sample_success_b, sample_size_b = (7, 30)

In [13]:
successes = np.array([sample_success_a, sample_success_b])
samples = np.array([sample_size_a, sample_size_b])

In [14]:
stat, p_value = proportions_ztest(count=successes, nobs=samples,  alternative='two-sided')

In [15]:
print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))

z_stat: 1.127, p_value: 0.260


In [16]:
if p_value > significance:
   print ("Fail to reject the null hypothesis")
else:
   print ("Reject the null hypothesis - suggest the alternative hypothesis is true")

Fail to reject the null hypothesis


# **Proportion of Males and females whos ads were Not Purchased**

In [17]:
significance = 0.05

sample_success_a, sample_size_a = (19, 30)
sample_success_b, sample_size_b = (23, 30)

In [18]:
successes = np.array([sample_success_a, sample_success_b])
samples = np.array([sample_size_a, sample_size_b])

In [19]:
stat, p_value = proportions_ztest(count=successes, nobs=samples,  alternative='two-sided')

In [20]:
print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))

z_stat: -1.127, p_value: 0.260


In [21]:
if p_value > significance:
   print ("Fail to reject the null hypothesis")
else:
   print ("Reject the null hypothesis - suggest the alternative hypothesis is true")

Fail to reject the null hypothesis


# **Hypotheses Testing**

HO: Variance of male and female are equal σ1=σ2

HA: Variance of male and female are not equal σ1!=σ2

In [24]:
Male_sample = randomsample[randomsample['Gender']=='Male']
Female_Sample = randomsample[randomsample['Gender']=='Female']
y1=Male_sample['Age']
y2=Female_Sample['Age']



In [26]:
def f_test(x, y):
    X1=np.array(x1)
    X2=np.array(x2)
    f = np.var(y1, ddof=1)/np.var(y2, ddof=1)
    dfn = y1.size-1 
    dfd = y2.size-1
    p = 1-stats.f.cdf(f, dfn, dfd)
    if p>0.05:
        print(f'The value of p is {p}, We will accept Null Hypothesis')
    else:
        print(f'The value of p is {p}, We will reject Null Hypothesis')




f_test(y1,y2)

The value of p is 0.44751229434969975, We will accept Null Hypothesis


# **The Equality of Ages of Males and Females**

In [27]:
femaleAge = [19, 32, 43, 26, 35, 48, 46, 36, 26, 52, 29, 31, 37, 58, 47, 23, 32, 50, 36, 41, 33, 42, 49, 20, 25, 26, 35, 35, 56, 31]
maleAge = [23, 27, 23, 25, 22, 35, 35, 27, 18, 25, 35, 48, 30, 46, 26, 38, 59, 42, 26, 41, 47, 36, 29, 42, 37, 38, 42, 39, 45, 25]

stats.levene(femaleAge, maleAge, center='median')

LeveneResult(statistic=0.09387283236994226, pvalue=0.7604073870912019)

# **The equality of means of ages of Males and Females**

In [28]:
femaleAge = [19, 32, 43, 26, 35, 48, 46, 36, 26, 52, 29, 31, 37, 58, 47, 23, 32, 50, 36, 41, 33, 42, 49, 20, 25, 26, 35, 35, 56, 31]
maleAge = [23, 27, 23, 25, 22, 35, 35, 27, 18, 25, 35, 48, 30, 46, 26, 38, 59, 42, 26, 41, 47, 36, 29, 42, 37, 38, 42, 39, 45, 25]

stats.levene(femaleAge, maleAge, center='mean')

LeveneResult(statistic=0.16129923998986076, pvalue=0.6894391892770487)

# CONCLUSION

a: There are 95% surety that both male and female equally purchasing ads

b: There are 95% surety that proportion of male and female are equal in terms of notpurchasing ads 

c: We are 95% sure that variances male and female ages are equal

d: We are 95% sure that means male and female are equal




# SUMMARY

After Analyzing the campaign we identify that both men and women have same buying behavior

# TITLE

STATISTICAL MODELING