In [44]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import math
import seaborn as sns

**One Sample T Test**  

Example 1

In [8]:
ages = [10,20,30,40,23,43,32,12,23,43,25,34,31,26,27,21,24,27,23,26,28,32,12,23,43,25,34,31,26]

In [9]:
len(ages)

29

In [10]:
ages_mean = np.mean(ages)

# Population mean
ages_mean

27.379310344827587

Null Hypothesis  
H0 : There is no difference

Alternative Hypothesis  
H1 : There is some difference

In [12]:
## Taking random sample

sample_size = 10
age_sample = np.random.choice(ages, sample_size)

In [13]:
age_sample

array([12, 26, 40, 26, 43, 31, 26, 23, 26, 40])

In [15]:
# For a 1 sample t test problem 

from scipy.stats import ttest_1samp

In [17]:
ttest, p_value = ttest_1samp(age_sample, 27)

# considering a random value close to the population mean

In [18]:
print(p_value)

0.4607877215860512


If the p value is less 0.05 , then we reject Null Hypothesis

In [19]:
if p_value < 0.05:
    print("We reject Null Hypothesis")
else:
    print("We accept Null Hypothesis")

We accept Null Hypothesis


Example 2

In [23]:
np.random.seed(6)

# Population
school_ages = stats.poisson.rvs(loc=18,mu=35,size=1500)

# Sample
class_ages = stats.poisson.rvs(loc=18,mu=30,size=60)

In [26]:
# Population Mean
school_ages.mean()

53.303333333333335

In [24]:
# Sample Mean
class_ages.mean()

46.9

In [27]:
_, p_value = stats.ttest_1samp(a=class_ages, popmean=school_ages.mean())

In [28]:
p_value

1.139027071016194e-13

In [29]:
if p_value < 0.05:
    print("We reject Null Hypothesis")
else:
    print("We accept Null Hypothesis")

We reject Null Hypothesis


**Two Sample T Test**

In [31]:
np.random.seed(6)

# Population
school_ages = stats.poisson.rvs(loc=18,mu=35,size=1500)

# Sample 1
classA_ages = stats.poisson.rvs(loc=18,mu=30,size=60)
# Sample 2
classB_ages = stats.poisson.rvs(loc=18,mu=33,size=60)

In [32]:
# 2 sample t test we use ttest_ind

_, p_value = stats.ttest_ind(a=classA_ages, b=classB_ages, equal_var=False)

In [33]:
p_value

0.00030935839774215933

In [34]:
if p_value < 0.05:
    print("We reject Null Hypothesis")
else:
    print("We accept Null Hypothesis")

We reject Null Hypothesis


**Paired T-test**  
To check how differenent sample from the same group are

In [36]:
weight1 = [25,30,12,23,43,25]

# same group of people but in the future
weight2 = weight1 + stats.norm.rvs(scale=5, loc=-1.25, size=6)

In [37]:
weight2

array([27.09095011, 19.68648006,  9.52783857, 18.04252112, 39.94225147,
       27.94963554])

Null Hypothesis  
H0 : There is no statistical difference between weights

Alternative Hypothesis  
H1 : There is statistical difference between weights

In [39]:
weight_df = pd.DataFrame({'weight_10':np.array(weight1),
                         'weight_20':np.array(weight2),
                         'weight_change':np.array(weight2)-np.array(weight1)})

In [40]:
weight_df

Unnamed: 0,weight_10,weight_20,weight_change
0,25,27.09095,2.09095
1,30,19.68648,-10.31352
2,12,9.527839,-2.472161
3,23,18.042521,-4.957479
4,43,39.942251,-3.057749
5,25,27.949636,2.949636


In [41]:
_, p_value = stats.ttest_rel(a=weight1, b=weight2)

In [42]:
p_value

0.24280306777041608

In [43]:
if p_value < 0.05:
    print("We reject Null Hypothesis")
else:
    print("We accept Null Hypothesis")

We accept Null Hypothesis


**Chi Square Test**  
This test is applied when we have 2 categorical variable from a single population. It is used to determine whether there is a significant association between 2 variable  

In [45]:
dataset = sns.load_dataset('tips')

In [47]:
dataset.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [50]:
# to check if 2 categorical feature has any association
# crosstab forms a matrix
dataset_table = pd.crosstab(dataset['sex'], dataset['smoker'])
dataset_table

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


Null Hypothesis  
H0 : There is no relationship between 2 categorical variables

Alternative Hypothesis  
H1 : There is relationship between 2 categorical variables

In [54]:
# Observed Values

observed_values = dataset_table.values
print("Observed Values\n",observed_values)

Observed Values
 [[60 97]
 [33 54]]


In [57]:
val = stats.chi2_contingency(dataset_table)
val

(0.0,
 1.0,
 1,
 array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

In [59]:
expected_value = val[3]
expected_value

array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]])

In [61]:
# Since its a 2X2 matrics
no_of_rows = len(dataset_table.iloc[0:2,0])
no_of_cols = len(dataset_table.iloc[0,0:2])

# degrees of freedom
ddof = (no_of_rows - 1) * (no_of_cols - 1)

print("Degrees of Freedom:", ddof)

alpha = 0.05

Degrees of Freedom 1


In [64]:
from scipy.stats import chi2

chi_square = sum([(o-e)**2./e for o,e in zip(observed_values, expected_value)])

chi_square_statistic = chi_square[0] + chi_square[1]

In [66]:
print('chi_square_statistic: ',chi_square_statistic)

chi_square_statistic:  0.001934818536627623


In [67]:
critical_value = chi2.ppf(q=1-alpha, df=ddof)
print('critical_value',critical_value)

critical_value 3.841458820694124


In [72]:
if chi_square_statistic >= critical_value:
    print("We reject Null Hypothesis")
else:
    print("We accept Null Hypothesis")

We accept Null Hypothesis


In [69]:
# p-value

p_value = 1-chi2.cdf(x=chi_square_statistic, df=ddof)

print('p-value', p_value)
print('Significane Level',alpha)
print("Degrees of Freedom:", ddof)

p-value 0.964915107315732
Significane Level 0.05
Degrees of Freedom: 1


In [71]:
if p_value <= alpha:
    print("We reject Null Hypothesis")
else:
    print("We accept Null Hypothesis")

We accept Null Hypothesis
