# Statical test

## Chi-squared test

### Purpose: To find is there any significant association/relationship between 2 qualitative variables

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

In [3]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'tips',
 'titanic']

In [4]:
tips_data = sns.load_dataset('tips')
tips_data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## 1. perform initial anlysis

In [5]:
sex_smoker_data = pd.crosstab(index= tips_data.sex, columns= tips_data.smoker)# margins=True)
sex_smoker_data

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [6]:
stats.chi2_contingency(sex_smoker_data)

(0.008763290531773594,
 0.925417020494423,
 1,
 array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

In [7]:
chi_squared_tscore,pval,df,exp_table = stats.chi2_contingency(sex_smoker_data)

In [8]:
chi_square_tscore,pval,df,exp_table = stats.chi2_contingency(sex_smoker_data)
print('Chi-square score is {}.\np value is {}.\ndf is {}.\nExpected table is \n{}'.format(chi_square_tscore,pval,df,exp_table))


Chi-square score is 0.008763290531773594.
p value is 0.925417020494423.
df is 1.
Expected table is 
[[59.84016393 97.15983607]
 [33.15983607 53.84016393]]


In [9]:
if pval<0.05:
    print("We reject the null hypothesis saying that there is a significant relationship between smoker and sex.")
else:
    print("We do not reject the null hypothesis saying that there is no significant relationship between smoker and sex.")

We do not reject the null hypothesis saying that there is no significant relationship between smoker and sex.


## 2. T - test
* 1 sample t test
* 2 sample t test
* Paired test
* 3 sample test - ANOVA

### 1 Sample t test

In [10]:
ages = [21,23,26,24,25,28,29,27,22,32,37,31,35,36,37,34,22,21,25,23,24,26,35,39,38]
len(ages)

25

In [11]:
round(np.mean(ages)) # Average age of the entire population     ## np is numpy

29

In [12]:
random_selection = np.random.choice(ages,size=5)
random_selection

array([34, 22, 23, 29, 26])

In [13]:
from scipy.stats import ttest_1samp # To perform one sample t test
## ttest_1samp  = Calculate the T-test for the mean of ONE group of scores.
# This is a two-sided test for the null hypothesis that the expected value (mean) of a sample of
# independent observations a is equal to the given population mean, popmean.

In [14]:
t_value, pval = ttest_1samp(random_selection,28)
print(t_value,'\n',pval)

-0.5511782546095501 
 0.6108504002950289


In [15]:
if pval<0.05:
    print("We reject the null hypothesis saying that there is a significant difference between average age of pop and samp")
else:
    print("We do not reject the null hypothesis saying that there is no significant difference between average age of pop and sample")

We do not reject the null hypothesis saying that there is no significant difference between average age of pop and sample


#### 2 Sample t test

In [16]:
from scipy.stats import ttest_ind  
# ## Calculate the T-test for the means of two independent samples of scores.
# ## This is a two-sided test for the null hypothesis that 2 independent samples
#     have identical average (expected) values.

In [17]:
from scipy.stats import ttest_ind
#HO: no significant change in Weight of student from classroom_1 and classroom_2
#Ha: there is significant change in Weight of student from classroom_1 and classroom_2

classroom_1=[45,55,56,48,59,50,49,56,48,58]
classroom_2=[65,62,61,67,55,59,80,78,69,66]

t_value,p_val = ttest_ind(classroom_1,classroom_2)
print(t_value,'\n',p_val)

if pval<0.05:
    print("no difference in weight of classroom_1 and classroom_2 hence do not reject null hypothesis")
else:
    print("there is difference in weight of classroom_1 and classroom_2 hence rejecting null hypothesis")


-4.6876269213586275 
 0.0001833553106654498
there is difference in weight of classroom_1 and classroom_2 hence rejecting null hypothesis


#### Paired t test
To check before and after data  of sample paired test is used

In [20]:
from scipy.stats import ttest_rel

In [22]:

weight_preworkout = [89,85,35,95,79,82,99,88,101,72,84,86,92]
weight_postworkout = [55,62,65,56,42,46,68,52,70,60,55,51,69]
statistics,pval = ttest_rel(weight_preworkout,weight_postworkout)
print("Statistics is {} \n pvalue is {}".format(statistics,pval))
if pval < 0.05:
    print("We reject null hypothesis saying that there is a significant relationship between average weight before and after ")
else:
    print("We do not reject null hypothesis saying that there is no significant relationship between average weight before and after")

# INFERENCE: There is chnage in weight of people 
#NH: No change
#AH: There is change in weight

Statistics is 5.07655590683381 
 pvalue is 0.0002722070875643695
We reject null hypothesis saying that there is a significant relationship between average weight before and after 


## ANOVA - more than 2 sample t test

In [23]:
from scipy.stats import f_oneway

In [25]:
#H0
#Ha

#Data Collection
rainfall_2018=[32,41,20,21,22,27,26,37,40,41,42,40]
rainfall_2019=[40,30,20,22,32,37,24,29,30,32,41,41]
rainfall_2020=[30,20,45,44,47,21,20,19,20,21,42,42]
statistic,pvalue=f_oneway(rainfall_2018,rainfall_2019,rainfall_2020)
print('statistics is {}\n pvalue is {}'.format(statistic,pvalue))
if pvalue<0.05:
    print('Tamilnadu recieves different amouth of rainfall') #Ha
else:
    print('Tamilnadu recieves uniform amount of rainfall') #Ho
    
# 12 month rainfall data is collected for 3 years
# Here null hypothesis is accepted 
# inference tamilnadu recieves unifrom amount of rainfall


statistics is 0.07590657652120467
 pvalue is 0.9270641538661927
Tamilnadu recieves uniform amount of rainfall


In [26]:
school_a=[100,98,95,90,89]
school_b=[99,98,96,35,75]
school_c=[70,55,98,92,89]
f_stat,pval=f_oneway(school_a,school_b,school_c)
print("The f-stat is {} and pval is {}".format(f_stat,pval))

if pval<0.05:
    print("We reject the null hypothesis saying there is a significant difference in scores b/w the Students from School A,B and C")
else:
    print("We do not reject the null hypothesis saying that there is no significant difference in scores b/w the Students from School A,B and C")


The f-stat is 0.8624460167233299 and pval is 0.4467184149873274
We do not reject the null hypothesis saying that there is no significant difference in scores b/w the Students from School A,B and C


In [27]:
hun_mt=[50,40,60,50]
two_hun_mt=[60,50,60,55]
marathon=[40,50,60,35]
statistic,pvalue=f_oneway(hun_mt,two_hun_mt,marathon)
print("Statistics {}\n Pvalue {}".format(statistic,pvalue))
if (pvalue<0.05):
    print("Runners have significance of weight ")
else:
    print("Runners don't have significance of weight ")


Statistics 1.4411764705882355
 Pvalue 0.2864377648612401
Runners don't have significance of weight 


### Proportional test in z test

In [12]:
from statsmodels.stats.proportion import proportions_ztest
# can we assume anything from our sample
significance = 0.05
# our sample - 82% are good
sample_success = 410
sample_size = 500
# our Ho is  80%
null_hypothesis = 0.80
# check our sample against Ho for Ha > Ho
# for Ha < Ho use alternative='smaller'
# for Ha != Ho use alternative='two-sided'
stat, p_value = proportions_ztest(count=sample_success, nobs=sample_size, value=null_hypothesis, alternative='larger')
# report
print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))
if p_value > significance:
   print ("Fail to reject the null hypothesis - we have nothing else to say")
else:
   print ("Reject the null hypothesis - suggest the alternative hypothesis is true")


z_stat: 1.164, p_value: 0.122
Fail to reject the null hypothesis - we have nothing else to say
