In [507]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

from matplotlib import gridspec
%matplotlib inline


from math import factorial

from scipy import stats
from scipy.stats import randint                 # randint is used to generate random variables following a uniform distribution.
from scipy.stats import skewnorm                # variables having a skewed normal distribution.

import random
import statistics

# Large Sample Test.

- If the sample size is sufficiently large (usually, n > 30) then we use the Z-test.

### 1. One Sample Z-test.

In [199]:
# read the students performance data 
df = pd.read_csv('StudentsPerformance.csv')

# display the first two observations
df.head(2)

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score,total score,training institute
0,female,group B,standard,none,89,55,56,200,Nature Learning
1,female,group C,standard,completed,55,63,72,190,Nature Learning


In [200]:
# Find the number of female students in the dataframe.
df['gender'].value_counts()

female    517
male      483
Name: gender, dtype: int64

##### There are '517 female students' in the dataset. Consider the reading score of these students.

In [201]:
# Considering only Female students 'reading score' from the dataset.

scores = df[df['gender']=='female']['reading score']
scores.head()

0    55
1    63
2    71
5    85
6    51
Name: reading score, dtype: int64

In [202]:
# step1: let us check the normality of the data.
from scipy.stats import shapiro

stat, p_value = shapiro(scores)

print('Test statistic:',stat)
print('P-value:',p_value)

Test statistic: 0.9949197173118591
P-value: 0.08649472147226334


#####  From the above result, we can see that the p-value is greater than 0.05, thus we can say that the data is normally distributed.

In [203]:
# step2 : let us calculate the Z-value , Z-score and P-value.

z_value = stats.norm.isf(q=0.05)
print('The critical value for one-tailed z-test is:',round(z_value,2))


# Here we can calculate the z-score and p-value together using 'stests'.
from statsmodels.stats import weightstats as stests

z_score, pval = stests.ztest(x1=scores, value=65, alternative='larger')

print("Z-score: ", z_score)
print("p-value: ", pval)

The critical value for one-tailed z-test is: 1.64
Z-score:  2.529410071375873
p-value:  0.005712722457410142


In [204]:
# step3: let us calculate the confidence interval.

interval = stats.norm.interval(0.95, loc=np.mean(scores), scale=statistics.stdev(scores)/np.sqrt(len(scores)))
print('Confidence Interval:',np.round(interval,2))

Confidence Interval: [65.33 67.61]


##### Conclusion:

- Here the z score is greater than 1.64. 
- the p-value is less than 0.05. 
- also the confidence interval does not contain the value in the null hypothesis (i.e. 65).
- thus we reject the null hypothesis and thus, we have enough evidence to conclude that on average girls' score higher marks than 65.

### 2. Two Sample Z-Test.

In [205]:
# null and alternative hypothesis:

# Ho : mu1 - mu2 <= 0
# H1 : mu1 - mu2 >  0

# Right tailed test.

In [206]:
# read the students performance data 
df_student = pd.read_csv('StudentsPerformance.csv')

# display the first two observations
df_student.head(2)

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score,total score,training institute
0,female,group B,standard,none,89,55,56,200,Nature Learning
1,female,group C,standard,completed,55,63,72,190,Nature Learning


In [207]:
# Ajenda:
# Nature learning claims that the students trained in their institue have 
# better overall performance than students in the Speak Global learning.

# Seperate the total scores of students based on training institute.

n1_scores = df[df['training institute']=='Nature Learning']['total score']
n2_scores = df[df['training institute']=='Speak Global Learning']['total score']

# number of samples of two different population.
n1 = len(n1_scores)
n2 = len(n2_scores)

# average of the samples 
n1_mean = np.mean(n1_scores)
n2_mean = np.mean(n2_scores)

# Standard deviation of the total score.
n1_std = statistics.stdev(n1_scores)
n2_std = statistics.stdev(n2_scores)

In [208]:
# Let us ****check the normality*** of total score for the students trained from both the institutes.

# pass the total score students to perform the test
stat, pval = shapiro(df['total score'])

print('Test statistic:', stat)
print('P-Value:', p_value)

Test statistic: 0.998780369758606
P-Value: 0.08649472147226334


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the total scores of the students trained from both the institutes are normally distributed.

In [209]:
# Let us check the ***equality of variances.***

# perfrom levene's test.
## levene() returns a tuple having the values of test statistics and the corresponding p-value
# pass the total scores of students trained from 'Nature Learning' and 'Speak Global Learning' institutes to perform the test
stat, pval = stats.levene(n1_scores, n2_scores)

print('Test statistic:', stat)
print('P-Value:', p_value)

Test statistic: 0.6422721347822817
P-Value: 0.08649472147226334


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the population variances are equal.

In [210]:
# step1: z-test, p-value.

z_value = stats.norm.isf(q=0.01)
print('The critical value for one-tailed z-test:',round(z_value,2))


# to calculate the z-score and p-value for the Two sample test:
# pass the value in null hypothesis to the parameter, 'value'
# pass the one-tailed condition to the parameter, 'alternative'

z_score, pval = stests.ztest(x1=n1_scores, x2=n2_scores, value=0, alternative='larger' )
print("Z-score: ", z_score)
print("p-value: ", pval)



The critical value for one-tailed z-test: 2.33
Z-score:  0.15125511253949914
p-value:  0.43988723840898025


In [211]:
# step2: Confidence interval:

interval = stats.norm.interval(0.99, loc=n1_mean-n2_mean, 
                               scale=np.sqrt(((n1_std**2)/n1) + (n1_std**2)/n2))
print('The 99% confidence interval is:',np.round(interval,2))

The 99% confidence interval is: [-3.55  3.99]


In [212]:
# Given data:

# number of samples:
n1 = 160
n2 = 180

# average of the sample:
n1_mean = 13
n2_mean = 15

# standard deviation of the sample:
n1_std = 4.1
n2_std = 3.5

In [213]:
# Null and Alternative Hypothesis:

# Ho : mu1 - mu2  = 0
# H1 : mu1 - mu2  not=0

# Two tailed test.

In [214]:
# step1: Z-test

z_value = stats.norm.isf(q=0.01/2)
print('Z-value:',round(z_value,2))

z_score = ((n1_mean-n2_mean) - (0)) / np.sqrt((n1_std**2/n1) + (n2_std**2/n2))
print('Z-score:',round(z_score,2))

Z-value: 2.58
Z-score: -4.81


In [215]:
# step2: P-value test:

# Here cdf() because its left tailed if right tailed then sf().
p_value = stats.norm.cdf(z_score)

# Here multiply p-value by 2 beacuse its Two tailed test
p_value = p_value*2
print('P-value:',round(p_value,4))

P-value: 0.0


In [89]:
# another method to solve Z-score and P-value

#*********** this method cannot be used when dataset is not given

from statsmodels.stats import weightstats as stests

z_score, pval = stests.ztest(x1=n1, x2=n2, value=0, alternative='larger')

In [216]:
# Step3 : Confidence Interval:

interval = stats.norm.interval(0.99, loc=n1_mean-n2_mean,
                              scale=np.sqrt(((n1_std**2/n1) + (n2_std**2/n2))))
print('The 99% Confidence interval is:',np.round(interval,2))

The 99% Confidence interval is: [-3.07 -0.93]


#####  Conclusion:

- Here the z-score is less than -2.58.
- the p-value is less than 0.01.
- also the confidence interval does not contain the value in the null hypothesis (i.e. 0).
- Thus, we reject the null hypothesis and conclude that both males and females have different hemoglobin averages.

# Small Sample Test:

- If the sample size is small (usually, n < 30) then we use the t-test. These tests are also known as exact tests.

### 1. One Sample Test.

In [248]:
# read the students performance data 
df = pd.read_csv('mathscore_1ttest.csv')

# display the first two observations
df.head(2)

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score,total score,training institute
0,female,group C,standard,none,60,72,74,206,Nature Learning
1,female,group C,standard,none,59,72,68,199,Nature Learning


In [249]:
# we need the math scores of the female students only.
df['gender'].unique()

# Here the data consists only female students no worry about filtering gender.

# Now only pull out the math scores.
math_score = df['math score']

# let us calculate the samp_mean, samp_std
samp_mean = np.mean(math_score)
samp_std = statistics.stdev(math_score)
n = len(math_score)
dof = n-1

In [250]:
# Let us check the normality of the data:

from scipy.stats import shapiro


stat, pval = stats.shapiro(math_score)
print('stat:',stat)
print('pval:',pval)

stat: 0.9368310570716858
pval: 0.13859796524047852


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the data is normally distributed.

In [251]:
# The null and alternative hypothesis:

# Ho : mu >= 75
# H1 : mu <  75,   consider this first


# left tailed test.

In [252]:
# Step1: t-test

t_val = round(stats.t.isf(q = 0.1, df = 23), 2)

print('Critical value for one-tailed t-test:', t_val)

Critical value for one-tailed t-test: 1.32


In [253]:
# This is the calculation of the t_score and pval
# with the inbuilt function::**********(inbuilt function can be used when theres dataset is given)

# **** the inbuilt function it gives the p_value for two tailed so, **(divide it by 2)** for one tailed 

t_stat,pval = stats.ttest_1samp(a=math_score, popmean=75)


In [254]:
# dividing the p_val by 2 beacuse its left tailed test.

req_pval = pval/2

print('t-stat:',t_stat)
print('pval:',req_pval)

t-stat: -3.6067380757023204
pval: 0.0007426613957678669


In [255]:
# step2 : calculate 90% confidence interval

interval = stats.norm.interval(0.90, loc=samp_mean, scale=samp_std/np.sqrt(n))
print('The 90% confidence interval is:',np.round(interval,2))

The 90% confidence interval is: [62.56 70.35]


##### Conclusion:

- We can see that the test statistic value is less than -1.32.
- the p-value is less than 0.1.
- also the confidence interval does not contain the value in the null hypothesis (i.e. 75).
- Thus, we reject the null hypothesis and conclude that the proposed claim, i.e. in math test female students tend to get more marks than the average marks of 75 out of 100, is correct.

In [256]:
# step1:  Null and alternative hypothesis:

# Ho : mu >= 100
# H1 : mu < 100

# Left tailed test.

In [257]:
# Given data:

pop_mean = 100
n = 10
samp_mean = 94.8
samp_variance = 72.66
samp_std = np.sqrt(samp_variance)
alpha = 0.05
df = n-1

In [258]:
# Step1: t-test

t_value = stats.t.isf(q=0.05,df=n-1)
print('The critical value for one-tailed t-test is:',round(t_value,2))

t_score = (samp_mean - pop_mean) / (samp_std/np.sqrt(n))
print('t-score:',round(t_score,2))

The critical value for one-tailed t-test is: 1.83
t-score: -1.93


In [259]:
# We can also call a simple function for getting t-score:

def ttest(pop_mean,samp_mean,samp_std,n):
    t_score = (samp_mean-pop_mean)/ (samp_std/np.sqrt(n))
    return t_score
pop_mean = 100
n = 10
samp_mean = 94.8
samp_variance = 72.66
samp_std = np.sqrt(samp_variance)

t_score = ttest(pop_mean,samp_mean,samp_std,n)
print('t-score:',round(t_score,2))

t-score: -1.93


In [260]:
# step2: P-value test:

p_value = stats.t.cdf(t_score,df=n-1)    # using cdf() because its left tailed test
print('The P-value:',round(p_value,2))

The P-value: 0.04


In [261]:
# step3: Confidence Interval for 95%

interval = stats.t.interval(0.95, df=n-1, loc=samp_mean, scale=samp_std/np.sqrt(n))
print('The 95% confidence Interval is:',np.round(interval,2))

The 95% confidence Interval is: [ 88.7 100.9]


##### Conclusion.

- We can see that the test statistic value is less than -1.83.
- the p-value is less than 0.05. 
- Thus, we reject the null hypothesis and can conclude that the average bacteria per unit volume (true mean) is within the safety levels.

# Two Sample t Test (Unpaired)

- INDEPENDENT SAMPLES

In [262]:
# read the students performance data 
df = pd.read_csv('totalmarks_2ttest.csv')

# display the first two observations
df.head(2)

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score,total score,training institute
0,male,group E,standard,completed,84,83,78,245,Speak Global Learning
1,male,group C,free/reduced,completed,79,77,75,231,Speak Global Learning


In [263]:
# we need to filter out the total score of the students,
# who are completed test preparation course and not completed course.

In [264]:
# check the unique entrys of test preparation course.
df['test preparation course'].unique()

# lets separate the students total score based on test prepartaion course.

# these are the students total score who completed the test preparation course.
n1_tscore = df[df['test preparation course']=='completed']['total score']

# these are the students total score who are not completed the test preparation course.
n2_tscore = df[df['test preparation course']=='none']['total score']


In [265]:
# Calculate the required using the samples dataset:

# number of the samples:
n1 = len(n1_tscore)
n2 = len(n2_tscore)

# average of the samples:
n1_mean = np.mean(n1_tscore)
n2_mean = np.mean(n2_tscore)

# standard deviation of the samples:
n1_std = statistics.stdev(n1_tscore)
n2_std = statistics.stdev(n2_tscore)

# degree of freedom
dof = n1+n2-2

In [268]:
# let us check the normality of the total score of the students:

stat, pval = shapiro(df['total score']) 
print('Test statistic:', stat)
print('P-Value:', pval)

Test statistic: 0.9845393300056458
P-Value: 0.9080941677093506


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the total marks of students who have/ have not completed the test preparation course are normally distributed.

In [281]:
# let us check the equality of the variances:
# (this is used only for the two sample tests)

stat, pval = stats.levene(n1_tscore,n2_tscore)
print('Test statistic:', stat)
print('P-Value:', pval)

Test statistic: 0.045113770764648356
P-Value: 0.8331854285659768


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the population variances are equal.

In [282]:
# step1: Null and alternative hypothesis:

# Ho : mu1 - mu2 = 0
# H1 : mu1 - mu2 not=0.

# Two tailed test:

In [283]:
# step2: t-test:

# here pass the alpha/2 for q because its two tailed test.
t_value = stats.t.isf(q=0.05/2,df=31)
print('The critical value for two-tailed t-test is:',round(t_value,2))

The critical value for two-tailed t-test is: 2.04


In [287]:
# step3: confidence interval:

# first find S :
s = np.sqrt((((n1-1)*n1_std**2) + ((n2-1)*n2_std**2)) / (n1+n2-2))

interval = stats.t.interval(0.95,df=n1+n2-2 ,loc=n1_mean-n2_mean, 
                           scale=s*np.sqrt(1/n1 +1/n2))
print('The 95% confidence interval is:',np.round(interval,4))

The 95% confidence interval is: [-7.2553 41.9887]


##### Conclusion.

- We can see that the test statistic value is less than 2.04 and greater than -2.04.
- the p-value is greater than 0.05.
- also the confidence interval contains the value in the null hypothesis (i.e. 0).
- Thus, we fail to reject (i.e. accept) the null hypothesis and conclude that there is no difference in the total marks of the students who have completed the preparation course and who have not completed the preparation course.

In [338]:
# Given data:

# number of samples of the data:
n1 = 14
n2 = 15

# average of the sample
n1_mean = 6.4
n2_mean = 7.3

# standard deviation of the sample 
n1_std = 1.4
n2_std = 1.5

# degree of freedom
dof = n1+n2-2
dof

27

In [307]:
# step1: Null and Alternative hypothesis:

# Ho : mu1 - mu2 = 0
# H1 : mu1 - mu2 not=0.

# Two tailed test:

In [312]:
# step2: t-test:

# pass alpha/2 for q because its two tailed test.
t_value = stats.t.isf(q=0.01/2,df=dof)
print('The critical value for two-tailed t-test is:',round(t_value,2))


# first find s 
s = np.sqrt( ( ((n1-1)*n1_std**2) +  ((n2-1)*n2_std**2)  ) / (n1+n2-2)) 

t_score = ((n1_mean-n2_mean) - (0)) / (s*np.sqrt(1/n1 + 1/n2))
print('t-score:',round(t_score,2))

The critical value for two-tailed t-test is: 2.77
t-score: -1.67


In [313]:
# step3: P-value test:

p_value = stats.t.cdf(t_stat, df = 27)
# for a two-tailed test multiply the p-value by 2
req_p = p_value*2
print('p-value:', req_p)

p-value: 0.0012401306424251629


In [317]:
# step4: Confidence Interval:

interval = stats.t.interval(0.99,df=dof, loc=n1_mean-n2_mean,
                           scale=s*np.sqrt(1/n1+1/n2))
print('95% confidence interval for population mean is', interval)

95% confidence interval for population mean is (-2.395737993074164, 0.5957379930741649)


##### Conclusion.

- We can see that the test statistic value is greater than -2.77. 
- the p-value is greater than 0.01.  also the confidence interval contains the value in the null hypothesis (i.e. 0).
- Thus, we fail to reject (i.e. accept) the null hypothesis and conclude that the two medicines have the same hours of relief.

# Paired t Test.

- DEPENDENT SAMPLES

In [318]:
# read the file containing writing scores  
df_score = pd.read_csv('WritingScores.csv')

# display the first two observations
df_score.head(2)

Unnamed: 0,score_before,score_after
0,59,50
1,62,67


In [322]:
# Calculate the difference of the marks.
diff_marks = df_score['score_after'] - df_score['score_before']

# calculate the mean difference
mean_diff = np.mean(diff_marks)

## calculate standard deviation of difference
std_diff = statistics.stdev(diff_marks)

# sample size
n = len(df_score)

print('Degrees of freedom:', n-1)


Degrees of freedom: 16


In [323]:
# let us check the normality of the scores before the tranining
stat, p_value = shapiro(df_score['score_before'])
print('Test statistic:', stat)
print('P-Value:', p_value)

Test statistic: 0.9473825097084045
P-Value: 0.416460782289505


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the scores before training are normally distributed.

In [324]:
# Let us check the normality of the score after the training.
stat, p_value = shapiro(df_score['score_after'])
print('Test statistic:', stat)
print('P-Value:', p_value)

Test statistic: 0.9686523675918579
P-Value: 0.7944130897521973


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the scores after training are normally distributed.

In [622]:
# step1: Null and alternative hypothesis:

# Ho : mu1 - mu2 = 0
# H1 : mu1 - mu2 not= 0

# two tailed test.

In [327]:
# step2 : t-test

# here divide alpha/2 because its two tailed test
t_value = stats.t.isf(q=0.05/2,df=16)
print('The critical value for two-tailed t-test is:',round(t_value,2))

# Here the dataset is given hence use inbuilt function.
# Here this is depedent so pass rel(relative)
stat, p_val = stats.ttest_rel(df_score['score_after'], df_score['score_before'])
print("Test Statistic:", t_stat)
print("p-value:", p_val)

The critical value for two-tailed t-test is: 2.12
Test Statistic: -3.6067380757023204
p-value: 0.16929012896279846


In [328]:
# step3: Confidence IntervaL:

# In here for scale no need of entering two standard deviations just entered std_diff like one sample test.
interval = stats.t.interval(0.95, df = n-1, loc = mean_diff, scale = std_diff/np.sqrt(n))
print('90% confidence interval for population mean is', interval)

90% confidence interval for population mean is (-3.0029069531297283, 15.708789306070905)


##### Conclusion.

- We can see that the test statistic value is less than 2.12 and greater than -2.12.
- the p-value is greater than 0.05.
- also the confidence interval contains the value in the null hypothesis (i.e. 0). Thus, we fail to reject (i.e. accept) the null hypothesis and conclude that the training was not effective.

In [347]:
# Given data:
sales_before = [33, 32, 38, 45, 37, 47, 48, 41, 45]
sales_after = [42, 35, 31, 41, 37, 36, 49, 49, 48]

# number of samples:
n = len(sales_before)

# difference of the sales before and after:
diff_sales = np.array(sales_before)-np.array(sales_after)

# average of the difference sales:
mean_diff = np.mean(mean_diff)

# standard deviation of the different sales:
std_diff = statistics.stdev(diff_sales)

# Degree of freedom
dof = n-1
dof

8

## The Null and Alternative Hypothesis:

- Ho : mu1 - mu2 <= 0
- H1 : mu1 - mu2 > 0

- Right tailed test.

In [349]:
# Step1: t-test

t_value = stats.t.isf(q=0.05,df=8)
print('The critical value of the right tailed t-test is:',round(t_value,2))


The critical value of the right tailed t-test is: 1.86


In [350]:
# here the dataset is given we can use the builtint function to calculate t-score and p-value

t_score, pval = stats.ttest_rel(sales_after,sales_before)
print('t-score:',t_score)
print('P-value:',pval)

t-score: 0.10085458113185983
P-value: 0.9221477146925299


In [354]:
# step2: Confidence Interval estimation:

interval = stats.t.interval(0.95,df=8,loc=mean_diff , scale=std_diff/np.sqrt(n))
print('The confidence interval is:',np.round(interval,2))

The confidence interval is: [ 1.31 11.39]


##### Conclusion:

- We can see that the test statistic value is less than 1.86.
- The confidence interval doesnot contain the population mean. 
- Thus, we fail to reject (i.e. accept) the null hypothesis and conclude that there is no effect of advertisement.


# Z Propotion test.

### 1. One sample test.

-The null and alternative hypothesis is given as:

<p style='text-indent:25em'> <strong> $H_{0}: P = P_{0}$ or $P \geq P_{0}$ or $P \leq P_{0}$</strong></p>
<p style='text-indent:25em'> <strong> $H_{1}: P \neq P_{0}$ or $P < P_{0}$ or $P > P_{0}$</strong></p>

In [356]:
df= pd.read_csv('StudentsPerformance.csv')
df.head(2)

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score,total score,training institute
0,female,group B,standard,none,89,55,56,200,Nature Learning
1,female,group C,standard,completed,55,63,72,190,Nature Learning


In [398]:
# filter out only the male students and their math score.
df_male = df[df['gender']=='male']['math score']

# total number of the male students:
total_male = len(df_male)

# And also we need the male students math score > 50
df_male_math = df_male[df_male>50]

# total number of male mathscore greater than 50
total_male_50 = len(df_male_math)

# population propotion:
pop_pro = 0.8

# Calculation of the sample propotion:
samp_pro = total_male_50/total_male

In [399]:
# The Null and alternative Hypothesis:

# Ho : P <= 0.8
# H1 : P > 0.8

# Right tailed test.

In [411]:
# step1: Z-test:

z_value = stats.norm.isf(q=0.05)
print('The critical value for one tailed z-test is:',round(z_value,2))

pop_pro = 0.8
z_score = (samp_pro- pop_pro) / np.sqrt( (pop_pro* (1-pop_pro))/ total_male)
print('z-score:',z_score)

The critical value for one tailed z-test is: 1.64
z-score: 4.163394160018601


In [412]:
# step2 : p-value:

# here its right tailed use sf()
p_value = stats.norm.sf(z_score)
print('p-value:', p_value)

p-value: 1.5677570141208797e-05


In [410]:
# Step3: Confidence interval estimation:

interval = stats.norm.interval(0.95, loc = samp_pro, scale = np.sqrt(pop_pro * (1 - pop_pro))/(total_male))
print('The confidence interval is :',interval)

The confidence interval is : (0.8741532389361987, 0.8773995560948572)


# Conclusion:

- Here the z-score is greater than 1.64.
- the p-value is less than 0.05.
- also the confidence interval does not contain the value in the null hypothesis (i.e. 0.8).
- Thus, we reject the null hypothesis and we have enough evidence to conclude that the percentage of male students scoring more than 50 marks in Mathematics is more than 80%.



In [None]:
# The null and Alternative hypothesis:

# Ho : P <= 0.25
# H1 : P > 0.25            # if theres atmost take as greater than.

# Right tailed test

In [413]:
# Given data:

# Total number of owners had gone into bankruptcy.
n = 361

# Number of owners not consulted 
x = 105

# sample propotion
samp_pro = x/n

# population propotion
pop_pro = 0.25

In [415]:
# Test The claim with p-value techinque:

z_score = (samp_pro-pop_pro) / np.sqrt( (pop_pro*(1-pop_pro)) / n )
print('z-score:',round(z_score,2))

# its right tailed use sf()
p_value = stats.norm.sf(z_score)
print('P-value:',round(p_value,2))

z-score: 1.79
P-value: 0.04


# Conclusion:

- Here the p-value is less than 0.05. 
- Thus, we reject the null hypothesis and conclude that at least 25% of all businesses had not consulted before starting the business.


### 2.Two Sample Test

In [486]:
df_student = pd.read_csv('StudentsPerformance.csv')
df_student.head(2)

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score,total score,training institute
0,female,group B,standard,none,89,55,56,200,Nature Learning
1,female,group C,standard,completed,55,63,72,190,Nature Learning


In [491]:
# get the training institutes in the dataframe
df_student['training institute'].unique()

array(['Nature Learning', 'Speak Global Learning'], dtype=object)

In [500]:
# now we need to filter of the 2 seperate institute with the standard lunch

# Total sample of NL AND SGL
df_nl = df_student[df_student['training institute'] == 'Nature Learning']
df_sg = df_student[df_student['training institute'] == 'Speak Global Learning']

# Number of total NL and SGL
n1 = len(df_nl) 
n2 =len(df_sg)

# number of the students with standard lunch
n1_std= len(df_nl[df_nl['lunch'] == 'standard'])
n2_std= len(df_sg[df_sg['lunch'] == 'standard'])

In [494]:
# The Null and Alternative Hypothesis:

# Ho : P1 - P2 = 0
# H1 : P1 - P2 not= 0

# Two tailed test

In [495]:
# step1 : z-test

# its two tailed divide the alpha by 2.
z_val = np.abs(round(stats.norm.isf(q = 0.1/2), 2))
print('Critical value for two-tailed Z-test:', z_val)

Critical value for two-tailed Z-test: 1.64


In [496]:
# step2: z_score and p-value with inbuilt function:

import statsmodels.api as sm

z_prop, p_val = sm.stats.proportions_ztest(count = np.array([n1_std,n2_std]), 
                                             nobs = np.array([n1,n2]))

# print the value of test statstic and the corresponding p-value
print('Test statistic:', z_prop)
print('p-value:', p_val)

Test statistic: 0.7935300106078008
p-value: 0.4274690915859791


In [505]:
# Step1 : Null and alternative Hypothesis:

# Ho : P1 - P2 >=0
# H1 : P1 - P2 < 0

# left tailed test.

In [506]:
# copies of magazine A and B
num_A = 100
num_B = 70

# number of copies sold for magazine A  and B
x_A = 78
x_B = 65

# using the inbuilt function: ( use smaller its left tailed)
z_prop, p_val = sm.stats.proportions_ztest(count = np.array([x_A, x_B]), 
                                             nobs = np.array([num_A, num_B]),  
                                             alternative = 'smaller')
print('Test statistic:', z_prop)
print('p-value:', p_val)


Test statistic: -2.60830803458311
p-value: 0.004549551600547303


##### Conclusion:

- Here the p-value is less than 0.05. 
- Thus, we reject the null hypothesis and conclude that there is enough evidence to say that magazine B is more popular.

# Day 04

# Chi - Square Test:
- **Non parametric test** . Non-parametric tests do not require any assumptions on the parameter of the population from which the sample is taken. 
- The chi-square test statistic follows a Chi-square ($\chi^{2}$) distribution under the null hypothesis. **It can be used to check the relationship between the categorical variables.** 

# 1. Chi-Square Test for Goodness of Fit:
- This test is used to **compare the distribution of the categorical data** with the expected distribution.
- <p style='text-indent:6em'> <strong> $H_{0}$: There is no significant difference between the observed and expected frequencies from the expected distribution</strong></p>
- <p style='text-indent:6em'> <strong> $H_{1}$: There is a significant difference between the observed and expected frequencies from the expected distribution</strong></p>

In [514]:
# read the dataset for observed value.

df_s = pd.read_csv('students_data.csv')
df_s.head(2)

Unnamed: 0,gender,ethnicity,education,lunch,test_prep_course,math_score,reading_score,writing_score,total_score,training_institute
0,female,group B,bachelor's degree,standard,none,89,55,56,200,Nature Learning
1,female,group C,college,standard,completed,55,63,72,190,Nature Learning


In [517]:
# read the datset for the expected value.

df_d = pd.read_csv('demographic_data.csv')
df_d.head(2)

Unnamed: 0,education
0,bachelor's degree
1,college


### The Null and the Alternative Hypothesis is:
- Ho : There is no significant difference between the observed and expected values.
- H1 : There is significant difference between the observed and expected values.    

In [522]:
# calculate the count for each category of the variable 'education'
# dataset - 'students_data.csv'

obs_value = df_s['education'].value_counts()
obs_value

college               225
associate's degree    222
high school           197
Ph.D.                 179
bachelor's degree     118
master's degree        59
Name: education, dtype: int64

In [526]:

# Calculate the count of the each category of the variable 'education'.
# datset - 'demographic_data.csv'
exp_count = df_d['education'].value_counts()


# calculate the expected value.
exp_value = (expected_count/len(df_d))*len(df_s)
print(exp_value)


college               203.583062
associate's degree    195.439739
high school           179.153094
Ph.D.                 171.009772
bachelor's degree     153.094463
master's degree        97.719870
Name: education, dtype: float64


In [527]:
# create a list of observed values and expected values:
print("Observed Values: ", list(obs_value))
print("Expected Values: ", list(exp_value))

Observed Values:  [225, 222, 197, 179, 118, 59]
Expected Values:  [203.5830618892508, 195.4397394136808, 179.1530944625407, 171.0097719869707, 153.09446254071662, 97.7198697068404]


In [528]:
# For alpha = 0.1 and degree of freedom = 5 (len(obs_val - 1))

In [529]:
# Calculate the chi-square value for 90% confidence interval:

chi2_val = stats.chi2.isf(q=0.1,df=5)
print('The critical value for chi-square is:',round(chi2_val,2))

The critical value for chi-square is: 9.24


In [531]:
# Calculate the chi2_score and p-value with the inbuilt function:

from scipy.stats import chi2_contingency
from scipy.stats import chi2
from scipy.stats import chisquare

chi2_score, pval = chisquare(f_obs = obs_value, f_exp = exp_value)
print('chi2_score:',round(chi2_score,2))
print('P-value:',round(pval,2))

chi2_score: 31.4
P-value: 0.0


##### Conclusion:

- The above output shows that the chi-square test statistic is greater than 9.2364.
- and the p-value is less than 0.1.
- thus, we reject the null hypothesis and conclude that there is a significant difference between the observed and expected values.

### The null and alternative hypothesis is:

- H0: The manager's claim is correct
- H1: The manager's claim is not correct

In [532]:
# For alpha = 0.05 and degree of freedom 4-1 = 3.

In [577]:
# observerd value :
obs_val = [25, 50, 90, 15]

# expected count:
exp_count = [0.05, 0.38, 0.55, 0.02]  # in here always maintain the same order as in the given question

# Calculating the expected value:
# expected_value = (np.array(exp_count)*N)
np.round((np.array(exp_count)*180))
# above one is the similist way to cal expected value and then convert it into list manually
exp_val = [9,68,99,4]

In [578]:
# critical value techinque:

chi2_val = stats.chi2.isf(q=0.05,df=3)
print('The critical value for chi square test is:',round(chi2_val,2))

The critical value for chi square test is: 7.81


In [579]:
# Calculate the chi2_score and p-value with the inbuilt function:

from scipy.stats import chi2_contingency
from scipy.stats import chi2
from scipy.stats import chisquare

chi2_score, pval = chisquare(f_obs = obs_val, f_exp = exp_val)
print('chi2_score:',chi2_score)
print('P-value:',round(pval,2))

chi2_score: 64.2773321449792
P-value: 0.0


##### Conclusion:

- The above output shows that the chi-square test statistic is greater than 7.8147.
- and the p-value is less than 0.05.
- Thus, we reject the null hypothesis and conclude that manager's claim is not correct.

# 2. Chi-Square Test for Independence 
- This test is used to **test whether the categorical variables are independent or not**.

- 𝐻0 : The variables are independent

- 𝐻1 : The variables are not independent

In [580]:
df_s = pd.read_csv('students_data.csv')
df_s.head(2)

Unnamed: 0,gender,ethnicity,education,lunch,test_prep_course,math_score,reading_score,writing_score,total_score,training_institute
0,female,group B,bachelor's degree,standard,none,89,55,56,200,Nature Learning
1,female,group C,college,standard,completed,55,63,72,190,Nature Learning


### The Null and Alternative Hypothesis is:
- Ho : The variables gender and education are independent.
- H1 : The variables gender and education are not independent.

In [584]:
# crosstab the categorical data:
table = pd.crosstab(df_s['gender'],df_s['education'])

# lets take the values of the table as observed values:
obs_val = table.values
obs_val

array([[ 91, 116,  63, 117,  94,  36],
       [ 88, 106,  55, 108, 103,  23]], dtype=int64)

In [585]:
# the aplha is 0.05.
# Degree of the freedom is =  no. of observations - 1 (6-1) = 5

In [586]:
# lets calculate the critical value:
chi2_val = stats.chi2.isf(q=0.05,df=5)
print('The critical value for chi square test is:',round(chi2_val,2))

The critical value for chi square test is: 11.07


In [588]:
# let us calculate the chi2 score and pval for 2 categorical variable:

stat,p,dof,expected_value = chi2_contingency(observed = obs_val, correction=False)
print('chi2_score:',stat)
print('P-value:',p)
print('degree of freedom:',dof)
print('expected_value:',expected_value)


chi2_score: 3.5267538812534243
P-value: 0.6193433487137843
degree of freedom: 5
expected_value: [[ 92.543 114.774  61.006 116.325 101.849  30.503]
 [ 86.457 107.226  56.994 108.675  95.151  28.497]]


##### Conclusion:

- The above output shows that the chi-square test statistic is less than 11.0705.
- and the p-value is greater than 0.05.
- thus we fail to reject (i.e. accept) the null hypothesis and conclude that the variables gender and education are independent.

### The null and alternative hypothesis is:

- H0: The zygote type and infection with malaria parasite is independent
- H1: The zygote type and infection with malaria parasite is not independent

In [589]:
# Observed value:

obs_val = np.array([[93,51],[68,40]])

In [590]:
# alpha is 0.05
# dof = 2-1 = 1

In [591]:
# critical value method:

chi2_val = stats.chi2.isf(q=0.05,df=1)
print('the critical value is:',chi2_val)

the critical value is: 3.8414588206941285


In [593]:
# chi2_score and p-value

stat,p,dof,exp_val = chi2_contingency(observed=obs_val, correction=False)
print("Test statistic:",stat)
print("p-value:", p)
print("Degrees of freedom:", dof)
print("Expected values:", exp_val)

Test statistic: 0.07023411371237459
p-value: 0.790996215494177
Degrees of freedom: 1
Expected values: [[92. 52.]
 [69. 39.]]


##### Conclusion:

- The above output shows that the chi-square test statistic is less than 3.8415.
- and the p-value is greater than 0.05.
- Thus we fail to reject (i.e. accept) the null hypothesis and conclude that the zygote type and infection of the malaria parasite are independent.

# One-way ANOVA (analysis of variances)
- It is used to check the **equality of population means for more than two independent samples**.
- 𝐻0 : The averages of all treatments are the same.
- 𝐻1 : At least one treatment has a different average.

In [595]:
df_s = pd.read_csv('students_data.csv')
df_s.head(2)

Unnamed: 0,gender,ethnicity,education,lunch,test_prep_course,math_score,reading_score,writing_score,total_score,training_institute
0,female,group B,bachelor's degree,standard,none,89,55,56,200,Nature Learning
1,female,group C,college,standard,completed,55,63,72,190,Nature Learning


### The Null and Alternative Hypothesis is:

- H0: The average score of all races/ethnicities is same
- H1: At least one race/ethnicity has a different average score

In [597]:
# unique race/ethnicity in the data
df_s['ethnicity'].unique() 

array(['group B', 'group C', 'group A', 'group D', 'group E'],
      dtype=object)

In [None]:
# There are total 5 unique race/ethnicity in the dataset.

In [600]:
# lets separate the data in ethicity variable based on unique groups:

nA = df_s[df_s['ethnicity']=='group A']['total_score']
nB = df_s[df_s['ethnicity']=='group B']['total_score']
nC = df_s[df_s['ethnicity']=='group C']['total_score']
nD = df_s[df_s['ethnicity']=='group D']['total_score']
nE = df_s[df_s['ethnicity']=='group E']['total_score']

In [602]:
# Let us check the normality of the total marks of students from all the groups.
stat, p_value = stats.shapiro(df_s['total_score'])
print('p-value:', p_value)

p-value: 0.7420849204063416


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the total marks of students from each group are normally distributed. Thus the assumption of normality is satisfied.

In [605]:
# Let us check the equality of variances.

stat, pval = stats.levene(nA,nB,nC,nD,nE)
print('P-Value:', pval)

P-Value: 0.12649444001357793


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the population variances are equal for all the samples.

In [608]:
# obtain the number of unique ethnicity:
t =  df_s['ethnicity'].nunique()
print('t:',t)

# calculate the number of students in the each ethicity:
N = df_s['ethnicity'].value_counts().sum()
print('N:',N)

t: 5
N: 1000


##### Alpha = 0.05,     dof numerator ( t - 1 ) = 4  ,     dof denominator ( N - t ) = 995

In [609]:
# calculate the critical value for F-test:

f = stats.f.isf(q=0.05, dfn=4, dfd=995)
print('the critical value for f-test is:',f)

the critical value for f-test is: 2.3808758069291818


In [612]:
# Perform one way Anova:

f_test, pval = stats.f_oneway(nA,nB,nC,nD,nE)
print('f-score:', f_test)
print('p-value:', pval)     

f-score: 0.789109595922189
p-value: 0.5322937031083035


##### Conclusion:

- The above output shows that the test statistic is less than 2.3809.
- and the p-value is greater than 0.05.
- Thus we fail to reject (i.e. accept) the null hypothesis and conclude that the average score of all races/ethnicities is the same.

In [613]:
A = [68.7, 75.4, 70.9, 79.1, 78.2]
B = [62.7, 68.5, 63.1, 62.2, 60.3]
C = [55.9, 56.1, 57.3, 59.2, 50.1]
D = [80.7, 70.3, 80.9, 85.4, 82.3]

### The null and alternative hypothesis is:

- H0: The average tensile strength due to all the machines is the same
- H1: The average tensile strength due to at least one machines is different

In [614]:
# let us put all the values in the list strength to perform shapiro test:

strength = [68.7, 75.4, 70.9, 79.1, 78.2,62.7, 68.5, 63.1, 62.2, 60.3,55.9, 56.1, 57.3, 59.2, 50.1,80.7, 70.3, 80.9, 85.4, 82.3]

In [616]:
# perform Shapiro-Wilk test to test the normality

stat, pval = shapiro(strength)
print('p-value:', pval)

p-value: 0.3721875548362732


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the tensile strengths due to all the machines are normally distributed. Thus the assumption of normality is satisfied.

In [617]:
# let us perform Levene's test for the equality of variances 

stat, pval = stats.levene(A,B,C,D)
print('P-value:',pval)

P-value: 0.7570021212992085


##### From the above result, we can see that the p-value is greater than 0.05, thus we can say that the population variances are equal for all the samples.

- Here t (=number of machines) = 4, N (=total observations) = 20

-  For ⍺ = 0.05 and degrees of freedom = (t-1, N-t) = (3, 16), calculate the critical value.

In [620]:
# let us find critical value for f-test

f_value = stats.f.isf(q=0.05, dfn=3, dfd=16)
print('the critical value for f-test is:',f_value)

the critical value for f-test is: 3.238871517453585


In [621]:
# let us find f-score and p_value:
# perform one way anova

f_score, pval = stats.f_oneway(A,B,C,D)
print('F-score:',f_score)
print('P-value:',pval)

F-score: 32.03072350199285
P-value: 5.375613532781072e-07


##### Conclusion:

- The above output shows that the test statistic is greater than 3.2389.
- and the p-value is less than 0.05.
- Thus we reject the null hypothesis and conclude that the average tensile strength due to at least one machine is different.

# Post-hoc Analysis:
- **If one-way ANOVA rejects the null hypothesis**; we conclude that at least one treatment has a different mean. 
- **The test does not distinguish a treatment with the different average value**. The post-hoc test or multi comparison test is used to identify such treatment(s).

In [647]:
# lets create a dataframe using given data:

df = pd.DataFrame(data = {'machine': ['machine_A','machine_B','machine_C','machine_D']*5, 
                                  'strength': [68.7, 62.7, 55.9, 80.7, 75.4, 68.5, 56.1, 70.3, 70.9, 63.1, 57.3, 80.9, 79.1, 
                                               62.2, 59.2, 85.4, 78.2, 60.3, 50.1, 82.3]})
df.head()

Unnamed: 0,machine,strength
0,machine_A,68.7
1,machine_B,62.7
2,machine_C,55.9
3,machine_D,80.7
4,machine_A,75.4


In [652]:
# perform tukey's range test to compare the mean efficiency for pair of machines
# pass the tensile strength to the parameter, 'data'
# pass the name of the machine to the parameter, 'groups'

!pip install scikit_posthocs
import scikit_posthocs

import statsmodels.stats.multicomp as mc
comp = mc.MultiComparison(data = df['strength'], groups=df['machine'])

# tuskey's range test:
post_hoc = comp.tukeyhsd()

# print the summary table
post_hoc.summary()



group1,group2,meandiff,p-adj,lower,upper,reject
machine_A,machine_B,-11.1,0.0044,-18.8842,-3.3158,True
machine_A,machine_C,-18.74,0.001,-26.5242,-10.9558,True
machine_A,machine_D,5.46,0.2265,-2.3242,13.2442,False
machine_B,machine_C,-7.64,0.0553,-15.4242,0.1442,False
machine_B,machine_D,16.56,0.001,8.7758,24.3442,True
machine_C,machine_D,24.2,0.001,16.4158,31.9842,True


##### Conclusion:

- The reject=False for pairs (machine_A, machine_D) and (machine_B, machine_C) denotes that **we fail to reject the null hypothesis**.
- and conclude that the **average tensile strength due to machine_A and machine_D, machine_B and machine_C is same**.
- For the pairs (machine_A, machine_B), (machine_A, machine_C), (machine_B, machine_D), and (machine_C, machine_D) the **average tensile strength is not the same**.
- The values in the **columns lower and upper represent the lower and upper bound of the 95% confidence interval for the mean difference**.
