### One Sample T-test
##### The One Sample t Test determines whether the sample mean is statistically different from a known or hypothesised population mean

In [1]:
from scipy.stats import ttest_1samp
import numpy as np

In [2]:
age = [20,25,30,31,23,45,32,43,32,23,24,19,28,24,30,30,23,22,20,20,20,20,21,20]
t, pval = ttest_1samp(age, 25)
print('t statistic is', t)
print('pvalue is', pval)
if pval<0.05:
    print('Reject null hypothesis')
else:
    print('Failed to reject null hypothesis')
    

t statistic is 0.7245417462550013
pvalue is 0.47603981144203555
Failed to reject null hypothesis


### 2 Sample T-test
##### Compares the means of two independent groups in order to determine whether there is statistical evidence that the associated population means are significantly different

In [13]:
from scipy.stats import ttest_ind
data_1 = np.random.uniform(20,29,(20,))

In [14]:
data_2 = np.random.uniform(20.8,28,(20,))

In [18]:
print('the mean of data 1 is ',data_1.mean())
print('the mean of data 2 is ',data_2.mean())
print()
print(data_1)
print()
print(data_2)
print()
print('the standard deviation of data 1 is ',data_1.std())
print('the standard deviation of data 2 is ',data_2.std())
ts,pv = ttest_ind(data_1,data_2)
print()
print('the t statistic is ',ts)
print('the pval is', pv)
if pv<0.05:
    print('reject the null hypothesis')
else:
    print('failed to reject')

the mean of data 1 is  25.34022617212307
the mean of data 2 is  24.44094057743191

[23.61742533 28.28730889 23.19981179 28.37222875 28.40537935 24.55712437
 25.09834027 26.13592253 22.90620135 28.36039545 22.82544942 25.87952016
 24.35788149 22.76471131 22.55186421 26.88772238 26.28403133 28.84047134
 20.50050871 26.972225  ]

[26.81129308 21.84995418 23.89519774 24.85101729 24.84994685 25.15549958
 22.32533749 26.20869408 23.00889439 23.25311481 24.29415759 26.1784719
 26.27751035 22.64240246 21.59564684 26.22483759 22.22400068 27.08206166
 23.8876333  26.20313967]

the standard deviation of data 1 is  2.3956773625319725
the standard deviation of data 2 is  1.7430746761471738

the t statistic is  1.323082895427855
the pval is 0.19371438748788994
failed to reject


### Paired Sample T-test
##### Paired sampled t-test :- The paired sample t-test is also called dependent sample t-test. It’s an univariate test that tests for a significant difference between 2 related variables

In [19]:
import pandas as pd

In [23]:
data = {'patient':[1,2,3,4,5], 'sex':['Male','Male','Male','Male','Male'], 'agegrp':['30-45','30-45','30-45','30-45','30-45'], 'bp_before':[143,163,153,153,146], 'bp_after':[153,170,168,142,141]}
df = pd.DataFrame(data)
df

Unnamed: 0,patient,sex,agegrp,bp_before,bp_after
0,1,Male,30-45,143,153
1,2,Male,30-45,163,170
2,3,Male,30-45,153,168
3,4,Male,30-45,153,142
4,5,Male,30-45,146,141


In [24]:
from scipy.stats import ttest_rel

In [26]:
ttest, pval = ttest_rel(df['bp_before'], df['bp_after'])
print('the t statistic is ',ttest)
print('the pval is', pval)
print()
if pval<0.05:
    print("reject null hypothesis")
else:
    print("failed to reject the null hypothesis")

the t statistic is  -0.6609539078176221
the pval is 0.5447691927746274

failed to reject the null hypothesis


### One Sample Z Test

In [28]:
from statsmodels.stats import weightstats as ws

In [29]:
df['bp_before'].mean()

151.6

In [30]:
ztest,pval = ws.ztest(df['bp_before'], value=152)

In [31]:
pval

0.9079196418925185

In [32]:
if pval<0.05:
    print("reject null hypothesis")
else:
    print("failed to reject the null hypothesis")

failed to reject the null hypothesis


### 2 Sample z test

In [34]:
ztest,pval = ws.ztest(df['bp_before'], df['bp_after'],alternative='two-sided')

In [35]:
pval

0.6511994870939034

### Chi Square test

In [49]:
df_ti = pd.read_csv(r'C:\Users\DeLL\Documents\Python Files\Python by Nitish- 26th Nov 2023\Day15, 28.01.24\datasets\titanic.csv')

In [50]:
df_ti

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin,Name,Ticket
0,0,3,male,22.0,1,0,7.2500,Southampton,,"Braund, Mr. Owen Harris",A/5 21171
1,1,1,female,38.0,1,0,71.2833,Cherbourg,C85,"Cumings, Mrs. John Bradley (Florence Briggs Th...",PC 17599
2,1,3,female,26.0,0,0,7.9250,Southampton,,"Heikkinen, Miss. Laina",STON/O2. 3101282
3,1,1,female,35.0,1,0,53.1000,Southampton,C123,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",113803
4,0,3,male,35.0,0,0,8.0500,Southampton,,"Allen, Mr. William Henry",373450
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,Southampton,,"Montvila, Rev. Juozas",211536
887,1,1,female,19.0,0,0,30.0000,Southampton,B42,"Graham, Miss. Margaret Edith",112053
888,0,3,female,,1,2,23.4500,Southampton,,"Johnston, Miss. Catherine Helen ""Carrie""",W./C. 6607
889,1,1,male,26.0,0,0,30.0000,Cherbourg,C148,"Behr, Mr. Karl Howell",111369


In [52]:
df_Cro = pd.crosstab(df_ti['Sex'], df_ti['Embarked'])

In [53]:
df_Cro

Embarked,Cherbourg,Queenstown,Southampton
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,73,36,203
male,95,41,441


In [54]:
from scipy.stats import chi2_contingency as chi

In [55]:
chi2_stat, p_val, dof, expected = chi(df_Cro)

# Print results
print("\nChi-square statistic:", chi2_stat)
print("P-value:", p_val)
print("Degrees of freedom:", dof)
print("Expected frequencies table:")
print(expected)


Chi-square statistic: 13.355630515001746
P-value: 0.0012585245232290144
Degrees of freedom: 2
Expected frequencies table:
[[ 58.96062992  27.02362205 226.01574803]
 [109.03937008  49.97637795 417.98425197]]


In [56]:
if p_val<0.05:
    print('Reject NUll')
else:
    print('Failed to reject')

Reject NUll


### One way ANOVA

In [36]:
df_anova = pd.read_csv(r"probability\Hypotheis testing Code+Data\PlantGrowth.csv")

In [39]:
df_anova = df_anova.iloc[:,[1,2]]

In [40]:
df_anova

Unnamed: 0,weight,group
0,4.17,ctrl
1,5.58,ctrl
2,5.18,ctrl
3,6.11,ctrl
4,4.5,ctrl
5,4.61,ctrl
6,5.17,ctrl
7,4.53,ctrl
8,5.33,ctrl
9,5.14,ctrl


In [44]:
grps = df_anova['group'].unique()

In [45]:
grps

array(['ctrl', 'trt1', 'trt2'], dtype=object)

In [62]:
df_anova['weight'][df_anova.group == 'trt2']

20    6.31
21    5.12
22    5.54
23    5.50
24    5.37
25    5.29
26    4.92
27    6.15
28    5.80
29    5.26
Name: weight, dtype: float64

In [63]:
mg = {i:df_anova['weight'][df_anova.group == i] for i in grps}

In [64]:
mg

{'ctrl': 0    4.17
 1    5.58
 2    5.18
 3    6.11
 4    4.50
 5    4.61
 6    5.17
 7    4.53
 8    5.33
 9    5.14
 Name: weight, dtype: float64,
 'trt1': 10    4.81
 11    4.17
 12    4.41
 13    3.59
 14    5.87
 15    3.83
 16    6.03
 17    4.89
 18    4.32
 19    4.69
 Name: weight, dtype: float64,
 'trt2': 20    6.31
 21    5.12
 22    5.54
 23    5.50
 24    5.37
 25    5.29
 26    4.92
 27    6.15
 28    5.80
 29    5.26
 Name: weight, dtype: float64}

In [65]:
from scipy.stats import f_oneway as f_o

In [67]:
f_test,p_val = f_o(mg['ctrl'], mg['trt1'], mg['trt2'])
print("p-value for significance is: ", pval)
if pval<0.05:
    print("reject null hypothesis")
else:
    print("failed to reject the null hypothesis")

p-value for significance is:  0.6511994870939034
failed to reject the null hypothesis


In [69]:
tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735,
             0.0659, 0.0923, 0.0836]
newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835,
           0.0725]
petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764,
           0.0689]
tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]

F,p = f_o(tillamook, newport, petersburg, magadan, tvarminne)
print("P Value calculated is ", p)

if p<0.05:
    print("We will reject the null hypothesis")
else:
    print("failed to reject the null hypothesis")

P Value calculated is  0.0002812242314534544
We will reject the null hypothesis


### Two Way Anova

In [70]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [71]:
data = {'water': np.repeat(['daily', 'weekly'], 15),
                   'sun': np.tile(np.repeat(['low', 'med', 'high'], 5), 2),
                   'height': [6, 6, 6, 5, 6, 5, 5, 6, 4, 5,
                              6, 6, 7, 8, 7, 3, 4, 4, 4, 5,
                              4, 4, 4, 4, 4, 5, 6, 6, 7, 8]}

In [73]:
df_tw = pd.DataFrame(data)

In [74]:
#perform 2 way anova

In [76]:
df_tw.head()

Unnamed: 0,water,sun,height
0,daily,low,6
1,daily,low,6
2,daily,low,6
3,daily,low,5
4,daily,low,6


In [77]:
model = ols('height ~ C(water) + C(sun) + C(water):C(sun)', data=df_tw).fit()

In [79]:
res = sm.stats.anova_lm(model, typ = 2)

In [80]:
res

Unnamed: 0,sum_sq,df,F,PR(>F)
C(water),8.533333,1.0,16.0,0.000527
C(sun),24.866667,2.0,23.3125,2e-06
C(water):C(sun),2.466667,2.0,2.3125,0.120667
Residual,12.8,24.0,,


In [89]:
for i in range(3):
    print(f'The null hypothesis is rejected for --> {res.index.values[i]}' if res['PR(>F)'].values[i]<0.05 else f'Failed to reject null hypothesis for --> {res.index.values[i]}')

The null hypothesis is rejected for --> C(water)
The null hypothesis is rejected for --> C(sun)
Failed to reject null hypothesis for --> C(water):C(sun)
