# Lab | Inferential statistics - T-test & P-value


In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# 1) One-tailed t-test 
data= pd.read_csv('files_for_lab/machine.txt', sep='\t', encoding='UTF16')

In [13]:
data.columns=['New machine', 'Old machine']

In [14]:
data

Unnamed: 0,New machine,Old machine
0,42.1,42.7
1,41.0,43.6
2,41.3,43.8
3,41.8,43.3
4,42.4,42.5
5,42.8,43.5
6,43.2,43.1
7,42.3,41.7
8,41.8,44.0
9,42.7,44.1


In [31]:
# Null hypothesis: new machine packs faster than old machine
# H0: Mean(New machine) < Mean(Old machine) 
# H1: Mean(New machine) >= Mean(Old machine) 
# signicance level = 5%
t_stat, p_value = st.ttest_ind(data['New machine'], data['Old machine'], alternative='greater')
print("t-statistic:" , t_stat)
print("p-value:" , p_value)

t-statistic: -3.3972307061176026
p-value: 0.9983944287496127


In [47]:
# alpha= 0.05, and since the p-value>alpha, we accept the null hypothesis that the new machine is faster than the old machine

In [35]:
# 2) Matched Pairs Test
pokemon = pd.read_csv('files_for_lab/pokemon.csv')

In [36]:
pokemon

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [42]:
# Null hypothesis: Defense and attack scores are equal
# H0: Mean defense scores = Mean attack scores
# H1: Mean defense scores != Mean attack scores
# signicance level = 0.05
t_stat, p_val= st.ttest_rel(pokemon['Defense'],pokemon['Attack'])
print("t-statistic:" , t_stat)
print("p-value:" , p_val)

t-statistic: -4.325566393330478
p-value: 1.7140303479358558e-05


In [45]:
tc = st.t.ppf(1-(0.05/2),df= pokemon.shape[0] - 1)
tc
# t-statistic is < -tc
# p-value > 1-(0.05/2)
# therefore we reject the null hypothesis

1.9629374611056056

# Inferential statistics - ANOVA


In [None]:
# 3) 
# Null hypothesis: All the sample means are the same 𝐻0:𝜇1=𝜇2=𝜇3=⋯=𝜇𝑛
# Alternate hypothesis: Not all the sample means are equal 𝐻1:𝜇𝑖≠𝜇𝑗
# Significance level: 0.05

In [53]:
df= pd.read_excel('files_for_lab/anova_lab_data.xlsx')
df

Unnamed: 0,Power,Etching Rate
0,160 W,5.43
1,180 W,6.24
2,200 W,8.79
3,160 W,5.71
4,180 W,6.71
5,200 W,9.2
6,160 W,6.22
7,180 W,5.98
8,200 W,7.9
9,160 W,6.01


In [67]:
df.columns = ['Power', 'Etching Rate']
group_df = df.groupby('Power')['Etching Rate'].agg(Etching_mean='mean').reset_index()
group_df

Unnamed: 0,Power,Etching_mean
0,160 W,5.792
1,180 W,6.238
2,200 W,8.318


In [68]:
S2t = 0
for p in df['Power'].unique():
    ng = len(df[df['Power'] == p])
    S2t  += ng * ( ( df[df['Power'] == p]['Etching Rate'].mean() - df['Etching Rate'].mean() ) ** 2)
S2t /= ( df['Power'].nunique() - 1 ) 
print("The value of S2t is {:.2f}".format(S2t))

The value of S2t is 9.09


In [70]:
S2E = 0
for p in df['Power'].unique():
    for rate in df[df['Power'] == p]['Etching Rate']:
        S2E += ( rate - df[df['Power'] == p]['Etching Rate'].mean() ) ** 2
S2E /= ( len(df) - df['Power'].nunique() )

print("The value of S2E is {:.2f}".format(S2E))

The value of S2E is 0.25


In [73]:
F = S2t / S2E
print("The value of F is {:.2f}".format(F))

The value of F is 36.88


In [74]:
d1 = df['Power'].nunique() - 1
d2 = len(df) - df['Power'].nunique()

print("Number of degrees of freedom d1: ",d1)
print("Number of degrees of freedom d2: ",d2)

Number of degrees of freedom d1:  2
Number of degrees of freedom d2:  12


In [75]:
st.f.cdf(F,dfn=d1, dfd=d2)

0.9999924934157276

In [77]:
1 - st.f.cdf(F,dfn=d1, dfd=d2)
# This is less than 0.05, therefore we reject the null hypothesis

7.5065842723986975e-06

In [78]:
# 4) 
print(st.f_oneway(df[df['Power'] == '160 W']['Etching Rate'],df[df['Power'] == '180 W']['Etching Rate'],
                  df[df['Power'] == '200 W']['Etching Rate']))

F_onewayResult(statistic=36.87895470100505, pvalue=7.506584272358903e-06)


In [79]:
# As the p_value is < 0.05 we reject the H0.