## Lab Inferential Statistics

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
# Read the machine timings from the text file, passing the tab separator and
# UTF-16 encoding explicitly.
test_txt = pd.read_csv(
    'machine.txt',
    sep='\t',
    encoding='utf_16',
)

In [37]:
# The columns were already created by hand (see the cells below); this import
# was added afterwards only to check that the file itself can be read.
test_txt

Unnamed: 0,New machine,Old machine
0,42.1,42.7
1,41.0,43.6
2,41.3,43.8
3,41.8,43.3
4,42.4,42.5
5,42.8,43.5
6,43.2,43.1
7,42.3,41.7
8,41.8,44.0
9,42.7,44.1


In [15]:
# Hand-typed timings for the two machines, ten observations each.
new_machine = [
    42.1, 41, 41.3, 41.8, 42.4,
    42.8, 43.2, 42.3, 41.8, 42.7,
]

old_machine = [
    42.7, 43.6, 43.8, 43.3, 42.5,
    43.5, 43.1, 41.7, 44, 44.1,
]

In [16]:
# Column name -> list of values; old machine first, new machine second.
d = dict(old_machine=old_machine, new_machine=new_machine)

{'old_machine': [42.7, 43.6, 43.8, 43.3, 42.5, 43.5, 43.1, 41.7, 44, 44.1], 'new_machine': [42.1, 41, 41.3, 41.8, 42.4, 42.8, 43.2, 42.3, 41.8, 42.7]}


{'old_machine': [42.7, 43.6, 43.8, 43.3, 42.5, 43.5, 43.1, 41.7, 44, 44.1],
 'new_machine': [42.1, 41, 41.3, 41.8, 42.4, 42.8, 43.2, 42.3, 41.8, 42.7]}

In [17]:
# Build the analysis frame from the column dict (one column per machine).
data = pd.DataFrame(data=d)

In [19]:
# Show up to 11 rows; the lists above hold ten values each, so this displays
# the whole frame.
data.head(n=11)

Unnamed: 0,old_machine,new_machine
0,42.7,42.1
1,43.6,41.0
2,43.8,41.3
3,43.3,41.8
4,42.5,42.4
5,43.5,42.8
6,43.1,43.2
7,41.7,42.3
8,44.0,41.8
9,44.1,42.7


## Part 1 Machines - One tailed sample test

In [33]:
# Sample means of the two columns: old machine first, then new machine.
display(data['old_machine'].mean(), data['new_machine'].mean())

43.230000000000004

42.14

In [None]:
# H0: The mean of the new machine is equal to (or higher than) the mean of the old machine
# H1: The mean of the new machine is lower than the mean of the old machine (one-tailed)

In [30]:
# Use the old machine's sample mean (rounded to 3 decimals) as the reference
# value, then run a two-sided one-sample t-test on the new machine's timings.
old_machine_mean = round(np.mean(data['old_machine']), 3)
st.ttest_1samp(a=data['new_machine'], popmean=old_machine_mean)

Ttest_1sampResult(statistic=-5.043318535038297, pvalue=0.0006966376076759338)

In [32]:
# One-tailed test of H1: mean(new machine) < mean(old machine).
# Reuse old_machine_mean instead of re-typing the literal 43.23, and only halve
# the two-sided p-value when the t-statistic points in the hypothesised
# direction (negative); otherwise the one-tailed p-value is 1 - p/2.
# (SciPy >= 1.6 can compute this directly with alternative='less'.)
t_statistic, p_two_sided = st.ttest_1samp(data['new_machine'], old_machine_mean)
p_one_sided = p_two_sided / 2 if t_statistic < 0 else 1 - p_two_sided / 2
print('p value (single tailed): ', p_one_sided)

p value (single tailed):  0.0003483188038379669


In [None]:
# The p-value is below 0.05. As the t-statistic is negative, the new machine is quicker.

In [20]:
# Extra cross-check (just for fun): independent two-sample t-test without
# assuming equal variances.
# H0: The old and the new machine have the same mean
# H1: The means differ (two-sided: either higher or lower)
st.ttest_ind(a=data['old_machine'], b=data['new_machine'], equal_var=False)

Ttest_indResult(statistic=3.397230706117603, pvalue=0.0032422494663179747)

In [34]:
# The p-value is below 0.05. The average time of the new machine is indeed lower.

## Part 2: Pokemon -  Matched Pair Test

In [26]:
# Load the pokemon table and preview its first five rows.
pokemon = pd.read_csv(filepath_or_buffer='pokemon.csv')
pokemon.head(n=5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [None]:
# H0: The mean difference between Attack and Defense (paired per pokemon) is zero
# H1: The mean difference between Attack and Defense (paired per pokemon) is not zero

In [24]:
# Paired (matched-pair) t-test: each pokemon contributes one Attack/Defense pair.
st.ttest_rel(a=pokemon['Attack'], b=pokemon['Defense'])

Ttest_relResult(statistic=4.325566393330478, pvalue=1.7140303479358558e-05)

In [28]:
# The p-value is very low, so H0 is rejected: Attack and Defense differ significantly.
# The column means below (Attack first, then Defense) show the direction of the difference.
display(pokemon['Attack'].mean(), pokemon['Defense'].mean())

79.00125

73.8425

## Extra: Anova Test

In [None]:
# Comparing multiple groups at the same time

In [39]:
# Load the etching-rate measurements from the lab workbook.
anova = pd.read_excel(io='anova_lab_data.xlsx')

In [51]:
# Report (rows, columns), then preview up to the first 15 rows.
print(anova.shape)
anova.head(n=15)

(15, 2)


Unnamed: 0,power,etching_rate
0,160 W,5.43
1,180 W,6.24
2,200 W,8.79
3,160 W,5.71
4,180 W,6.71
5,200 W,9.2
6,160 W,6.22
7,180 W,5.98
8,200 W,7.9
9,160 W,6.01


In [48]:
# Inspect the column labels before they are renamed to snake_case.
print(anova.columns)

Index(['Power ', 'Etching Rate'], dtype='object')


In [50]:
# Normalise the column labels to snake_case ('Power ' has a trailing space).
anova = anova.rename(
    {'Power ': 'power', 'Etching Rate': 'etching_rate'},
    axis='columns',
)

In [95]:
# Number the observations within each power level (0, 1, 2, ...); this counter
# becomes the row index of the pivoted table.
anova['power_count'] = anova.groupby('power').cumcount()

# Reshape from long to wide: one column per power level.
anova_pivot = anova.pivot(index='power_count', columns='power', values='etching_rate')
# Derive the column names from the pivoted labels themselves ('160 W' -> '160_w')
# instead of assigning a fixed positional list, which would silently mislabel
# the groups if the power levels or their sort order ever changed.
anova_pivot.columns = [
    str(power_label).strip().lower().replace(' ', '_')
    for power_label in anova_pivot.columns
]
anova_pivot.head()

Unnamed: 0_level_0,160_w,180_w,200_w
power_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5.43,6.24,8.79
1,5.71,6.71,9.2
2,6.22,5.98,7.9
3,6.01,5.66,8.15
4,5.59,6.6,7.55


In [None]:
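# H0: The mean etching rate is the same for all power levels
# H1: At least one power level has a different mean etching rate, i.e. watts matter
# Significance level: 0.05 (the conventional default). It is not an argument of the test; to change it, compare the p-value against a different threshold.
# Degrees of freedom: with k = 3 groups and N = 15 observations, df_between = k - 1 = 2 and df_within = N - k = 12 (total N - 1 = 14).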

In [97]:
# One-way ANOVA across the three power-level columns.
st.f_oneway(
    *(anova_pivot[watt_column] for watt_column in ('160_w', '180_w', '200_w'))
)

F_onewayResult(statistic=36.87895470100505, pvalue=7.506584272358903e-06)

In [98]:
# Mean etching rate per power level (column-wise mean).
anova_pivot.mean(axis=0)

160_w    5.792
180_w    6.238
200_w    8.318
dtype: float64

In [None]:
# The p-value is far below 0.05, so we reject H0: the power (watts) has a significant effect on the etching rate.