In [1]:
import pandas as pd
import numpy as np
from pyDOE2 import *

In [2]:
sign = fracfact('a b c abc ab ac bc')
print("Signs for a, b, c, d, ab, ac, bc")
print(sign)
print("Signs for just a, b, c, and d")
grid = sign[:, :4]
grid

Signs for a, b, c, d, ab, ac, bc
[[-1. -1. -1. -1.  1.  1.  1.]
 [ 1. -1. -1.  1. -1. -1.  1.]
 [-1.  1. -1.  1. -1.  1. -1.]
 [ 1.  1. -1. -1.  1. -1. -1.]
 [-1. -1.  1.  1.  1. -1. -1.]
 [ 1. -1.  1. -1. -1.  1. -1.]
 [-1.  1.  1. -1. -1. -1.  1.]
 [ 1.  1.  1.  1.  1.  1.  1.]]
Signs for just a, b, c, and d


array([[-1., -1., -1., -1.],
       [ 1., -1., -1.,  1.],
       [-1.,  1., -1.,  1.],
       [ 1.,  1., -1., -1.],
       [-1., -1.,  1.,  1.],
       [ 1., -1.,  1., -1.],
       [-1.,  1.,  1., -1.],
       [ 1.,  1.,  1.,  1.]])

In [3]:
factor_names = ['approach', 'model', 'cpu', 'parallelism']
repetitions = 5

# Guessing, make sure to fix these values later
levels = {
    factor_names[0]: {
        'low': 'lazy',
        'high': 'predictive',
    },
    factor_names[1]: {
        'low': 'Cifar10',
        'high': 'FashionMNISTCNN',
    },
    factor_names[2]: {
        'low': '1 core',
        'high': '4 cores',
    },
    factor_names[3]: {
        'low': '1',
        'high': '4',
    }
}

runs_levels = [tuple([levels[fac]['low'] if r[i] == -1 else levels[fac]['high'] for i, fac in enumerate(factor_names)]) for r in grid]

runs_levels

[('lazy', 'Cifar10', '1 core', '1'),
 ('predictive', 'Cifar10', '1 core', '4'),
 ('lazy', 'FashionMNISTCNN', '1 core', '4'),
 ('predictive', 'FashionMNISTCNN', '1 core', '1'),
 ('lazy', 'Cifar10', '4 cores', '4'),
 ('predictive', 'Cifar10', '4 cores', '1'),
 ('lazy', 'FashionMNISTCNN', '4 cores', '1'),
 ('predictive', 'FashionMNISTCNN', '4 cores', '4')]

In [4]:
arrays = [
    [x[0] for x in runs_levels],
    [x[1] for x in runs_levels],
    [x[2] for x in runs_levels],
    [x[3] for x in runs_levels],
]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=factor_names)
index

MultiIndex([(      'lazy',         'Cifar10',  '1 core', '1'),
            ('predictive',         'Cifar10',  '1 core', '4'),
            (      'lazy', 'FashionMNISTCNN',  '1 core', '4'),
            ('predictive', 'FashionMNISTCNN',  '1 core', '1'),
            (      'lazy',         'Cifar10', '4 cores', '4'),
            ('predictive',         'Cifar10', '4 cores', '1'),
            (      'lazy', 'FashionMNISTCNN', '4 cores', '1'),
            ('predictive', 'FashionMNISTCNN', '4 cores', '4')],
           names=['approach', 'model', 'cpu', 'parallelism'])

In [5]:
dim_sizes = {'runs': len(runs_levels), 'repetitions': repetitions}
df = pd.DataFrame(
    np.abs(np.random.randn(dim_sizes['runs'], dim_sizes['repetitions']))*500,
#     np.zeros((dim_sizes['runs'], dim_sizes['repetitions']))+1,
    index=index
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1,2,3,4
approach,model,cpu,parallelism,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lazy,Cifar10,1 core,1,28.113781,51.370671,327.524931,445.910746,833.331273
predictive,Cifar10,1 core,4,622.754485,249.580581,768.95504,525.829807,407.598666
lazy,FashionMNISTCNN,1 core,4,388.341814,423.380094,799.410214,82.015917,9.416894
predictive,FashionMNISTCNN,1 core,1,1128.29471,141.430988,114.584057,1307.411514,357.965533
lazy,Cifar10,4 cores,4,861.384568,469.708888,512.556791,907.393045,547.133312
predictive,Cifar10,4 cores,1,181.673002,176.288688,266.485645,950.669089,578.529232
lazy,FashionMNISTCNN,4 cores,1,1239.203677,497.125715,693.119009,651.177075,630.251712
predictive,FashionMNISTCNN,4 cores,4,166.679302,244.732892,259.732532,44.258924,670.217773


In [6]:
q = np.zeros((2**(4-1)))  # +1 for a spot for q0
# now add column of 1's for the 'I' column
sign_table = np.array([ np.concatenate((np.array([1.]), x)) for x in sign])
q, sign_table

(array([0., 0., 0., 0., 0., 0., 0., 0.]),
 array([[ 1., -1., -1., -1., -1.,  1.,  1.,  1.],
        [ 1.,  1., -1., -1.,  1., -1., -1.,  1.],
        [ 1., -1.,  1., -1.,  1., -1.,  1., -1.],
        [ 1.,  1.,  1., -1., -1.,  1., -1., -1.],
        [ 1., -1., -1.,  1.,  1.,  1., -1., -1.],
        [ 1.,  1., -1.,  1., -1., -1.,  1., -1.],
        [ 1., -1.,  1.,  1., -1., -1., -1.,  1.],
        [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]]))

See box 18.1 for the following steps

$$q_j = \frac{1}{2^{k-1}} \sum_{i=1}^{k-1}S_{ij}\bar{y}_i$$

Where $k=4$ and $S_{ij}$ is the $(i,j)$ entry in the `sign_table`.

In [7]:
for j in range(len(q)):
    q[j] = (1/(2**(4-1))) * np.sum([sign_table[i][j] * df.iloc[i].mean() for i in range(2**(4-1))])
q

array([ 489.03856469,  -30.85494167,    3.39895255,   38.37747887,
        -40.98448777,  -18.05175306, -142.63439398,  -21.16513506])

To calculate the sum of squares for each effect (as well as SS0/q0)

$$SSj = 2^{k-1}rq_j^2$$ for $j=0,1, 2,\dots,2^{k-1}-1$

In [8]:
SS = np.zeros((2**(4-1)))
for j in range(len(SS)):
    SS[j] = (2**(4-1)) * repetitions * (q[j] ** 2)
SS

array([9.56634871e+06, 3.80810970e+04, 4.62115137e+02, 5.89132354e+04,
       6.71891295e+04, 1.30346315e+04, 8.13782814e+05, 1.79185177e+04])

Now for $SSY$, $SST$, and $SSE$
$$SSY=\sum_{i=1}^{2^{k-1}}\sum_{j=1}^{r}y_{ij}^2$$
$$SST = SSY - SS0$$
$$SSE = SST - \sum_{j=1}^{2^{k-1}-1}SSj$$

In [9]:
SSY = np.sum([np.sum([df.iloc[i, j]**2 for j in range(repetitions)]) for i in range(2**(4-1))])
SST = SSY - SS[0]
SSE = SST - np.sum([SS[j] for j in range(1, 2**(4-1))])
SSY, SST, SSE

(14019912.684396196, 4453563.974179788, 3444182.433995825)

% of `y`'s variation explained by `j`th effect
$$(SSj/SST) \cdot 100\%$$

In [10]:
var_expl = np.zeros((2**(4-1)))
for j in range(2**(4-1)):
    var_expl[j] = (SS[j] / SST) * 100
[round(x, 2) for x in var_expl[1:]]  # just to pretty-print

[0.86, 0.01, 1.32, 1.51, 0.29, 18.27, 0.4]

Standard deviation of errors:
$$s_e = \sqrt{\frac{SSE}{2^{k-1}(r-1)}}$$

In [11]:
s_e = np.sqrt(SSE/(2**(4-1)*(repetitions-1)))
s_e

328.0711829197583

In [12]:
s_qj = np.array([s_e / np.sqrt(2**(4-1)*repetitions)] * (2**(4-1)))
s_qj

array([51.87260863, 51.87260863, 51.87260863, 51.87260863, 51.87260863,
       51.87260863, 51.87260863, 51.87260863])

## Confidence Intervals
To find the confidence interval for effect $i$ the following equation is used:
$$q_i \pm t_{[1-\alpha/2; 2^{k-1}(r-1)]}s_{qi}$$

In [13]:
from scipy import stats

In [14]:
conf_interval = np.zeros_like((q))[1:]  # remove q0
dof = 2**(4-1)*(repetitions-1)
for i in range(len(conf_interval)):
    interval = stats.t.cdf(q[i+1], dof) * s_qj[i+1]
    conf_interval[i] = interval
[(round(q[i+1] - interval, 2), round(q[i+1] + interval, 2)) for i, interval in enumerate(conf_interval)]

[(-30.85, -30.85),
 (-48.43, 55.22),
 (-13.5, 90.25),
 (-40.98, -40.98),
 (-18.05, -18.05),
 (-142.63, -142.63),
 (-21.17, -21.17)]

## ANOVA

See table 22.5

In [15]:
MSE = s_e**2
# all degrees of freedoms of factors are 1 (since all levels are just 2), so MS is the same as SS
# just have to slice [1:] to get rid of the SS0 term
MS = np.copy(SS[1:])

F_vals = MS/MSE

p_vals = np.zeros_like(F_vals)
error_degree_of_freedom = 2**(4-1)*(repetitions-1)

for i in range(len(F_vals)):   
    p_vals[i] = 1-stats.f.cdf(F_vals[i], 1, error_degree_of_freedom)

print(f"MSE = {round(MSE, 2)}, dof for error = {error_degree_of_freedom}")
pd.DataFrame(np.stack((MS, F_vals, p_vals)).T, columns=["Mean Square", "F Computed", "F Table"])

MSE = 107630.7, dof for error = 32


Unnamed: 0,Mean Square,F Computed,F Table
0,38081.097017,0.353813,0.556146
1,462.115137,0.004294,0.948164
2,58913.23539,0.547365,0.46479
3,67189.129524,0.624256,0.435286
4,13034.631541,0.121105,0.730118
5,813782.813882,7.56088,0.009728
6,17918.517692,0.166481,0.685975
