In [1]:
import pandas as pd
import numpy as np
from pyDOE2 import *
import csv

In [2]:
sign = fracfact('a b c abc ab ac bc')
print("Signs for a, b, c, d, ab, ac, bc")
print(sign)
print("Signs for just a, b, c, and d")
grid = sign[:, :4]
grid

Signs for a, b, c, d, ab, ac, bc
[[-1. -1. -1. -1.  1.  1.  1.]
 [ 1. -1. -1.  1. -1. -1.  1.]
 [-1.  1. -1.  1. -1.  1. -1.]
 [ 1.  1. -1. -1.  1. -1. -1.]
 [-1. -1.  1.  1.  1. -1. -1.]
 [ 1. -1.  1. -1. -1.  1. -1.]
 [-1.  1.  1. -1. -1. -1.  1.]
 [ 1.  1.  1.  1.  1.  1.  1.]]
Signs for just a, b, c, and d


array([[-1., -1., -1., -1.],
       [ 1., -1., -1.,  1.],
       [-1.,  1., -1.,  1.],
       [ 1.,  1., -1., -1.],
       [-1., -1.,  1.,  1.],
       [ 1., -1.,  1., -1.],
       [-1.,  1.,  1., -1.],
       [ 1.,  1.,  1.,  1.]])

In [3]:
factor_names = ['Approach', 'Model', 'CPU', 'Parallelism']
interaction_names = [f'{factor_names[0]} x {factor_names[1]}', f'{factor_names[0]} x {factor_names[2]}', f'{factor_names[1]} x {factor_names[2]}']
all_names = factor_names + interaction_names
repetitions = 5

# Guessing, make sure to fix these values later
levels = {
    factor_names[0]: {
        'low': 'lazy',
        'high': 'predictive',
    },
    factor_names[1]: {
        'low': 'Cifar10',
        'high': 'FashionMNISTCNN',
    },
    factor_names[2]: {
        'low': '1 core',
        'high': '4 cores',
    },
    factor_names[3]: {
        'low': '1',
        'high': '4',
    }
}

runs_levels = [tuple([levels[fac]['low'] if r[i] == -1 else levels[fac]['high'] for i, fac in enumerate(factor_names)]) for r in grid]

runs_levels

[('lazy', 'Cifar10', '1 core', '1'),
 ('predictive', 'Cifar10', '1 core', '4'),
 ('lazy', 'FashionMNISTCNN', '1 core', '4'),
 ('predictive', 'FashionMNISTCNN', '1 core', '1'),
 ('lazy', 'Cifar10', '4 cores', '4'),
 ('predictive', 'Cifar10', '4 cores', '1'),
 ('lazy', 'FashionMNISTCNN', '4 cores', '1'),
 ('predictive', 'FashionMNISTCNN', '4 cores', '4')]

In [4]:
with open('sign-table.csv', 'w', newline='') as csvfile:
    sign_table_writer = csv.writer(csvfile, delimiter=',')
    sign_table_writer.writerow([''] + [f for f in factor_names])
    for i in range(len(grid)):
        print([i+1] + ["+" if grid[i][j] == 1 else "-" for j in range(len(grid[i]))])
        sign_table_writer.writerow([i+1] + ["+" if grid[i][j] == 1 else "-" for j in range(len(grid[i]))])

[1, '-', '-', '-', '-']
[2, '+', '-', '-', '+']
[3, '-', '+', '-', '+']
[4, '+', '+', '-', '-']
[5, '-', '-', '+', '+']
[6, '+', '-', '+', '-']
[7, '-', '+', '+', '-']
[8, '+', '+', '+', '+']


In [5]:
arrays = [
    [x[0] for x in runs_levels],
    [x[1] for x in runs_levels],
    [x[2] for x in runs_levels],
    [x[3] for x in runs_levels],
]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=factor_names)
index

MultiIndex([(      'lazy',         'Cifar10',  '1 core', '1'),
            ('predictive',         'Cifar10',  '1 core', '4'),
            (      'lazy', 'FashionMNISTCNN',  '1 core', '4'),
            ('predictive', 'FashionMNISTCNN',  '1 core', '1'),
            (      'lazy',         'Cifar10', '4 cores', '4'),
            ('predictive',         'Cifar10', '4 cores', '1'),
            (      'lazy', 'FashionMNISTCNN', '4 cores', '1'),
            ('predictive', 'FashionMNISTCNN', '4 cores', '4')],
           names=['Approach', 'Model', 'CPU', 'Parallelism'])

In [6]:
dim_sizes = {'runs': len(runs_levels), 'repetitions': repetitions}
df = pd.DataFrame(
    np.abs(1 + np.random.randn(dim_sizes['runs'], dim_sizes['repetitions'])) * 60 * 4,
#     np.zeros((dim_sizes['runs'], dim_sizes['repetitions']))+1,
    index=index
)
df = np.log(df)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1,2,3,4
Approach,Model,CPU,Parallelism,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lazy,Cifar10,1 core,1,4.942295,6.070852,6.071118,5.550686,6.100229
predictive,Cifar10,1 core,4,5.570533,5.639107,5.936661,5.185222,5.71978
lazy,FashionMNISTCNN,1 core,4,6.088249,4.815053,5.985127,5.31552,4.797903
predictive,FashionMNISTCNN,1 core,1,6.56524,5.985115,5.760877,5.776456,5.238786
lazy,Cifar10,4 cores,4,5.081202,5.422071,4.01945,5.632217,4.991143
predictive,Cifar10,4 cores,1,4.20154,5.92583,5.733765,5.602797,5.198242
lazy,FashionMNISTCNN,4 cores,1,4.499277,6.219171,5.831698,5.67657,6.71296
predictive,FashionMNISTCNN,4 cores,4,4.190059,6.740783,4.308092,6.244053,4.86707


In [7]:
q = np.zeros((2**(4-1)))  # +1 for a spot for q0
# now add column of 1's for the 'I' column
sign_table = np.array([ np.concatenate((np.array([1.]), x)) for x in sign])
q, sign_table

(array([0., 0., 0., 0., 0., 0., 0., 0.]),
 array([[ 1., -1., -1., -1., -1.,  1.,  1.,  1.],
        [ 1.,  1., -1., -1.,  1., -1., -1.,  1.],
        [ 1., -1.,  1., -1.,  1., -1.,  1., -1.],
        [ 1.,  1.,  1., -1., -1.,  1., -1., -1.],
        [ 1., -1., -1.,  1.,  1.,  1., -1., -1.],
        [ 1.,  1., -1.,  1., -1., -1.,  1., -1.],
        [ 1., -1.,  1.,  1., -1., -1., -1.,  1.],
        [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]]))

See box 18.1 for the following steps

$$q_j = \frac{1}{2^{k-1}} \sum_{i=1}^{k-1}S_{ij}\bar{y}_i$$

Where $k=4$ and $S_{ij}$ is the $(i,j)$ entry in the `sign_table`.

In [8]:
for j in range(len(q)):
    q[j] = (1/(2**(4-1))) * np.sum([sign_table[i][j] * df.iloc[i].mean() for i in range(2**(4-1))])
q

array([ 5.50531999,  0.01418044,  0.07558305, -0.15042048, -0.17785526,
       -0.02743021, -0.06785684,  0.09849078])

To calculate the sum of squares for each effect (as well as SS0/q0)

$$SSj = 2^{k-1}rq_j^2$$ for $j=0,1, 2,\dots,2^{k-1}-1$

In [9]:
SS = np.zeros((2**(4-1)))
for j in range(len(SS)):
    SS[j] = (2**(4-1)) * repetitions * (q[j] ** 2)
SS

array([1.21234193e+03, 8.04339996e-03, 2.28511908e-01, 9.05052825e-01,
       1.26529980e+00, 3.00966629e-02, 1.84182005e-01, 3.88017326e-01])

Now for $SSY$, $SST$, and $SSE$
$$SSY=\sum_{i=1}^{2^{k-1}}\sum_{j=1}^{r}y_{ij}^2$$
$$SST = SSY - SS0$$
$$SSE = SST - \sum_{j=1}^{2^{k-1}-1}SSj$$

In [10]:
SSY = np.sum([np.sum([df.iloc[i, j]**2 for j in range(repetitions)]) for i in range(2**(4-1))])
SST = SSY - SS[0]
SSE = SST - np.sum([SS[j] for j in range(1, 2**(4-1))])
SSY, SST, SSE

(1230.6243935838156, 18.282467070079065, 15.273263145720748)

% of `y`'s variation explained by `j`th effect
$$(SSj/SST) \cdot 100\%$$

In [11]:
var_expl = np.zeros((2**(4-1)))
for j in range(2**(4-1)):
    var_expl[j] = (SS[j] / SST) * 100
[f"{round(x, 2)}%" for x in var_expl[1:]]  # just to pretty-print

['0.04%', '1.25%', '4.95%', '6.92%', '0.16%', '1.01%', '2.12%']

In [12]:
(SSE/SST)*100

83.54049312478645

Standard deviation of errors:
$$s_e = \sqrt{\frac{SSE}{2^{k-1}(r-1)}}$$

In [13]:
s_e = np.sqrt(SSE/(2**(4-1)*(repetitions-1)))
s_e

0.6908613994889086

In [14]:
s_qj = np.array([s_e / np.sqrt(2**(4-1)*repetitions)] * (2**(4-1)))
s_qj

array([0.10923478, 0.10923478, 0.10923478, 0.10923478, 0.10923478,
       0.10923478, 0.10923478, 0.10923478])

## Confidence Intervals
To find the confidence interval for effect $i$ the following equation is used:
$$q_i \pm t_{[1-\alpha/2; 2^{k-1}(r-1)]}s_{qi}$$

In [15]:
from scipy import stats

In [16]:
conf_interval = np.zeros_like((q))[1:]  # remove q0
dof = 2**(4-1)*(repetitions-1)
for i in range(len(conf_interval)):
    interval = stats.t.cdf(q[i+1], dof) * s_qj[i+1]
    conf_interval[i] = interval
[(round(q[i+1] - interval, 2), round(q[i+1] + interval, 2)) for i, interval in enumerate(conf_interval)]

[(-0.04, 0.07),
 (0.02, 0.13),
 (-0.2, -0.1),
 (-0.22, -0.13),
 (-0.08, 0.03),
 (-0.12, -0.02),
 (0.04, 0.16)]

## ANOVA

See table 22.5

In [17]:
MSE = s_e**2
# all degrees of freedoms of factors are 1 (since all levels are just 2), so MS is the same as SS
# just have to slice [1:] to get rid of the SS0 term
MS = np.copy(SS[1:])

F_vals = MS/MSE

p_vals = np.zeros_like(F_vals)
error_degree_of_freedom = 2**(4-1)*(repetitions-1)

for i in range(len(F_vals)):   
    p_vals[i] = 1-stats.f.cdf(F_vals[i], 1, error_degree_of_freedom)

print(f"MSE = {round(MSE, 2)}, dof for error = {error_degree_of_freedom}")
ANOVA = pd.DataFrame(np.stack((MS, F_vals, p_vals)).T, columns=["Mean Square", "F Computed", "p-values"])
ANOVA

MSE = 0.48, dof for error = 32


Unnamed: 0,Mean Square,F Computed,p-values
0,0.008043,0.016852,0.897524
1,0.228512,0.47877,0.49397
2,0.905053,1.896235,0.178058
3,1.2653,2.651011,0.113291
4,0.030097,0.063057,0.803333
5,0.184182,0.385892,0.538867
6,0.388017,0.81296,0.373983


Output ANOVA table

In [18]:
factor_interaction_labels = ['A', 'B', 'C', 'D', 'AB', 'AC', 'BC']
rounding = 2
with open('anova.csv', 'w', newline='') as csvfile:
    anovawriter = csv.writer(csvfile, delimiter=',')
    anovawriter.writerow(['Component', 'Sum of Squares', 'Percentage of Variation', 'Degrees of Freedom', 'Mean Square', 'F', 'p-value'])
    for i in range(len(sign[0])):
        anovawriter.writerow([f"{factor_interaction_labels[i]}: {all_names[i]}",
                              round(SS[i+1], rounding),round(var_expl[i+1], rounding),
                              1, round(MS[i], rounding),
                              round(F_vals[i], rounding), round(p_vals[i], rounding)])
        
    anovawriter.writerow(['Error', round(SSE, rounding), round((SSE/SST)*100, rounding),
                          error_degree_of_freedom, round(MSE, rounding), '', ''])
    anovawriter.writerow(['Total', round(SST, rounding), 100, 2**(4-1)*repetitions,'','',''])