In [1]:
import pandas as pd
import researchpy as rp
import seaborn as sns
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp

In [2]:
# load data
measurement = [
    32, 48, 538, 52, 244, 998, 59, 536, 1348,
    53, 81, 1901, 112, 776, 3621, 121, 1879, 4639,
    142, 197, 5609, 262, 2625, 10012, 980, 5698, 12880,
    49, 67, 789, 79, 390, 1373, 85, 814, 1693,
    100, 134, 3152, 164, 1255, 4912, 206, 3394, 5838,
    233, 350, 9100, 458, 3688, 13531, 1633, 10022, 17117,
    62, 100, 1103, 111, 480, 1782, 111, 839, 2190,
    96, 245, 2807, 237, 1502, 6007, 286, 3092, 7654,
    265, 2012, 12429, 517, 4870, 18602, 1728, 8834, 23134
]

# log-transform 
measurement = np.log10(measurement)

# factors and levels
algorithm = ['lruv', 'fifo', 'rand']
program = ['small', 'medium', 'large']
arrangement = ['group', 'freqy', 'alpha']
memory = ['24p', '20p', '16p']

factor1 = np.tile(np.repeat(memory,1), [27,1])
factor1 = factor1.reshape(factor1.shape[0]*factor1.shape[1],)

factor2 = np.tile(np.repeat(arrangement,3), [9,1])
factor2 = factor2.reshape(factor2.shape[0]*factor2.shape[1],)

factor3 = np.tile(np.repeat(program,9), [3,1])
factor3 = factor3.reshape(factor3.shape[0]*factor3.shape[1],)

factor4 = np.tile(np.repeat(algorithm,27), [1,1])
factor4 = factor4.reshape(factor4.shape[0]*factor4.shape[1],)


# construct dataframe
df = pd.DataFrame({
    "memory": factor1,
    "arrangement": factor2,
    "program": factor3,
    "algorithm": factor4,
    "measurement": measurement,
})

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   memory       81 non-null     object 
 1   arrangement  81 non-null     object 
 2   program      81 non-null     object 
 3   algorithm    81 non-null     object 
 4   measurement  81 non-null     float64
dtypes: float64(1), object(4)
memory usage: 3.3+ KB


In [4]:
df[:10]

Unnamed: 0,memory,arrangement,program,algorithm,measurement
0,24p,group,small,lruv,1.50515
1,20p,group,small,lruv,1.681241
2,16p,group,small,lruv,2.730782
3,24p,freqy,small,lruv,1.716003
4,20p,freqy,small,lruv,2.38739
5,16p,freqy,small,lruv,2.999131
6,24p,alpha,small,lruv,1.770852
7,20p,alpha,small,lruv,2.729165
8,16p,alpha,small,lruv,3.12969
9,24p,group,medium,lruv,1.724276


In [5]:
rp.summary_cat(df[["memory", "arrangement", "program", "algorithm"]])

Unnamed: 0,Variable,Outcome,Count,Percent
0,memory,20p,27,33.33
1,,16p,27,33.33
2,,24p,27,33.33
3,arrangement,freqy,27,33.33
4,,group,27,33.33
5,,alpha,27,33.33
6,program,medium,27,33.33
7,,small,27,33.33
8,,large,27,33.33
9,algorithm,fifo,27,33.33


In [6]:
rp.summary_cont(df["measurement"])





Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,measurement,81.0,2.903,0.7794,0.0866,2.7306,3.0753


In [7]:
one = "C(memory) + C(arrangement) + C(program) + C(algorithm) +"
two_1 = "C(memory) * C(arrangement) + C(memory) * C(program) + C(memory) * C(algorithm) + " 
two_2 = "C(arrangement) * C(program) + C(arrangement) * C(algorithm) + "
two_3 = "C(program) *  C(algorithm) + "
two = two_1 + two_2 + two_3
three_1 = "C(memory) * C(arrangement) * C(program) + "
three_2 = "C(memory) * C(arrangement) * C(algorithm) + " 
three_3 = "C(memory) * C(program) * C(algorithm) + "
three_4 = "C(arrangement) * C(program) * C(algorithm)"
three = three_1 + three_2 + three_3 + three_4
formula = "measurement ~ " +one + two + three

In [8]:
model = ols(formula, df).fit()

In [9]:
model.summary()

0,1,2,3
Dep. Variable:,measurement,R-squared:,0.999
Model:,OLS,Adj. R-squared:,0.993
Method:,Least Squares,F-statistic:,179.6
Date:,"Sun, 22 Nov 2020",Prob (F-statistic):,8.25e-16
Time:,21:44:23,Log-Likelihood:,172.19
No. Observations:,81,AIC:,-214.4
Df Residuals:,16,BIC:,-58.73
Df Model:,64,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.2474,0.058,72.975,0.000,4.124,4.371
C(memory)[T.20p],-0.2750,0.077,-3.568,0.003,-0.438,-0.112
C(memory)[T.24p],-1.0199,0.077,-13.232,0.000,-1.183,-0.857
C(arrangement)[T.freqy],-0.1038,0.077,-1.346,0.197,-0.267,0.060
C(arrangement)[T.group],-0.3147,0.077,-4.083,0.001,-0.478,-0.151
C(program)[T.medium],-0.4748,0.077,-6.160,0.000,-0.638,-0.311
C(program)[T.small],-1.0391,0.077,-13.481,0.000,-1.203,-0.876
C(algorithm)[T.lruv],-0.1214,0.077,-1.575,0.135,-0.285,0.042
C(algorithm)[T.rand],0.0867,0.077,1.125,0.277,-0.077,0.250

0,1,2,3
Omnibus:,6.021,Durbin-Watson:,3.325
Prob(Omnibus):,0.049,Jarque-Bera (JB):,9.608
Skew:,0.057,Prob(JB):,0.00819
Kurtosis:,4.683,Cond. No.,72.3


In [10]:
res = sm.stats.anova_lm(model, typ = 3)
res

Unnamed: 0,sum_sq,df,F,PR(>F)
Intercept,22.481423,1.0,5325.316967,1.2748249999999999e-21
C(memory),0.791398,2.0,93.731778,1.462402e-09
C(arrangement),0.073088,2.0,8.656386,0.002831873
C(program),0.769138,2.0,91.095321,1.804242e-09
C(algorithm),0.03107,2.0,3.679932,0.04843925
C(memory):C(arrangement),0.633938,4.0,37.541203,6.081775e-08
C(memory):C(program),0.128265,4.0,7.595717,0.001251148
C(memory):C(algorithm),0.009543,4.0,0.565137,0.6914797
C(arrangement):C(program),0.000937,4.0,0.055489,0.9936778
C(arrangement):C(algorithm),0.02577,4.0,1.526069,0.2418549
