In [13]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from itertools import combinations
import statsmodels.api as sm
import statsmodels.tools as st
from statsmodels.stats.outliers_influence import variance_inflation_factor
import copy

In [14]:
def import_data(experimental_file, response_file, response):
    """Load the design matrix and one response column, scaling the response to [-1, 1].

    Parameters
    ----------
    experimental_file : str
        Experimental design CSV filename, e.g. ``experimental.csv``. The first
        column is skipped; every remaining column becomes a regressor.
    response_file : str
        Results CSV filename, e.g. ``Response.csv``.
    response : str
        Name of the response column, e.g. ``rheomix final deg time min`` or
        ``rheomix stability time min``.

    Returns
    -------
    X : numpy.ndarray
        Values of every design column after the first.
    y_norm : list
        Response values rescaled linearly so the minimum maps to -1 and the
        maximum maps to 1.
    X_linear : numpy.ndarray
        The same array object as ``X``.
    experimental_df, response_df : pandas.DataFrame
        The two frames as read from disk.

    Raises
    ------
    ValueError
        If the response column is constant, because the min-max scaling would
        divide by zero.
    """
    experimental_df = pd.read_csv(experimental_file)
    response_df = pd.read_csv(response_file)

    X = experimental_df.iloc[:, 1:].values
    y = response_df[response].values
    max1 = max(y)
    min1 = min(y)

    # A zero range would silently turn every normalised value into NaN.
    if max1 == min1:
        raise ValueError(
            "response column %r is constant; cannot scale it to [-1, 1]" % (response,)
        )

    y_norm = [2*((i-min1)/(max1-min1)) - 1 for i in y]

    X_linear = X

    return X, y_norm, X_linear, experimental_df, response_df
    

In [15]:
def linear_fit(y, X_linear):
    """Fit an OLS model on the linear terms to obtain the benchmark AICc.

    Returns
    -------
    AIC_prev_min : float
        AICc value of the previous step in the stepwise regression. It is
        seeded one unit above the benchmark so that the caller's
        ``AIC_cur_min < AIC_prev_min`` loop condition holds on entry.
    AIC_cur_min : float
        AICc value of the current step, i.e. the benchmark of this fit.
    """
    fitted = sm.OLS(y, X_linear).fit()
    benchmark = st.eval_measures.aicc(fitted.llf, fitted.nobs, fitted.df_model)

    AIC_prev_min = benchmark + 1
    AIC_cur_min = benchmark
    return AIC_prev_min, AIC_cur_min

In [16]:
def model_type_func(array_, indexs):
    """Build the column for a non-linear candidate term.

    ``indexs`` must hold two entries. The first is a column index into
    ``array_``; the second is either the string ``'inv'`` (reciprocal of the
    column), the string ``'log'`` (natural log of the column) or a second
    column index (element-wise product of the two columns).

    Returns ``None`` when ``indexs`` does not have exactly two entries.
    """
    if len(indexs) != 2:
        return None

    second = indexs[1]
    base_column = array_[:, indexs[0]]

    if second == 'inv':
        return 1/base_column
    if second == 'log':
        return np.log(base_column)

    # Anything else is treated as a second column index: interaction term.
    return base_column*array_[:, second]

    
def model_terms_name(list_, terms, indexs):
    """Append a labelled candidate term to ``list_`` in place.

    One name in ``terms`` with ``indexs[1]`` equal to ``'inv'`` or ``'log'``
    appends ``['1/<name>', indexs[0], 'inv']`` or
    ``['log(<name>)', indexs[0], 'log']``. Two names append
    ``['<a>*<b>', indexs[0], indexs[1]]``. Any other input appends nothing.
    """
    n_names = len(terms)

    if n_names == 1:
        kind = indexs[1]
        name = terms[0]
        if kind == 'inv':
            list_.append(['1/' + name, indexs[0], kind])
        elif kind == 'log':
            list_.append(['log(' + name + ')', indexs[0], kind])

    elif n_names == 2:
        label = terms[0] + '*' + terms[1]
        list_.append([label, indexs[0], indexs[1]])

In [17]:
def model_terms_list(experimental_df, response_df, inv_log, linear_terms):
    """Build the starting model terms and the pool of candidate terms.

    Returns
    -------
    model_terms : list
        One ``[name, key]`` entry per linear term, where ``key`` is the term's
        position in ``linear_terms``.
    poss_terms : list
        Candidate terms that may be added: every pairwise interaction of the
        linear terms, followed by one transform term per linear term when
        ``inv_log`` is ``'log'`` or ``'inv'``.

    ``experimental_df`` and ``response_df`` are accepted but not used here.
    """
    model_terms = [[name, key] for key, name in enumerate(linear_terms)]

    poss_terms = []
    for first, second in combinations(range(len(linear_terms)), 2):
        model_terms_name(
            poss_terms,
            [linear_terms[first], linear_terms[second]],
            [first, second],
        )

    if inv_log in ('log', 'inv'):
        for key, name in enumerate(linear_terms):
            model_terms_name(poss_terms, [name], [key, inv_log])

    return model_terms, poss_terms

In [18]:
def X_gen(model, X_linear):
    """Assemble the regressor matrix for the terms listed in ``model``.

    Parameters
    ----------
    model : list
        Term descriptors. ``[name, k]`` selects column ``k`` of ``X_linear``;
        ``[name, k1, k2]`` yields the element-wise product of columns ``k1``
        and ``k2``.
    X_linear : numpy.ndarray
        Two-dimensional array holding the linear-term columns.

    Returns
    -------
    numpy.ndarray
        A one-dimensional column when ``model`` has a single term, otherwise a
        two-dimensional array with one column per term, in ``model`` order.

    Raises
    ------
    ValueError
        If ``model`` is empty or a descriptor has an unsupported length.
    """
    columns = []
    for term in model:
        if len(term) == 2:
            columns.append(X_linear[:, term[1]])
        elif len(term) == 3:
            columns.append(X_linear[:, term[1]]*X_linear[:, term[2]])
        else:
            raise ValueError("unsupported model term descriptor: %r" % (term,))

    if not columns:
        raise ValueError("model must contain at least one term")

    # A single-term model keeps its original 1-D shape for existing callers.
    if len(columns) == 1:
        return columns[0]

    return np.column_stack(columns)

In [28]:
def model_fit(experimental_file, response_file, response, inv_log, linear_terms, model_terms1, cond_limit, VIF_limit):
    """Run a forward/backward stepwise OLS selection driven by AICc.

    Starting from the linear terms in ``model_terms1``, each pass of the loop
    first tries to add the single candidate term that lowers AICc the most,
    then tries to remove the single term whose removal lowers AICc the most.
    The loop stops when a pass does not lower AICc.

    Parameters
    ----------
    experimental_file, response_file, response
        Forwarded to ``import_data``.
    inv_log
        ``'inv'`` or ``'log'`` to include inverse or log transform candidates;
        any other value (e.g. ``None``) includes only interaction candidates.
    linear_terms : list
        Names of the linear terms in the starting model.
    model_terms1 : list
        ``[name, column_index]`` descriptors used to pick the starting columns
        out of the full design matrix.
    cond_limit, VIF_limit
        A candidate is only added when the fitted condition number and the
        largest VIF are strictly below these limits. Removals are judged on
        AICc alone.

    Returns
    -------
    tuple
        ``(AICc_final, final, model_terms, X, y, poss_terms, vif_max1)``: the
        final AICc, the fitted results object, the selected term descriptors,
        the matching regressor array, the normalised response, the remaining
        candidate terms and the largest VIF recorded for the final model.
    """
    _, y, X_linear1, experimental_df, response_df = import_data(experimental_file, response_file, response)
    X = X_gen(model_terms1, X_linear1)

    # X_gen returns a 1-D array for a single starting term; candidate columns
    # are looked up with 2-D indexing, so keep a 2-D view for those lookups.
    X_linear = X if X.ndim > 1 else X[:, np.newaxis]

    model_terms, poss_terms = model_terms_list(experimental_df, response_df, inv_log, linear_terms)

    AIC_prev_min, AIC_cur_min = linear_fit(y, X_linear)

    cnt_start = 0
    while AIC_cur_min < AIC_prev_min:

        AIC_prev_min = AIC_cur_min

        # ---- forward step: find the best single term to add ----
        cnt1 = 0

        # Sorted index lists of the non-transform terms currently in the model.
        # Built by filtering rather than deleting while iterating, which used
        # to skip the second of two adjacent transform terms.
        TEST_MODEL = [sorted(x[1:]) for x in model_terms if x[-1] != 'log' and x[-1] != 'inv']

        for i in poss_terms:

            if i[-1] == 'log' or i[-1] == 'inv':
                hierachy_compl = 1

            else:
                # Every lower-order sub-term of the candidate must already be
                # in the model.
                TEST_TERM = i[1:]
                TEST_TERMS = []
                for k in range(len(TEST_TERM)-1):
                    for l in combinations(TEST_TERM, k+1):
                        TEST_TERMS.append(sorted(l))

                hierachy_compl = 0
                if all(item in TEST_MODEL for item in TEST_TERMS):
                    hierachy_compl = 1

            if hierachy_compl == 1:

                if len(i) == 2:
                    # A linear term that was removed earlier and is a
                    # candidate again.
                    j = i[1]
                    add_term_cur = X_linear[:, j]

                else:
                    add_term_cur = model_type_func(X_linear, i[1:])

                X_new = np.column_stack((X, add_term_cur))
                new_model = sm.OLS(y, X_new)
                new_results = new_model.fit()
                AIC = st.eval_measures.aicc(new_results.llf, new_results.nobs, new_results.df_model)
                COND = new_results.condition_number

                variables = new_model.exog
                vif = [variance_inflation_factor(variables, m) for m in range(variables.shape[1])]
                vif_max = max(vif)

                if AIC < AIC_cur_min and COND < cond_limit and vif_max < VIF_limit:

                    AIC_cur_min = AIC
                    X_updated = X_new
                    term_key = i
                    results = new_results
                    cnt1 = 1
                    vif2 = vif_max

        if AIC_cur_min < AIC_prev_min and cnt1 == 1:

            model_terms.append(term_key)
            X = X_updated
            poss_terms.remove(term_key)
            final = results
            cnt_start = 1
            vif_max1 = vif2

        # ---- backward step: find the best single term to remove ----
        cnt2 = 0
        for i, j in enumerate(model_terms):
            if len(model_terms) > 1:
                if j[-1] == 'log' or j[-1] == 'inv':
                    hierachy_max = 1

                else:

                    # A term may not be removed while a longer descriptor in
                    # the model shares one of its entries.
                    hierachy_max = 1
                    for k in model_terms:
                        if len(k) > len(j):
                            if any(x in j for x in k):
                                hierachy_max = 0

                if hierachy_max == 1:

                    X_new = np.delete(X, i, axis = 1)

                    new_model = sm.OLS(y, X_new)
                    new_results = new_model.fit()
                    AIC = st.eval_measures.aicc(new_results.llf, new_results.nobs, new_results.df_model)
                    COND = new_results.condition_number
                    if len(model_terms) > 2:
                        variables = new_model.exog
                        vif = [variance_inflation_factor(variables, m) for m in range(variables.shape[1])]
                        vif_max = max(vif)
                    else:
                        # Only one regressor would remain; VIF is recorded as 0.
                        vif_max = 0

                    if AIC < AIC_cur_min:

                        AIC_cur_min = AIC
                        X_updated = X_new
                        term_key = j

                        sol_results = new_results
                        cnt2 = 1
                        vif2 = vif_max

        if AIC_cur_min < AIC_prev_min and cnt2 == 1:

            model_terms.remove(term_key)
            X = X_updated
            poss_terms.append(term_key)
            final = sol_results
            cnt_start = 1
            vif_max1 = vif2

    if cnt_start == 0:
        # No step was accepted: report the starting linear model.
        new_model = sm.OLS(y, X_linear)
        final = new_model.fit()
        AIC_cur_min = st.eval_measures.aicc(final.llf, final.nobs, final.df_model)
        if len(model_terms) != 1:
            variables = new_model.exog
            vif = [variance_inflation_factor(variables, m) for m in range(variables.shape[1])]
            vif_max1 = max(vif)
        else:
            vif_max1 = 0

    AICc_final = AIC_cur_min
    return AICc_final, final, model_terms, X, y, poss_terms, vif_max1

In [29]:
def model_iteration(experimental_file, response_file, response, inv_log, cond_limit, VIF_limit):
    """Search every non-empty subset of linear terms for the lowest-AICc model.

    Each subset of the design's columns (all columns after the first) is used
    as the starting model for ``model_fit``. The result with the lowest AICc
    whose condition number and VIF are strictly below the given limits is kept.

    Parameters
    ----------
    experimental_file, response_file, response, inv_log
        Forwarded to ``model_fit``.
    cond_limit, VIF_limit
        Upper limits (exclusive) on the final model's condition number and VIF.

    Returns
    -------
    tuple
        ``(AICc_final, final1_final, terms_final, X_final, y_final, poss_final)``
        for the best model found.

    Raises
    ------
    ValueError
        If no starting subset produces a model within both limits, including
        the case where the design file has no term columns.
    """
    experimental_df = pd.read_csv(experimental_file)
    lin_terms = experimental_df.columns.values.tolist()[1:]

    # Start from +inf so that any finite AICc can be selected.
    AICc_prev = float('inf')
    best = None
    for size in range(1, len(lin_terms) + 1):
        for subset in combinations(lin_terms, size):
            linear_terms = list(subset)
            # Keys are positions in the full term list, as X_gen expects.
            model_terms1 = [[name, lin_terms.index(name)] for name in linear_terms]

            AICc, final1, terms, X, y, poss, VIF = model_fit(experimental_file, response_file, response, inv_log, linear_terms, model_terms1, cond_limit, VIF_limit)

            if AICc < AICc_prev and final1.condition_number < cond_limit and VIF < VIF_limit:
                AICc_prev = AICc
                best = (AICc, final1, terms, X, y, poss)

    if best is None:
        raise ValueError(
            "no candidate model satisfied cond_limit=%r and VIF_limit=%r" % (cond_limit, VIF_limit)
        )

    return best

In [35]:
%%time

'model_fit(experimental_file, response_file, response, inv_log, cond_limit, VIF_limit)'

'repsonse = rheomix final deg time min or rheomix stability time min'
test1 = 'rheomix final deg time min'
test2 = 'rheomix stability time min'
cond_limit = 1500 #conditioning number limit
VIF_limit = 100 #VIF limit

'inv_log = inv, log or None to add inverse terms, log terms or neither'

AICc, final, terms, X, y, poss = model_iteration('experimental.csv', 'Response.csv', test1,  None, cond_limit, VIF_limit)
AICc, final.summary(), terms, final.params, poss

Wall time: 4.25 s


(16.53370090020981,
 <class 'statsmodels.iolib.summary.Summary'>
 """
                                  OLS Regression Results                                
 Dep. Variable:                      y   R-squared (uncentered):                   0.886
 Model:                            OLS   Adj. R-squared (uncentered):              0.866
 Method:                 Least Squares   F-statistic:                              43.51
 Date:                Sun, 24 Oct 2021   Prob (F-statistic):                    1.76e-16
 Time:                        17:49:47   Log-Likelihood:                         0.20683
 No. Observations:                  46   AIC:                                      13.59
 Df Residuals:                      39   BIC:                                      26.39
 Df Model:                           7                                                  
 Covariance Type:            nonrobust                                                  
                  coef    std err       