In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from itertools import combinations
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import time
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def import_data(experimental_file, response_file, response):
    """Load the design matrix and one response column, scaling the response to [-1, 1].

    Parameters
    ----------
    experimental_file : str
        Experimental design CSV filename, e.g. ``experimental.csv``.  The
        first column is skipped and every remaining column becomes a linear
        term (presumably the first column is a run label/index -- confirm
        against the data file).
    response_file : str
        Results CSV filename, e.g. ``Response.csv``.
    response : str
        Name of the response column, e.g. ``rheomix final deg time min`` or
        ``rheomix stability time min``.

    Returns
    -------
    y_norm : list
        Response values min-max scaled so the minimum maps to -1 and the
        maximum to +1.
    X_linear : numpy.ndarray
        Values of the design columns (all columns except the first).
    linear_terms : list of str
        Column names matching the columns of ``X_linear``.
    experimental_df, response_df : pandas.DataFrame
        The raw frames as read from disk.

    Raises
    ------
    ValueError
        If the response column is empty or constant; min-max scaling is
        undefined in that case (0/0 would otherwise silently yield NaNs).
    """
    experimental_df = pd.read_csv(experimental_file)
    response_df = pd.read_csv(response_file)

    # Everything after the first column is treated as a predictor.
    linear_terms = experimental_df.columns.values.tolist()[1:]
    X_linear = experimental_df[linear_terms].values

    y = response_df[response].values
    if len(y) == 0:
        raise ValueError("response column %r contains no values" % (response,))

    low = y.min()
    span = y.max() - low
    if span == 0:
        raise ValueError(
            "response column %r is constant; cannot scale it to [-1, 1]" % (response,)
        )

    # Same arithmetic as the element-wise formula 2*((v - min)/(max - min)) - 1;
    # returned as a list to keep the original return type.
    y_norm = list(2 * ((y - low) / span) - 1)

    return y_norm, X_linear, linear_terms, experimental_df, response_df
    

In [3]:
def X_gen(model, X_linear):
    """Build the regressor matrix for ``model`` from the linear design columns.

    Parameters
    ----------
    model : sequence of terms
        Each term is either ``[name, col]`` (a linear term: column ``col`` of
        ``X_linear``) or ``[name, col_a, col_b]`` (an interaction term: the
        element-wise product of columns ``col_a`` and ``col_b``).  Terms may
        appear in any order.
    X_linear : numpy.ndarray
        2-D array holding one column per linear term.

    Returns
    -------
    numpy.ndarray
        One column per term, in the order given by ``model``.  A model with a
        single term yields a 1-D array (kept for backward compatibility with
        the original single-linear-term behaviour); otherwise the result is 2-D.

    Raises
    ------
    ValueError
        If ``model`` is empty or a term has neither 2 nor 3 elements.
    """
    if len(model) == 0:
        raise ValueError("model must contain at least one term")

    columns = []
    for term in model:
        if len(term) == 2:
            columns.append(X_linear[:, term[1]])
        elif len(term) == 3:
            columns.append(X_linear[:, term[1]] * X_linear[:, term[2]])
        else:
            raise ValueError(
                "each term must be [name, col] or [name, col_a, col_b]; got %r" % (term,)
            )

    if len(columns) == 1:
        return columns[0]

    # Stack once at the end instead of re-copying the matrix for every term.
    return np.column_stack(columns)

In [4]:
# Wall-clock timestamp taken when this cell is executed.
# NOTE(review): `start` is not read anywhere in the visible notebook -- confirm
# whether an elapsed-time report was intended after the model search.
start = time.time()


# linear_terms = experimental_df.columns.values.tolist()[1:]



def _model_label(model):
    """Join the term names of ``model`` into the '--A--B--' label used in the full table."""
    return '--' + ''.join(term[0] + '--' for term in model)


def _candidate_models(lin_terms):
    """Yield every candidate model: each non-empty subset of linear terms, alone and
    then extended by every non-empty subset of pairwise interactions among those terms."""
    for size in range(1, len(lin_terms) + 1):
        for chosen in combinations(lin_terms, size):
            chosen = list(chosen)
            yield list(chosen)

            # Interactions are only formed between linear terms present in the model.
            interactions = [[a[0] + '*' + b[0], a[1], b[1]]
                            for a, b in combinations(chosen, 2)]
            for n_extra in range(1, len(interactions) + 1):
                for extra in combinations(interactions, n_extra):
                    yield chosen + list(extra)


def _evaluate_model(model, y, X_linear, AIC_lim, CN_lim, VIF_lim):
    """Fit one candidate by OLS; return (results, summary_row, filtered_row).

    ``filtered_row`` is None unless the fit passes the AIC, condition-number and
    VIF limits.
    """
    # No constant column is added, so the regression has no intercept term.
    model_fit = sm.OLS(y, X_gen(model, X_linear))
    results = model_fit.fit()

    summary_row = {'AIC': results.aic, 'Cond_No': results.condition_number,
                   'Model': _model_label(model),
                   'r2': results.rsquared, 'No_terms': len(model)}

    filtered_row = None
    if results.aic < AIC_lim and results.condition_number < CN_lim:
        exog = model_fit.exog
        if len(model) != 1:
            vif_max = max(variance_inflation_factor(exog, col)
                          for col in range(exog.shape[1]))
        else:
            # A single regressor has nothing to be collinear with.
            vif_max = 0

        # Compare the VIF of *this* model (the original tested a possibly stale
        # or undefined `vif` list here).
        if vif_max < VIF_lim:
            filtered_row = {'AIC': results.aic, 'Cond_No': results.condition_number,
                            'Model': model,
                            'r2': results.rsquared, 'No_terms': len(model),
                            'VIF': vif_max}

    return results, summary_row, filtered_row


def fit_model(experimental_file, response_file, response, AIC_lim, CN_lim, VIF_lim=40):
    """Exhaustively fit OLS models over linear terms and their pairwise interactions.

    Parameters
    ----------
    experimental_file, response_file, response
        Passed straight to ``import_data``.
    AIC_lim : float
        A model enters ``model_subset`` only if its AIC is below this value.
    CN_lim : float
        A model enters ``model_subset`` only if its condition number is below
        this value.
    VIF_lim : float, optional
        A model enters ``model_subset`` only if its largest variance inflation
        factor is below this value (default 40, the previously hard-coded limit).

    Returns
    -------
    final_model : list or None
        ``[model, results, AIC]`` for the lowest-AIC model over all candidates,
        or None if no candidate produced a comparable AIC.
    total_subset : list of dict
        One row per fitted model with keys 'AIC', 'Cond_No', 'Model' (a
        '--A--B--' style label), 'r2' and 'No_terms'.
    model_subset : list of dict
        Rows for models passing all three limits; 'Model' holds the term list
        itself and an extra 'VIF' key holds the largest VIF.

    Notes
    -----
    The number of candidates grows combinatorially with the number of design
    columns, so this is only practical for small designs.
    """
    y, X_linear, column_names, _, _ = import_data(experimental_file, response_file, response)
    lin_terms = [[name, idx] for idx, name in enumerate(column_names)]

    total_subset = []
    model_subset = []
    final_model = None
    best_aic = float('inf')  # any finite AIC beats this, unlike a fixed sentinel

    for model in _candidate_models(lin_terms):
        results, summary_row, filtered_row = _evaluate_model(
            model, y, X_linear, AIC_lim, CN_lim, VIF_lim)

        total_subset.append(summary_row)
        if filtered_row is not None:
            model_subset.append(filtered_row)

        if results.aic < best_aic:
            best_aic = results.aic
            final_model = [model, results, results.aic]

    return final_model, total_subset, model_subset


In [5]:
# --- Input files -----------------------------------------------------------
experimental_file = 'experimental.csv'
response_file = 'Response.csv'

# --- Response column to model (switch between test1 / test2) ----------------
test1 = 'rheomix final deg time min'
test2 = 'rheomix stability time min'
response = test1

# --- Screening limits for the filtered model subset --------------------------
AIC_lim, CN_lim = 20, 600

# Run the exhaustive search; later cells read `total_subset`, `model_subset`
# and `response`.
final_model, total_subset, model_subset = fit_model(
    experimental_file=experimental_file,
    response_file=response_file,
    response=response,
    AIC_lim=AIC_lim,
    CN_lim=CN_lim,
)

In [6]:
# Output filenames are keyed by the response name.  The "Toatal" spelling is
# kept as-is because the filename may be consumed elsewhere.
total_csv = "Toatal_model_subset_" + response + ".csv"
limited_csv = "limited_model_subset_" + response + ".csv"

# Every fitted model.
df = pd.DataFrame(total_subset)
df.to_csv(total_csv)

# Only the models that passed the AIC / condition-number / VIF screening.
dff = pd.DataFrame(model_subset)
dff.to_csv(limited_csv)

In [7]:
# plt.plot(CN_hist, AIC_hist, 'r*')
# plt.axis([0, 2000, 12, 20])
# plt.xlabel('CN')
# plt.ylabel('AIC')
# plt.show()

In [8]:
# plt.plot(VIF_max_hist, AIC_hist1, 'r*')
# plt.show()

In [9]:
# plt.scatter(CN_hist1, AIC_hist1, c = VIF_max_hist, vmin=0, vmax=50)
# # plt.axis([0, 1000, 12, 20])
# plt.xlabel('CN')
# plt.ylabel('AIC')
# plt.colorbar()
# # plt.set_label('VIF')
# plt.show()

In [10]:
# from mpl_toolkits.mplot3d import Axes3D

# fig = plt.figure()
# ax = Axes3D(fig)

# ax.scatter(CN_hist1,VIF_max_hist, AIC_hist1)
# plt.xlabel('CN')
# plt.ylabel('VIF')
# ax.set_zlabel('AIC')
# # plt.axis([0, 1000, 12, 18])
# # plt.colorbar()
# plt.show()

In [11]:
# len(model_subset)

In [12]:
# for i in model_subset:
#     if i[2] < 550 and i[2] > 450:
#         a = []
#         for j in i[0]:
#             a.append(j[0])
#         print('----model:', a, 'AIC', round(i[1], 3), 'CN:', round(i[2], 3), 'vif_max:', round(i[3], 3), 'R2:', round(i[4], 3), '----'
#              )
        