In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, ShuffleSplit
import numpy as np
from itertools import combinations
import statsmodels.api as sm
import statsmodels.tools as st
from statsmodels.stats.outliers_influence import variance_inflation_factor
import time
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def import_data(experimental_file, response_file, response):
    """Load the experimental design and one response column from CSV files.

    The selected response is rescaled linearly onto [-1, 1]: its minimum maps
    to -1 and its maximum to 1.

    Parameters
    ----------
    experimental_file : str
        Experimental design CSV filename, e.g. ``experimental.csv``. The first
        column is skipped; every remaining column is used as a linear term.
    response_file : str
        Results CSV filename, e.g. ``Response.csv``.
    response : str
        Name of the response column, e.g. ``rheomix final deg time min`` or
        ``rheomix stability time min``.

    Returns
    -------
    y_norm : list
        Response values rescaled to [-1, 1], in file order.
    X_linear : numpy.ndarray
        Values of the design columns (all columns except the first).
    linear_terms : list of str
        Names of the columns held in ``X_linear``, in the same order.
    experimental_df : pandas.DataFrame
        The experimental design file as read.
    response_df : pandas.DataFrame
        The response file as read.

    Raises
    ------
    ValueError
        If every response value is identical, because the rescaling would
        divide by zero and silently produce NaN.
    """
    experimental_df = pd.read_csv(experimental_file)
    response_df = pd.read_csv(response_file)

    # Every column except the first one is a model factor.
    linear_terms = experimental_df.columns.values.tolist()[1:]
    X_linear = experimental_df[linear_terms].values

    y = response_df[response].values
    y_min = min(y)
    y_max = max(y)
    span = y_max - y_min
    if span == 0:
        raise ValueError(
            "response %r is constant (%r); it cannot be rescaled to [-1, 1]"
            % (response, y_min)
        )

    # Same arithmetic as the element-wise form 2*((v - min)/(max - min)) - 1.
    y_norm = list(2 * ((y - y_min) / span) - 1)

    return y_norm, X_linear, linear_terms, experimental_df, response_df
    

In [3]:
def X_gen(model, X_linear):
    """Build the design matrix for ``model`` from the columns of ``X_linear``.

    Parameters
    ----------
    model : sequence of terms
        Each term is either ``[name, i]`` (linear term: column ``i``) or
        ``[name, i, j]`` (interaction: element-wise product of columns ``i``
        and ``j``). The name is not used here.
    X_linear : numpy.ndarray
        2-D array whose columns are the linear factors.

    Returns
    -------
    numpy.ndarray
        For a single-term model, the 1-D column itself; otherwise a 2-D array
        with one column per term, in the order given by ``model``.

    Raises
    ------
    ValueError
        If ``model`` is empty or a term does not have 2 or 3 entries.
    """
    if len(model) == 0:
        raise ValueError("model must contain at least one term")

    columns = []
    for term in model:
        if len(term) == 2:
            columns.append(X_linear[:, term[1]])
        elif len(term) == 3:
            columns.append(X_linear[:, term[1]] * X_linear[:, term[2]])
        else:
            raise ValueError(
                "term %r must be [name, i] or [name, i, j]" % (term,)
            )

    # A one-term model keeps the 1-D shape callers already reshape themselves.
    if len(columns) == 1:
        return columns[0]
    return np.column_stack(columns)

In [4]:
# Wall-clock timestamp taken when this cell is executed.
# NOTE(review): `start` is not read anywhere in the visible notebook.
start = time.time()


# linear_terms = experimental_df.columns.values.tolist()[1:]



def _candidate_models(lin_terms):
    """Yield every candidate model built from ``lin_terms``.

    For each non-empty subset of linear terms (smallest subsets first) the
    subset alone is yielded, followed by that subset extended with every
    non-empty combination of pairwise interactions among its own members.
    The number of candidates grows combinatorially with ``len(lin_terms)``.
    """
    for size in range(1, len(lin_terms) + 1):
        for subset in combinations(lin_terms, size):
            base = list(subset)
            yield list(base)

            # Interaction term layout: [name_i*name_j, column_i, column_j].
            poss_terms = [
                [first[0] + '*' + second[0], first[1], second[1]]
                for first, second in combinations(base, 2)
            ]
            for n_extra in range(1, len(poss_terms) + 1):
                for extra in combinations(poss_terms, n_extra):
                    yield base + list(extra)


def _evaluate_model(model, X_linear, y, y_array, k_folds):
    """Fit one candidate model and return ``(record, results, aicc, av_score)``.

    ``record`` is the metrics dictionary stored in the full model table,
    ``results`` the fitted statsmodels OLS results, ``aicc`` the corrected AIC
    and ``av_score`` the mean cross-validation score.
    """
    X = X_gen(model, X_linear)

    # No constant column is added, so the OLS fit has no intercept.
    results = sm.OLS(y, X).fit()
    aicc = st.eval_measures.aicc(results.llf, results.nobs, results.df_model)

    # scikit-learn needs a 2-D design matrix; single-term models come back 1-D.
    X_cv = X.reshape(len(X), 1) if X.ndim == 1 else X

    # ShuffleSplit draws k_folds independent random splits, each holding out a
    # 1/k_folds fraction. This is repeated random sub-sampling, not a strict
    # K-fold partition, even though the result is reported under 'Kfold'.
    my_cv = ShuffleSplit(n_splits=k_folds, test_size=1/k_folds, random_state=0)
    score = cross_val_score(LinearRegression(fit_intercept=False), X_cv, y_array, cv=my_cv)
    av_score = sum(score)/len(score)

    model_name = '--' + ''.join(term[0] + '--' for term in model)

    record = {'AIC': results.aic, 'AICc': aicc, 'BIC': results.bic,
              'Cond_No': results.condition_number, 'Model': model_name,
              'r2': results.rsquared, 'No_terms': len(model), 'Kfold': av_score}
    return record, results, aicc, av_score


def fit_model(experimental_file, response_file, response, k_folds):
    """Exhaustively fit all linear and pairwise-interaction models and pick the best.

    Every candidate is fitted with intercept-free OLS and scored both by
    corrected AIC and by the mean cross-validation score of an intercept-free
    ``LinearRegression`` (default scorer).

    Parameters
    ----------
    experimental_file : str
        Experimental design CSV filename, passed to ``import_data``.
    response_file : str
        Results CSV filename, passed to ``import_data``.
    response : str
        Name of the response column to model.
    k_folds : int
        Number of random splits; each split holds out ``1/k_folds`` of the data.

    Returns
    -------
    final_model : list or None
        ``[model, results, AICc]`` for the candidate with the lowest AICc.
    total_subset : list of dict
        One metrics record per evaluated candidate, in evaluation order.
    final_model_kfold : list or None
        ``[model, results, mean_score]`` for the candidate with the highest
        mean cross-validation score.

    Notes
    -----
    Ties keep the candidate evaluated first. The running bests start at
    +/- infinity so that a best model is still reported when every AICc
    exceeds 1000 or every cross-validation score is <= 0. An entry is ``None``
    only if no candidate produced a comparable (non-NaN) value.
    """
    y, X_linear, linear_terms, _, _ = import_data(experimental_file, response_file, response)
    y1 = np.array(y)

    # Pair every factor name with its column index in X_linear.
    lin_terms = [[term, idx] for idx, term in enumerate(linear_terms)]

    total_subset = []
    final_model = None
    final_model_kfold = None
    best_aicc = float('inf')
    best_score = float('-inf')

    for model in _candidate_models(lin_terms):
        record, results, aicc, av_score = _evaluate_model(model, X_linear, y, y1, k_folds)
        total_subset.append(record)

        if aicc < best_aicc:
            best_aicc = aicc
            final_model = [model, results, aicc]

        if best_score < av_score:
            best_score = av_score
            final_model_kfold = [model, results, av_score]

    return final_model, total_subset, final_model_kfold


In [5]:
# Input files: the experimental design and the measured responses.
experimental_file = 'experimental.csv'
response_file = 'Response.csv'


# Names of the two available response columns; `response` selects the one to model.
test1 = 'rheomix final deg time min'
test2 = 'rheomix stability time min'
response = test1
# Number of cross-validation splits passed to fit_model.
k_folds = 10


# Runs the exhaustive model search; later cells read all three results by name.
final_model, total_subset, final_model_kfold = fit_model(experimental_file, response_file, response, k_folds)

In [6]:
# Collect the metrics of every evaluated candidate model, one row per model.
df = pd.DataFrame(total_subset)
# The output file name embeds the number of CV splits and the response name.
df.to_csv("FINAL_K=" + str(k_folds) + '_Total_model_subset_' + response + ".csv")


In [7]:
# Display the model selected by lowest AICc and the model selected by highest
# mean cross-validation score, each followed by its OLS summary table.
final_model, final_model[1].summary(), final_model_kfold, final_model_kfold[1].summary()

([[['Xpvc', 0],
   ['Xfiller', 1],
   ['Xstabiliser', 3],
   ['Xdinp', 4],
   ['Xldh', 5],
   ['Xstabiliser*Xldh', 3, 5],
   ['Xdinp*Xldh', 4, 5]],
  <statsmodels.regression.linear_model.RegressionResultsWrapper at 0xa14c9b5dd8>,
  16.53370090020981],
 <class 'statsmodels.iolib.summary.Summary'>
 """
                                  OLS Regression Results                                
 Dep. Variable:                      y   R-squared (uncentered):                   0.886
 Model:                            OLS   Adj. R-squared (uncentered):              0.866
 Method:                 Least Squares   F-statistic:                              43.51
 Date:                Sun, 17 Nov 2019   Prob (F-statistic):                    1.76e-16
 Time:                        07:18:37   Log-Likelihood:                         0.20683
 No. Observations:                  46   AIC:                                      13.59
 Df Residuals:                      39   BIC:                              

In [8]:
# plt.plot(CN_hist, AIC_hist, 'r*')
# plt.axis([0, 2000, 12, 20])
# plt.xlabel('CN')
# plt.ylabel('AIC')
# plt.show()

In [9]:
# plt.plot(VIF_max_hist, AIC_hist1, 'r*')
# plt.show()

In [10]:
# plt.scatter(CN_hist1, AIC_hist1, c = VIF_max_hist, vmin=0, vmax=50)
# # plt.axis([0, 1000, 12, 20])
# plt.xlabel('CN')
# plt.ylabel('AIC')
# plt.colorbar()
# # plt.set_label('VIF')
# plt.show()

In [11]:
# from mpl_toolkits.mplot3d import Axes3D

# fig = plt.figure()
# ax = Axes3D(fig)

# ax.scatter(CN_hist1,VIF_max_hist, AIC_hist1)
# plt.xlabel('CN')
# plt.ylabel('VIF')
# ax.set_zlabel('AIC')
# # plt.axis([0, 1000, 12, 18])
# # plt.colorbar()
# plt.show()

In [12]:
# len(model_subset)

In [13]:
# for i in model_subset:
#     if i[2] < 550 and i[2] > 450:
#         a = []
#         for j in i[0]:
#             a.append(j[0])
#         print('----model:', a, 'AIC', round(i[1], 3), 'CN:', round(i[2], 3), 'vif_max:', round(i[3], 3), 'R2:', round(i[4], 3), '----'
#              )
        

# 