In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#model
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

from sklearn.model_selection import KFold
from sklearn.inspection import permutation_importance

from sklearn import svm
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.augmentations import RegressionSMOTE
from sklearn.neural_network import MLPRegressor
from lce import LCERegressor

# Global matplotlib defaults for every figure drawn in this notebook:
# 'Times New Roman' becomes the only entry of the sans-serif font list, and
# the Unicode minus glyph is disabled for axis text.
plt.rcParams.update({
    'font.sans-serif': ['Times New Roman'],
    'axes.unicode_minus': False,
})

In [None]:
def splitlabel(data,input,output):
    """Split a 2-D array column-wise into model inputs and target.

    Parameters
    ----------
    data : 2-D array, one row per sample.
    input : column index or collection of column indices used as inputs.
    output : column index (or indices) used as the target.

    Returns
    -------
    tuple ``(X, y)`` holding ``data[:, input]`` and ``data[:, output]``.
    """
    features = data[:,input]
    target = data[:,output]
    return features,target

def average_choose_data(data_x,data_y,dividearr,divide_n,aver_num,seed=None):
    """Draw the same number of samples from each target-value bin.

    Parameters
    ----------
    data_x : 2-D array, one row per sample.
    data_y : 1-D array of targets aligned with the rows of ``data_x``.
    dividearr : sequence of bin edges. Bin ``ii`` covers
        ``dividearr[ii] <= y <= dividearr[ii+1]`` (both edges inclusive, so a
        sample lying exactly on an inner edge belongs to two bins).
    divide_n : number of bins to sample; requires ``divide_n + 1`` edges.
    aver_num : number of samples drawn, without replacement, from every bin.
    seed : optional seed. When given, NumPy's global RNG is re-seeded with it
        before the draw of *each* bin.

    Returns
    -------
    tuple ``(datanew_x, datanew_y)`` with the selected rows stacked bin by
    bin: shapes ``(divide_n*aver_num, n_features)`` and ``(divide_n*aver_num,)``.

    Raises
    ------
    ValueError
        If a bin contains fewer than ``aver_num`` samples.
    """
    # The empty float arrays at the head of each list keep the result (and its
    # dtype promotion) identical to stacking onto empty float buffers, and
    # give a well-formed empty result when divide_n is 0.
    picked_x = [np.zeros((0,data_x.shape[1]))]
    picked_y = [np.zeros(0)]

    for ii in range(divide_n):
        low,big = dividearr[ii],dividearr[ii+1]
        cond = np.where(np.logical_and(data_y>=low,data_y<=big))[0]
        available = cond.shape[0]
        if available < aver_num:
            raise ValueError(
                'bin %d [%s, %s] holds %d samples but %d were requested'
                % (ii, low, big, available, aver_num))

        # Re-seeding inside the loop is intentional: it keeps the subsets
        # produced by an existing seed unchanged.
        if seed is not None:
            np.random.seed(seed)
        verifyidx = np.random.choice(available, size=aver_num, replace=False)

        chosen = cond[verifyidx]
        picked_x.append(data_x[chosen,:])
        picked_y.append(data_y[chosen])

    # Stack once at the end instead of re-copying the growing arrays per bin.
    return np.vstack(picked_x),np.hstack(picked_y)

def raw2process(datax):
    """Build the six model features from the raw input columns.

    The output columns are, in order (labels taken from the original inline
    comments; presumably microwave channels and their gradient/polarization
    ratios -- confirm against the data description):

    0. raw column 1            ("7v")
    1. raw column 5            ("19V")
    2. last raw column         ("37v")
    3. (last - col5)/(last + col5)                 ("gr 37v/19v")
    4. (col5 - col1)/(col5 + col1)                 ("gr 19v/7v")
    5. (last - second-to-last)/(last + second-to-last)   ("pr37")

    Parameters
    ----------
    datax : 2-D array with one row per sample.

    Returns
    -------
    float array of shape ``(n_samples, 6)``.
    """
    def _normalized_difference(a, b):
        return (a-b)/(a+b)

    col_7v = datax[:,1]
    col_19v = datax[:,5]
    col_37v = datax[:,-1]
    col_before_last = datax[:,-2]

    features = np.zeros((datax.shape[0],6))
    features[:,0] = col_7v
    features[:,1] = col_19v
    features[:,2] = col_37v
    features[:,3] = _normalized_difference(col_37v, col_19v)
    features[:,4] = _normalized_difference(col_19v, col_7v)
    features[:,5] = _normalized_difference(col_37v, col_before_last)
    return features

def normal(data_xraw,data_yraw):
    """Min-max scale every input column and the target to the range [-1, 1].

    Parameters
    ----------
    data_xraw : 2-D array ``(n_samples, n_features)``.
    data_yraw : 1-D array of targets.

    Returns
    -------
    data_x : scaled copy of ``data_xraw``. Floating-point inputs keep their
        dtype; any other dtype is returned as float64.
    data_y : scaled targets.
    minmax : float array of shape ``(2, n_features + 1)``; row 0 holds the
        minima and row 1 the maxima. The last column describes the target.

    A column (or target) whose values are all equal has zero range; its range
    is treated as 1 so it maps to -1 instead of NaN.
    """
    def _span(lo, hi):
        # Avoid 0/0 for constant data; any non-zero divisor gives the same -1.
        width = hi-lo
        return width if width != 0 else 1.0

    # np.zeros_like would inherit an integer dtype from integer inputs and
    # silently truncate the scaled values to -1/0/1.
    if np.issubdtype(data_xraw.dtype, np.floating):
        data_x = np.zeros_like(data_xraw)
    else:
        data_x = np.zeros(data_xraw.shape, dtype=np.float64)

    minmax = np.zeros((2,data_x.shape[1]+1))
    for jj in range(data_x.shape[1]):
        column = data_xraw[:,jj]
        minmax[0,jj],minmax[1,jj] = np.min(column),np.max(column)
        data_x[:,jj] = 2*(column-minmax[0,jj])/_span(minmax[0,jj],minmax[1,jj])-1

    minmax[0,-1],minmax[1,-1] = np.min(data_yraw),np.max(data_yraw)
    data_y = 2*(data_yraw-minmax[0,-1])/_span(minmax[0,-1],minmax[1,-1])-1

    return data_x, data_y, minmax

def normal_out_func(pre,normal_out):
    """Map values from the scaled [-1, 1] range back to the original range.

    Parameters
    ----------
    pre : scalar or array of scaled values.
    normal_out : pair ``(minimum, maximum)`` of the original range.

    Returns
    -------
    ``pre`` rescaled so that -1 maps to the minimum and +1 to the maximum.
    """
    lower, upper = normal_out[0], normal_out[1]
    unit_fraction = (pre+1)/2   # position inside the range, 0..1
    return unit_fraction*(upper-lower)+lower
def get_predict(model,datan,normal_out):
    """Predict on already-scaled inputs and return values in original units.

    Parameters
    ----------
    model : object exposing ``predict``.
    datan : scaled input data handed to ``model.predict`` unchanged.
    normal_out : ``(minimum, maximum)`` of the target, used to undo the
        target scaling via ``normal_out_func``.
    """
    scaled_prediction = model.predict(datan)
    return normal_out_func(scaled_prediction,normal_out)
def process_predict(data_s,normal_in,normal_out,model):
    """Scale raw inputs with stored min/max values, predict, and un-scale.

    Parameters
    ----------
    data_s : 2-D array of raw inputs ``(n_samples, n_features)``.
    normal_in : array ``(2, n_features)``; row 0 holds the per-feature minima
        and row 1 the maxima used for scaling.
    normal_out : ``(minimum, maximum)`` of the target.
    model : object exposing ``predict``.

    Returns
    -------
    Predictions in the target's original units.
    """
    lower = normal_in[0,:]
    span = normal_in[1,:]-lower
    # A feature with identical stored minimum and maximum has zero range;
    # dividing by it would inject inf/NaN into the model input. Use a range
    # of 1 so a value equal to the stored minimum maps to -1.
    span = np.where(span == 0, 1, span)
    data_s2 = 2*(data_s-lower)/span-1
    return get_predict(model,data_s2,normal_out)

class normal_model:
    """Wrap a regressor so that it trains and predicts on min-max scaled data.

    Inputs and target are scaled to [-1, 1] with ``normal`` before fitting;
    predictions are mapped back to the original target units.

    Parameters
    ----------
    model : regressor exposing ``fit`` and ``predict``.
    istabnet : set to True for a TabNet regressor, which is fitted with a
        column-vector target and TabNet-specific training options.

    Attributes
    ----------
    X_train, y_train : scaled training data of the most recent fit.
    minmax : scaling table returned by ``normal`` (None until fitted).
    X_test, y_pre : inputs and outputs of the most recent ``predict`` call.
    """

    def __init__(self,model,istabnet=False):
        self.model = model
        self.istabnet = istabnet
        # None marks the wrapper as not fitted yet.
        self.X_train,self.y_train,self.minmax = None, None, None

    def _fit_normalized(self,data_xraw,data_yraw):
        """Scale the raw data, store it on the instance and fit the model."""
        # Drop any previous fit first so a failure below cannot leave stale
        # scaling parameters behind.
        self.X_train,self.y_train,self.minmax = None, None, None
        self.X_train,self.y_train,self.minmax = normal(data_xraw,data_yraw)

        if self.istabnet:
            # TabNet is given the target as a column vector.
            self.model.fit(self.X_train,self.y_train.reshape(-1, 1),
                           max_epochs=100,
                           patience=20,
                           batch_size=24, virtual_batch_size=12,
                           num_workers=0,
                           drop_last=False,
                           augmentations=RegressionSMOTE(p=0.2), #aug
                           )
        else:
            self.model.fit(self.X_train,self.y_train)

    def fit(self,data_xraw,data_yraw):
        """Fit the wrapped model on raw (unscaled) data and return ``self``."""
        self._fit_normalized(data_xraw,data_yraw)
        return self

    def predict(self,X_test):
        """Predict targets, in original units, for raw (unscaled) inputs.

        Raises
        ------
        AttributeError
            If the wrapper has not been fitted yet.
        """
        if getattr(self, 'minmax', None) is None:
            raise AttributeError(
                'normal_model is not fitted yet; call fit() before predict()')

        self.X_test = X_test
        normalize_in = self.minmax[:,:-1]
        normalize_out = self.minmax[:,-1]
        self.y_pre = process_predict(self.X_test,normalize_in,normalize_out,self.model)
        if self.istabnet:
            # Flatten the TabNet output to 1-D.
            self.y_pre = self.y_pre.reshape(-1)
        return self.y_pre

    def myscore(self,X_test,y_test,isshow=False):
        """Evaluate the fitted model on raw test data.

        Returns
        -------
        array ``[RMSE, R2, MAE, bias]`` where bias is the mean of
        ``prediction - truth``. The same values are printed when ``isshow``
        is true.
        """
        y_test_pred = self.predict(X_test)
        MSEtest,R2test = mean_squared_error(y_test,y_test_pred),r2_score(y_test,y_test_pred)
        MAEtest = mean_absolute_error(y_test,y_test_pred)
        biastest = np.mean(y_test_pred-y_test)
        if isshow:
            print('test-RMSE ',np.sqrt(MSEtest),' r2 ',R2test,'MAE',MAEtest,'bias',biastest)
        return np.array([np.sqrt(MSEtest),R2test,MAEtest,biastest])

    def importance(self,data_xraw,data_yraw):
        """Fit on the given raw data and return permutation importances.

        The importances are computed on the scaled training data itself
        (5 repeats, negative mean squared error as the score).

        Returns
        -------
        array with one mean importance per input feature.
        """
        self._fit_normalized(data_xraw,data_yraw)
        if self.istabnet:
            # Score TabNet against the same column-vector target it was fitted on.
            self.y_train = self.y_train.reshape(-1, 1)

        result = permutation_importance(self.model, self.X_train, self.y_train, n_repeats=5,scoring='neg_mean_squared_error')
        return result.importances_mean

def split_data_into_folds(data, labels, n_splits=5, shuffle=True, random_state=None):
    """Materialize the train/test partitions of a K-fold split.

    Parameters
    ----------
    data : array of samples, indexable with an index array along axis 0.
    labels : array of targets aligned with ``data``.
    n_splits, shuffle, random_state : forwarded unchanged to ``KFold``.

    Returns
    -------
    list with one tuple ``(train_data, train_labels, test_data, test_labels)``
    per fold.
    """
    splitter = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    return [
        (data[train_index], labels[train_index], data[test_index], labels[test_index])
        for train_index, test_index in splitter.split(data)
    ]

In [None]:
# NOTE: the CSV path is left as None here; replace it with the location of the
# input table before running this cell.
raw_table = pd.read_csv(None)
data0 = np.array(raw_table)

# Column 1 is the target; columns 2..11 are the ten raw input columns.
data_x11,data_yraw1 = splitlabel(data0,input=range(2,12),output=1)

In [None]:
'''
K-fold

You need to create a folder named 'K_fold'
'''

num_epoch = 100
num_fold = 5
colomn = ['knn-u','et','tab','gb','knn-d','svm','xgb','rf','lgb','cb','adb','lce','mlp']


def build_kfold_models():
    """Return fresh, unfitted wrappers for all regressors, ordered like `colomn`."""
    return [
        normal_model(KNeighborsRegressor(weights='uniform')),
        normal_model(ExtraTreesRegressor(n_estimators=100)),
        normal_model(TabNetRegressor(verbose=0),True),
        normal_model(GradientBoostingRegressor(learning_rate=0.01,n_estimators=100)),
        normal_model(KNeighborsRegressor(weights='distance')),
        normal_model(svm.SVR(C=30,epsilon=0.06,gamma=0.4)),
        normal_model(XGBRegressor()),
        normal_model(RandomForestRegressor(n_estimators=100)),
        normal_model(lgb.LGBMRegressor(metric='rmse')),
        normal_model(CatBoostRegressor(
            iterations=1000,
            learning_rate=0.1,
            depth=7,
            loss_function='RMSE',
            verbose=100
        )),
        normal_model(AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=500)),
        normal_model(LCERegressor()),
        # np.random.randint(0, 100001, 1) draws from the inclusive range
        # 0..100000, replacing the deprecated np.random.random_integers.
        normal_model(MLPRegressor(solver='adam', alpha=1e-5,hidden_layer_sizes=(64, 64), random_state=np.random.randint(0,100001,1)[0])),
    ]


for epoch in range(num_epoch):

    # Draw a balanced subset: 36 samples from each of the 4 target bins.
    data_x1,data_yraw = average_choose_data(data_x11,data_yraw1,
                                              [ 0.056, 0.123 , 0.1896, 0.2561, 0.3895],
                                              4,36,np.random.randint(0,100001,1)[0]) # you can change the seed to get fixed result
    data_xraw = raw2process(data_x1)

    folds = split_data_into_folds(data_xraw, data_yraw, n_splits=num_fold, shuffle=True, random_state=np.random.randint(0,100001,1)[0])

    # result[k, i, :] = [RMSE, R2, MAE, bias] of model k on fold i
    result = np.zeros((len(colomn),num_fold,4))

    for i, (train_data, train_labels, test_data, test_labels) in enumerate(folds):
        # All models are built first, then fitted, then scored, so random
        # numbers are consumed in the same order as with one statement per model.
        models = build_kfold_models()

        for wrapped in models:
            wrapped.fit(train_data, train_labels)

        for k, wrapped in enumerate(models):
            result[k,i,:] = wrapped.myscore(test_data, test_labels)

    # Average over folds, then transpose to (metric, model) for the CSV.
    result = np.mean(result,axis=1)
    result = result.T

    df = pd.DataFrame(result,columns=colomn)
    # NOTE(review): the backslash is a Windows path separator; the statistics
    # cell reads the files back under exactly this name.
    df.to_csv('K_fold\\'+str(epoch+1)+'.csv',index=False)

In [None]:
# Compute the statistical features

# statis[m, k, e]: metric m (4 rows per CSV) of model k (13 columns) in repetition e
statis = np.zeros((4,13,num_epoch))
for epoch in range(num_epoch):
    epoch_table = pd.read_csv('K_fold\\'+str(epoch+1)+'.csv')
    statis[:,:,epoch] = np.array(epoch_table)

# Mean and standard deviation across repetitions, ignoring NaN entries.
statis_mean = np.nanmean(statis,axis=2)
statis_std = np.nanstd(statis,axis=2)

for summary, out_name in ((statis_mean,'statis_mean.csv'),(statis_std,'statis_std.csv')):
    df = pd.DataFrame(summary,columns=colomn)
    df.to_csv(out_name,index=False)

In [None]:
'''
Importance analysis

You need to create a folder named 'importance'
'''

num_epoch = 100
column = ['knn','et','tab']
# importances[e, f, m]: permutation importance of feature f for model m in repetition e
importances = np.zeros((num_epoch,6,3))
for epoch in range(num_epoch):
    # Draw a balanced subset: 36 samples from each of the 4 target bins.
    # np.random.randint(0, 100001, 1) draws from the inclusive range
    # 0..100000, replacing the deprecated np.random.random_integers.
    data_x1,data_yraw = average_choose_data(data_x11,data_yraw1,
                                              [ 0.056, 0.123 , 0.1896, 0.2561, 0.3895],
                                              4,36,np.random.randint(0,100001,1)[0])
    data_xraw = raw2process(data_x1)

    # Same order as `column`; every repetition starts from unfitted models.
    models = [
        normal_model(KNeighborsRegressor(weights='uniform')),
        normal_model(ExtraTreesRegressor(n_estimators=100)),
        normal_model(TabNetRegressor(verbose=0),True),
    ]
    for k, wrapped in enumerate(models):
        importances[epoch,:,k] = wrapped.importance(data_xraw,data_yraw)

    df = pd.DataFrame(importances[epoch,:,:],columns=column)
    # NOTE(review): the backslash is a Windows path separator.
    df.to_csv('importance\\'+str(epoch+1)+'.csv',index=False)

# Average the per-repetition importances and store the summary table.
importances = np.mean(importances,axis=0)
df = pd.DataFrame(importances,columns=column)
df.to_csv('importance.csv',index=False)
