In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (11, 11)  #set default figure size
import numpy as np
import os 
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, DotProduct, ConstantKernel as C
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from scipy.optimize import leastsq
import matplotlib.pylab as plt
from sklearn import preprocessing



In [2]:
def mean_absolute_percentage_error(y, y_pred):
    """Return the mean absolute percentage error (MAPE) in percent.

    MAPE = 100 * mean(|y - y_pred| / |y|), evaluated only over samples whose
    true value is non-zero (the ratio is undefined where y == 0).

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values; must contain as many elements as ``y``.

    Returns
    -------
    float
        MAPE in percent, or ``nan`` when no sample has a non-zero true value.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not contain the same number of elements.
    """
    # Flatten both inputs so (n, 1) and (n,) arrays pair element-wise instead
    # of broadcasting into an (n, n) matrix.
    y = np.asarray(y, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y.shape != y_pred.shape:
        raise ValueError(
            "y and y_pred must have the same number of elements, got "
            f"{y.size} and {y_pred.size}"
        )

    # Boolean mask instead of comparing an index array against NaN, which was
    # ambiguous (and raised) as soon as more than one zero was present.
    nonzero = y != 0
    if not nonzero.any():
        return np.nan

    abs_pct_err = np.abs((y[nonzero] - y_pred[nonzero]) / y[nonzero])
    return 100 * np.mean(abs_pct_err)

In [3]:
## Separate training, test and validation data
def split(xx, yy):
    """Scale the data and split it into training, test and validation sets.

    The predictors are standardised with ``StandardScaler`` and the target is
    rescaled with ``MinMaxScaler``. The scaled data is then split 60 % / 40 %
    (train / rest), and the rest is split 75 % / 25 % (test / validation),
    both with ``random_state=1``.

    Parameters
    ----------
    xx : array-like of shape (n_samples, n_features)
        Predictor matrix.
    yy : array-like of shape (n_samples,)
        Target values (e.g. a pandas Series).

    Returns
    -------
    tuple
        ``(x_train, x_test, x_val, y_train, y_test, y_val, xx, yy,
        ind_train, ind_test, ind_val)`` where ``xx`` is the scaled predictor
        matrix, ``yy`` the scaled target as an ``(n_samples, 1)`` array, the
        ``y_*`` subsets are 1-D, and the ``ind_*`` arrays hold the positional
        row indices (into the scaled ``xx`` / ``yy``) of each subset.
    """
    zscore = StandardScaler()
    scaler = MinMaxScaler()
    # NOTE(review): both scalers are fitted on the full data set before the
    # split, so test/validation statistics influence the training features.
    # Kept as-is because callers rely on the fully scaled xx / yy returned here.
    xx = zscore.fit_transform(xx)
    yy = scaler.fit_transform(np.asarray(yy).reshape(-1, 1))

    # Carry the row positions through the same splits instead of recovering
    # them afterwards by matching target values: value matching returns the
    # first occurrence for every duplicated target and is quadratic in size.
    # The extra array does not alter how the x / y rows are partitioned.
    positions = np.arange(len(yy))
    x_train, x_test_val, y_train, y_test_val, ind_train, ind_test_val = train_test_split(
        xx, yy, positions, shuffle=True, test_size=0.4, random_state=1)
    x_test, x_val, y_test, y_val, ind_test, ind_val = train_test_split(
        x_test_val, y_test_val, ind_test_val, shuffle=True, test_size=0.25, random_state=1)

    y_train = y_train.ravel()
    y_test = y_test.ravel()
    y_val = y_val.ravel()
    return (x_train, x_test, x_val, y_train, y_test, y_val, xx, yy,
            np.asarray(ind_train), np.asarray(ind_test), np.asarray(ind_val))

In [4]:
def scatter_plots(model_name,y_pred, y_pred_train, y_pred_test, y_pred_val, yy, y_scaled, y_train, y_test, y_val) :
    """Scatter predicted against true values for the train/test/val subsets.

    Every point of the full data set is additionally annotated with its row
    label taken from the notebook-level DataFrame ``df``.

    Relies on two names that are not defined inside this function:
    ``df`` (module-level DataFrame whose index supplies the labels) and
    ``inverse_scaler`` (not defined in the visible notebook cells; presumably
    maps scaled values back to the units of ``yy`` - TODO confirm).

    Parameters
    ----------
    model_name : str
        Used as the figure title.
    y_pred : array-like
        Scaled predictions for the full data set, in the same row order as
        ``df.index``.
    y_pred_train, y_pred_test, y_pred_val : array-like
        Scaled predictions for the respective subsets.
    yy : pandas.Series
        Unscaled target, indexed by the labels of ``df``.
    y_scaled : array-like
        Scaled target for the full data set. Not used by the plot; kept so
        existing calls keep working.
    y_train, y_test, y_val : array-like
        Scaled true targets of the respective subsets.
    """
    y_pred2 = inverse_scaler(y_pred,yy)
    y_pred_train2 = inverse_scaler(y_pred_train,yy)
    y_pred_test2 = inverse_scaler(y_pred_test,yy)
    y_pred_val2 = inverse_scaler(y_pred_val,yy)
    plt.scatter(inverse_scaler(y_train,yy),y_pred_train2, label='train')
    plt.scatter(inverse_scaler(y_test,yy),y_pred_test2, label='test')
    plt.scatter(inverse_scaler(y_val,yy),y_pred_val2, label='val')
    plt.plot(np.linspace(0.1,10,10),np.linspace(0.1,10,10),c='r', label='1:1')

    # Labels whose prediction (rounded to 2 decimals) was already seen get a
    # growing text offset so they do not sit on top of each other. The lookup
    # and the stored value use the same precision; previously the lookup
    # rounded to 2 decimals but stored 3, so repeats were rarely detected.
    seen_values = []
    offset = 5  # initialised up front so the else-branch can never hit an undefined name
    for i, label in enumerate(df.index):
        rounded = np.round(y_pred2[i], 2)
        if rounded not in seen_values:
            seen_values.append(rounded)
            offset = 5
        else:
            offset = offset + 15
        # yy is looked up by label, y_pred2 by position: both follow df's row order.
        plt.annotate(label, xy=(yy[label],y_pred2[i]),xytext=(offset, offset), textcoords='offset points', ha='left', va='bottom',size=20)
    plt.title(model_name)
    plt.legend()
    plt.xlabel('True values')
    plt.ylabel('predictive values')

In [6]:
# Load the raw table and keep it untouched; the cleaned frame gets its own name
# so the raw data stays available and nothing is mutated in place.
raw_df = pd.read_csv('data_6.csv', index_col=None, engine='c')
print('original datasize:', raw_df.shape)
# Drop rows with any missing value. The original row labels are kept (not
# reset), so the index of df may contain gaps.
df = raw_df.dropna()
print('adjusted datasize:', df.shape)
print('data:')
print(df.head())

original datasize: (475, 11)
adjusted datasize: (474, 11)
data:
           Kd   SSA        C%       H/C       O/C     E     S     A     B  \
0   24.014337  0.04  0.683575  0.147232  0.157833  1.73  1.55  0.47  0.45   
1   37.981913  0.04  0.632581  0.139353  0.220736  1.73  1.55  0.47  0.45   
2   50.563768  0.04  0.592297  0.132170  0.278086  1.73  1.55  0.47  0.45   
3  103.582271  0.04  0.444276  0.094589  0.578131  1.73  1.55  0.47  0.45   
4   66.880171  0.04  0.654811  0.142939  0.192110  1.73  1.55  0.47  0.45   

        V   log(Kd)  
0  1.8088  1.380471  
1  1.8088  1.579577  
2  1.8088  1.703839  
3  1.8088  2.015285  
4  1.8088  1.825297  


In [7]:
# Target: the last column of df (printed below as 'log(Kd)').
yy = df.iloc[:, -1]
# Predictors: every column between the first and the last one, i.e. the
# first column (raw Kd) and the target itself are both excluded.
xx = df.iloc[:, 1:-1]

# Scale and split; x_scaled / y_scaled are the scaled full data set and the
# ind_* arrays are the row positions of each subset.
(
    x_train, x_test, x_val,
    y_train, y_test, y_val,
    x_scaled, y_scaled,
    ind_train, ind_test, ind_val,
) = split(xx, yy)


print('target is ', yy.name)
print('predictors are ', xx.columns.tolist())
print('samples size is ', xx.shape[0], '; number of predictors is ', xx.shape[1])
print('training data size is ', x_train.shape[0], '; test data size is ', x_test.shape[0],'; validation data size is ', x_val.shape[0])

target is  log(Kd)
predictors are  ['SSA', 'C%', 'H/C', 'O/C', 'E', 'S', 'A', 'B', 'V']
samples size is  474 ; number of predictors is  9
training data size is  284 ; test data size is  142 ; validation data size is  48


In [None]:
# One initial RBF length scale per predictor column.
d = np.ones(x_train.shape[1])
# Kernel = constant * RBF + white noise.
kernel = (
    C(constant_value=1.0, constant_value_bounds=(1e-2, 1e2))
    * RBF(length_scale=d, length_scale_bounds=(1e-6, 1e6))
    + WhiteKernel(noise_level=0.5)
)
# Alternative kernel that was tried:
#kernel =DotProduct(sigma_0=0, sigma_0_bounds='fixed')+WhiteKernel(0.001)
gpr = GaussianProcessRegressor(
    kernel=kernel,
    random_state=0,
    n_restarts_optimizer=100,
)
# fit() returns the estimator itself; re-assigning keeps the cell free of output.
gpr = gpr.fit(x_train, y_train)

In [None]:
# Predictions on the full scaled data set and on each subset. The subset
# predictions are computed here because no other visible cell defines them,
# so the plotting call below would otherwise raise NameError on a fresh kernel.
y_pred = gpr.predict(x_scaled)
y_pred_train = gpr.predict(x_train)
y_pred_test = gpr.predict(x_test)
y_pred_val = gpr.predict(x_val)
scatter_plots('GPR',y_pred, y_pred_train, y_pred_test, y_pred_val, yy, y_scaled, y_train, y_test, y_val) 

In [None]:
import pickle
# save gpr model
model_name = 'gpr.pkl'  # model file name (.pkl)
path = ''  # target directory; empty string means the current working directory
# Build the file name from model_name (it was previously hard-coded a second
# time) and join it to the directory with os.path.join rather than string
# concatenation.
with open(os.path.join(path, model_name), 'wb') as f:
    pickle.dump(gpr, f)