In [None]:
#functions needed for model building
import itertools
import numpy as np
import tensorflow as tf
from tensorflow import keras
from aquarel import load_theme
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model, load_model
from sklearn.ensemble import GradientBoostingRegressor as GBR

# 生成基模型
def getBaseModel(X_train,y_train,X_test,y_test,unit,epochs=120,learning_rate=0.00005,batch_size=32,k=10,fig=True):
    '''
    Train the base ANN regressor on a fixed train/test split and return it.

    Features are standardised with a StandardScaler fitted on the training
    features only. The test split is handed to Keras as validation data, so
    its loss is tracked during training.

    Parameters
    ----------
    X_train, X_test : training / test feature matrices.
    y_train, y_test : targets matching the two feature matrices.
    unit : unit string inserted in the scatter-plot axis labels.
    epochs : number of training epochs.
    learning_rate : forwarded to build_model.
    batch_size, k : kept for signature compatibility; not used here.
    fig : when true, draw the true-vs-predicted scatter for the test split.

    Returns
    -------
    The fitted Keras model produced by build_model.
    '''
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(X_train)
    x_test_scaled = scaler.transform(X_test)

    sklearn_model = build_model(x_test_scaled,learning_rate=learning_rate)
    sklearn_model.summary()

    history = sklearn_model.fit(x_train_scaled, y_train,
                                epochs=epochs,
                                validation_data=(x_test_scaled, y_test),
                                verbose=0)
    # Plot the learning curves.
    print(history.params)
    print_history(history)

    y_test_pred = sklearn_model.predict(x_test_scaled)
    y_train_pred = sklearn_model.predict(x_train_scaled)

    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric, so the
    # order matters. RMSE is computed as sqrt(MSE) because the `squared` flag
    # is deprecated in recent scikit-learn releases.
    print(f'Train_rmse: {np.sqrt(mean_squared_error(y_train, y_train_pred))}')
    print(f'Train_R2: {r2_score(y_train, y_train_pred)}')
    print(f'Test_rmse: {np.sqrt(mean_squared_error(y_test, y_test_pred))}')
    print(f'Test_R2: {r2_score(y_test, y_test_pred)}')
    print(f'Train Dataset: {len(x_train_scaled)}')
    if fig:
        figAccScatter(y_test,y_test_pred,unit=unit)

    return sklearn_model

def ANN_DL_byCV(X,y,unit,k=10,patience=80,learning_rate=0.00005,fig=True):
    '''
    Evaluate a freshly built ANN regressor with shuffled k-fold cross-validation.

    For every fold a new model is built with build_model, the features are
    standardised with a scaler fitted on that fold's training rows, and the
    model is trained for up to 1000 epochs with early stopping.

    NOTE(review): the held-out fold is also passed as validation_data, which
    is what the EarlyStopping callback watches, so the reported test scores
    are not fully independent of training.

    Parameters
    ----------
    X : feature table; rows are selected with ``.iloc``.
    y : targets, indexed positionally in step with X.
    unit : unit string inserted in the scatter-plot axis labels.
    k : number of folds.
    patience : early-stopping patience in epochs.
    learning_rate : forwarded to build_model.
    fig : when true, draw the pooled true-vs-predicted scatter plot.

    Returns
    -------
    [train_rmse, train_R2, test_rmse, test_R2]. The training scores are the
    mean over folds; the test scores are computed once on the pooled
    out-of-fold predictions.
    '''
    kfold = KFold(n_splits=k, shuffle=True, random_state=1234)
    # Index targets by position so they stay aligned with X.iloc even when y
    # is a pandas Series with a non-default index.
    y_values = np.asarray(y)

    y_test_pred_list,y_test_true_list = [],[]
    y_train_rmse_list,y_train_r2_list = [],[]
    for count, (train_i, test_i) in enumerate(kfold.split(X), start=1):
        print(f'-------================== LOOP {count}/{k} ===================-------')
        x_train = X.iloc[train_i,:]
        y_train = y_values[train_i]
        x_test = X.iloc[test_i,:]
        y_test = y_values[test_i]
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        sklearn_model = build_model(x_test_scaled,learning_rate=learning_rate)
        sklearn_model.summary()
        callbacks = [keras.callbacks.EarlyStopping(patience=patience, min_delta=1e-2)]

        history = sklearn_model.fit(x_train_scaled, y_train,
                                    epochs=1000,
                                    validation_data=(x_test_scaled, y_test),
                                    callbacks=callbacks,
                                    verbose=0)
        # Plot the learning curves.
        print(history.params)
        print_history(history)

        y_test_pred = sklearn_model.predict(x_test_scaled)
        y_train_pred = sklearn_model.predict(x_train_scaled)
        y_test_pred_list.append(y_test_pred)
        y_test_true_list.append(y_test)

        # Metrics take (y_true, y_pred); R2 is not symmetric.
        y_train_rmse_list.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        y_train_r2_list.append(r2_score(y_train, y_train_pred))

    # Pool the out-of-fold predictions before scoring the test side.
    y_test_pred_list = list(itertools.chain.from_iterable(y_test_pred_list))
    y_test_true_list = list(itertools.chain.from_iterable(y_test_true_list))
    test_rmse = np.sqrt(mean_squared_error(y_test_true_list, y_test_pred_list))
    test_R2 = r2_score(y_test_true_list, y_test_pred_list)
    train_rmse = np.mean(y_train_rmse_list)
    train_R2 = np.mean(y_train_r2_list)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test_true_list,y_test_pred_list,unit=unit)
    return [train_rmse,train_R2,test_rmse,test_R2]

def ANN_DL_byLOO(X,y,unit,learning_rate=0.00005,patience=80,fig=True):
    '''
    Evaluate a freshly built ANN regressor with leave-one-out validation.

    One model is built and trained per sample: all other rows form the
    training set (standardised with a scaler fitted on them) and the single
    left-out row is both the validation data for early stopping and the test
    point.

    Parameters
    ----------
    X : feature table; rows are selected with ``.iloc``.
    y : targets, indexed positionally in step with X.
    unit : unit string inserted in the scatter-plot axis labels.
    learning_rate : forwarded to build_model.
    patience : early-stopping patience in epochs.
    fig : when true, draw the pooled true-vs-predicted scatter plot.

    Returns
    -------
    [train_rmse, train_R2, test_rmse, test_R2]. The training scores are the
    mean over all splits; the test scores are computed once on the pooled
    left-out predictions.
    '''
    loo = LeaveOneOut()
    # Index targets by position so they stay aligned with X.iloc even when y
    # is a pandas Series with a non-default index.
    y_values = np.asarray(y)

    y_test_pred_list,y_test_true_list = [],[]
    y_train_rmse_list,y_train_r2_list = [],[]
    for count, (train_i, test_i) in enumerate(loo.split(X), start=1):
        print(f'-------================== LOOP {count}/{len(y)} ===================-------')
        x_train = X.iloc[train_i,:]
        y_train = y_values[train_i]
        x_test = X.iloc[test_i,:]
        y_test = y_values[test_i]
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        sklearn_model = build_model(x_test_scaled,learning_rate=learning_rate)
        sklearn_model.summary()
        callbacks = [keras.callbacks.EarlyStopping(patience=patience, min_delta=1e-2)]

        history = sklearn_model.fit(x_train_scaled, y_train,
                                    epochs=1000,
                                    validation_data=(x_test_scaled, y_test),
                                    callbacks=callbacks,
                                    verbose=0)
        # Plot the learning curves.
        print(history.params)
        print_history(history)

        y_test_pred = sklearn_model.predict(x_test_scaled)
        y_train_pred = sklearn_model.predict(x_train_scaled)
        y_test_pred_list.append(y_test_pred)
        y_test_true_list.append(y_test)

        # Metrics take (y_true, y_pred); R2 is not symmetric.
        y_train_rmse_list.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        y_train_r2_list.append(r2_score(y_train, y_train_pred))

    # Pool the left-out predictions; R2 is undefined on a single sample, so
    # the test scores are only computed on the pooled set.
    y_test_pred_list = list(itertools.chain.from_iterable(y_test_pred_list))
    y_test_true_list = list(itertools.chain.from_iterable(y_test_true_list))
    test_rmse = np.sqrt(mean_squared_error(y_test_true_list, y_test_pred_list))
    test_R2 = r2_score(y_test_true_list, y_test_pred_list)
    train_rmse = np.mean(y_train_rmse_list)
    train_R2 = np.mean(y_train_r2_list)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test_true_list,y_test_pred_list,unit=unit)
    return [train_rmse,train_R2,test_rmse,test_R2]

def ANN_DL_byTest(X_train,y_train,X_test,y_test,unit,epochs=120,learning_rate=0.00005,batch_size=32,fig=True):
    '''
    Train an ANN regressor from scratch and score it on an independent test set.

    Features are standardised with a StandardScaler fitted on the training
    features only. The model trains for a fixed number of epochs (no early
    stopping), with the test split passed to Keras as validation data.

    Parameters
    ----------
    X_train, X_test : training / test feature matrices.
    y_train, y_test : targets matching the two feature matrices.
    unit : unit string inserted in the scatter-plot axis labels.
    epochs : number of training epochs.
    learning_rate : forwarded to build_model.
    batch_size : kept for signature compatibility; not used here.
    fig : when true, draw the true-vs-predicted scatter for the test split.

    Returns
    -------
    The fitted Keras model.
    '''
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(X_train)
    x_test_scaled = scaler.transform(X_test)

    sklearn_model = build_model(x_test_scaled,learning_rate=learning_rate)
    sklearn_model.summary()

    history = sklearn_model.fit(x_train_scaled, y_train,
                                epochs=epochs,
                                validation_data=(x_test_scaled, y_test),
                                verbose=0)
    # Plot the learning curves.
    print(history.params)
    print_history(history)

    y_test_pred = sklearn_model.predict(x_test_scaled)
    y_train_pred = sklearn_model.predict(x_train_scaled)

    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric, so the
    # order matters. RMSE is computed as sqrt(MSE) because the `squared` flag
    # is deprecated in recent scikit-learn releases.
    print(f'Train_rmse: {np.sqrt(mean_squared_error(y_train, y_train_pred))}')
    print(f'Train_R2: {r2_score(y_train, y_train_pred)}')
    print(f'Test_rmse: {np.sqrt(mean_squared_error(y_test, y_test_pred))}')
    print(f'Test_R2: {r2_score(y_test, y_test_pred)}')
    if fig:
        figAccScatter(y_test,y_test_pred,unit=unit)

    return sklearn_model



# 迁移学习后观察交叉验证结果
def ANN_TL_byCV(X,y,importbaseModelPath,unit,patience=80,k=10,importbaseModel=None,learning_rate=0.00005,batch_size=32,fig=True):
    '''
    Evaluate transfer learning from a base ANN with shuffled k-fold cross-validation.

    In every fold the base network is obtained, its final layer is replaced by
    a new single-unit linear Dense layer attached to the second-to-last
    layer's output, and the whole network is re-compiled and trained for up
    to 1000 epochs with early stopping.

    NOTE(review): the held-out fold is also passed as validation_data, which
    is what the EarlyStopping callback watches, so the reported test scores
    are not fully independent of training.

    Parameters
    ----------
    X : feature table; rows are selected with ``.iloc``.
    y : targets, indexed positionally in step with X.
    importbaseModelPath : weights file loaded into a fresh build_model
        network; only used when importbaseModel is None.
    unit : unit string inserted in the scatter-plot axis labels.
    patience : early-stopping patience in epochs.
    k : number of folds.
    importbaseModel : optional in-memory base model used instead of the file.
    learning_rate : learning rate of the Adam optimizer.
    batch_size : kept for signature compatibility; not used here.
    fig : when true, draw the pooled true-vs-predicted scatter plot.

    Returns
    -------
    [train_rmse, train_R2, test_rmse, test_R2]. The training scores are the
    mean over folds; the test scores are computed once on the pooled
    out-of-fold predictions.
    '''
    kfold = KFold(n_splits=k, shuffle=True, random_state=1234)
    # Index targets by position so they stay aligned with X.iloc even when y
    # is a pandas Series with a non-default index.
    y_values = np.asarray(y)

    y_test_pred_list,y_test_true_list = [],[]
    y_train_rmse_list,y_train_r2_list = [],[]
    for count, (train_i, test_i) in enumerate(kfold.split(X), start=1):
        x_train = X.iloc[train_i,:]
        y_train = y_values[train_i]
        x_test = X.iloc[test_i,:]
        y_test = y_values[test_i]
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        # Obtain a base network private to this fold.
        if importbaseModel is None:
            print(f'---------========== learning by import baseModel file {count}/{k}============-----------')
            base_model = build_model(x_test_scaled,learning_rate=learning_rate)
            base_model.load_weights(importbaseModelPath)
        else:
            print(f'---------========== learning by import baseModel variables(object) {count}/{k}===========------------')
            # Clone the supplied model: building directly on its layers would
            # share them across folds, so weights fine-tuned on one fold's
            # training rows would carry over into the next fold.
            base_model = keras.models.clone_model(importbaseModel)
            base_model.set_weights(importbaseModel.get_weights())

        # Swap the original output layer for a fresh linear regression head.
        new_head = Dense(1, activation=None)(base_model.layers[-2].output)
        sklearn_model = Model(inputs=base_model.input, outputs=[new_head])

        sklearn_model.summary()
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        sklearn_model.compile(loss='mse', optimizer=optimizer,
                              metrics=[keras.metrics.RootMeanSquaredError()])
        callbacks = [keras.callbacks.EarlyStopping(patience=patience, min_delta=1e-2)]

        history = sklearn_model.fit(x_train_scaled, y_train,
                                    epochs=1000,
                                    validation_data=(x_test_scaled, y_test),
                                    callbacks=callbacks,
                                    verbose=0)
        # Plot the learning curves.
        print(history.params)
        print_history(history)

        y_test_pred = sklearn_model.predict(x_test_scaled)
        y_train_pred = sklearn_model.predict(x_train_scaled)
        y_test_pred_list.append(y_test_pred)
        y_test_true_list.append(y_test)

        # Metrics take (y_true, y_pred); R2 is not symmetric.
        y_train_rmse_list.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        y_train_r2_list.append(r2_score(y_train, y_train_pred))

    # Pool the out-of-fold predictions before scoring the test side.
    y_test_pred_list = list(itertools.chain.from_iterable(y_test_pred_list))
    y_test_true_list = list(itertools.chain.from_iterable(y_test_true_list))
    test_rmse = np.sqrt(mean_squared_error(y_test_true_list, y_test_pred_list))
    test_R2 = r2_score(y_test_true_list, y_test_pred_list)
    train_rmse = np.mean(y_train_rmse_list)
    train_R2 = np.mean(y_train_r2_list)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test_true_list,y_test_pred_list,unit=unit)
    return [train_rmse,train_R2,test_rmse,test_R2]

# Transfer learning evaluated with leave-one-out validation

def ANN_TL_byLOO(X,y,importbaseModelPath,unit,patience=80,k=10,importbaseModel=None,learning_rate=0.00005,batch_size=32,fig=True):
    '''
    Evaluate transfer learning from a base ANN with leave-one-out validation.

    For every left-out sample the base network is obtained, its final layer
    is replaced by a new single-unit linear Dense layer attached to the
    second-to-last layer's output, and the whole network is re-compiled and
    trained for up to 1000 epochs with early stopping. The single left-out
    row is both the validation data and the test point.

    Parameters
    ----------
    X : feature table; rows are selected with ``.iloc``.
    y : targets, indexed positionally in step with X.
    importbaseModelPath : weights file loaded into a fresh build_model
        network; only used when importbaseModel is None.
    unit : unit string inserted in the scatter-plot axis labels.
    patience : early-stopping patience in epochs.
    k, batch_size : kept for signature compatibility; not used here.
    importbaseModel : optional in-memory base model used instead of the file.
    learning_rate : learning rate of the Adam optimizer.
    fig : when true, draw the pooled true-vs-predicted scatter plot.

    Returns
    -------
    [train_rmse, train_R2, test_rmse, test_R2]. The training scores are the
    mean over all splits; the test scores are computed once on the pooled
    left-out predictions.
    '''
    loo = LeaveOneOut()
    # Index targets by position so they stay aligned with X.iloc even when y
    # is a pandas Series with a non-default index.
    y_values = np.asarray(y)

    y_test_pred_list,y_test_true_list = [],[]
    y_train_rmse_list,y_train_r2_list = [],[]
    for count, (train_i, test_i) in enumerate(loo.split(X), start=1):
        x_train = X.iloc[train_i,:]
        y_train = y_values[train_i]
        x_test = X.iloc[test_i,:]
        y_test = y_values[test_i]
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        # Obtain a base network private to this split.
        if importbaseModel is None:
            print(f'---------========== learning by import baseModel file {count}/{len(y)}============-----------')
            base_model = build_model(x_test_scaled,learning_rate=learning_rate)
            base_model.load_weights(importbaseModelPath)
        else:
            print(f'---------========== learning by import baseModel variables(object) {count}/{len(y)}===========------------')
            # Clone the supplied model: building directly on its layers would
            # share them across splits, so weights fine-tuned on one split
            # would carry over into the next one.
            base_model = keras.models.clone_model(importbaseModel)
            base_model.set_weights(importbaseModel.get_weights())

        # Swap the original output layer for a fresh linear regression head.
        new_head = Dense(1, activation=None)(base_model.layers[-2].output)
        sklearn_model = Model(inputs=base_model.input, outputs=[new_head])

        sklearn_model.summary()
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        sklearn_model.compile(loss='mse', optimizer=optimizer,metrics=[keras.metrics.RootMeanSquaredError()])
        callbacks = [keras.callbacks.EarlyStopping(patience=patience, min_delta=1e-2)]

        history = sklearn_model.fit(x_train_scaled, y_train,
                                    epochs=1000,
                                    validation_data=(x_test_scaled, y_test),
                                    callbacks=callbacks,
                                    verbose=0)
        # Plot the learning curves.
        print(history.params)
        print_history(history)

        y_test_pred = sklearn_model.predict(x_test_scaled)
        y_train_pred = sklearn_model.predict(x_train_scaled)
        y_test_pred_list.append(y_test_pred)
        y_test_true_list.append(y_test)

        # Metrics take (y_true, y_pred); R2 is not symmetric.
        y_train_rmse_list.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        y_train_r2_list.append(r2_score(y_train, y_train_pred))

    # Pool the left-out predictions before scoring the test side.
    y_test_pred_list = list(itertools.chain.from_iterable(y_test_pred_list))
    y_test_true_list = list(itertools.chain.from_iterable(y_test_true_list))
    test_rmse = np.sqrt(mean_squared_error(y_test_true_list, y_test_pred_list))
    test_R2 = r2_score(y_test_true_list, y_test_pred_list)
    train_rmse = np.mean(y_train_rmse_list)
    train_R2 = np.mean(y_train_r2_list)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test_true_list,y_test_pred_list,unit=unit)
    return [train_rmse,train_R2,test_rmse,test_R2]


def ANN_TL_byTest(X_train,y_train,X_test,y_test,importbaseModelPath,unit,epochs=120,importbaseModel=None,learning_rate=0.00005,batch_size=32,fig=True):
    '''
    Fine-tune a base ANN on a new task and score it on an independent test set.

    The base network's final layer is replaced by a new single-unit linear
    Dense layer attached to the second-to-last layer's output. The resulting
    model is re-compiled and trained for a fixed number of epochs, with the
    test split passed to Keras as validation data.

    Parameters
    ----------
    X_train, X_test : training / test feature matrices.
    y_train, y_test : targets matching the two feature matrices.
    importbaseModelPath : weights file loaded into a fresh build_model
        network; only used when importbaseModel is None.
    unit : unit string inserted in the scatter-plot axis labels.
    epochs : number of training epochs.
    importbaseModel : optional in-memory base model used instead of the file.
    learning_rate : learning rate of the Adam optimizer.
    batch_size : kept for signature compatibility; not used here.
    fig : when true, draw the true-vs-predicted scatter for the test split.

    Returns
    -------
    The fine-tuned Keras model.
    '''
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(X_train)
    x_test_scaled = scaler.transform(X_test)
    print(y_test)

    # Obtain the base network to transfer from.
    if importbaseModel is None:
        print(f'---------========== learning by import baseModel file ============-----------')
        base_model = build_model(x_test_scaled,learning_rate=learning_rate)
        base_model.load_weights(importbaseModelPath)
    else:
        print(f'---------========== learning by import baseModel variables(object) ===========------------')
        # NOTE: the new model is built on importbaseModel's own layers, so
        # training below also updates the weights of the object passed in.
        base_model = importbaseModel

    # Swap the original output layer for a fresh linear regression head.
    new_head = Dense(1, activation=None)(base_model.layers[-2].output)
    sklearn_model = Model(inputs=base_model.input, outputs=[new_head])

    sklearn_model.summary()
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    sklearn_model.compile(loss='mse', optimizer=optimizer,metrics=[keras.metrics.RootMeanSquaredError()])

    history = sklearn_model.fit(x_train_scaled, y_train,
                                epochs=epochs,
                                validation_data=(x_test_scaled, y_test),
                                verbose=2)

    # Plot the learning curves.
    print(history.params)
    print_history(history)

    y_test_pred = sklearn_model.predict(x_test_scaled)
    y_train_pred = sklearn_model.predict(x_train_scaled)

    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric, so the
    # order matters. RMSE is computed as sqrt(MSE) because the `squared` flag
    # is deprecated in recent scikit-learn releases.
    print(f'Train_rmse: {np.sqrt(mean_squared_error(y_train, y_train_pred))}')
    print(f'Train_R2: {r2_score(y_train, y_train_pred)}')
    print(f'Test_rmse: {np.sqrt(mean_squared_error(y_test, y_test_pred))}')
    print(f'Test_R2: {r2_score(y_test, y_test_pred)}')
    if fig:
        figAccScatter(y_test,y_test_pred,unit=unit)

    return sklearn_model

#传统算法GBRT直接训练
def GBRT_DL_byLOO(X,y,params,unit,fig=True):
    '''
    Evaluate a gradient-boosting regressor with leave-one-out validation.

    Parameters
    ----------
    X : feature table; rows are selected with ``.iloc``.
    y : targets, indexed positionally in step with X.
    params : keyword arguments for GradientBoostingRegressor.
    unit : unit string inserted in the scatter-plot axis labels.
    fig : when true, draw the pooled true-vs-predicted scatter plot.

    Returns
    -------
    [train_rmse, train_R2, test_rmse, test_R2]. The training scores are the
    mean over all splits; the test scores are computed once on the pooled
    left-out predictions.
    '''
    loo = LeaveOneOut()
    gbr = GBR(**params)
    # Index targets by position so they stay aligned with X.iloc even when y
    # is a pandas Series with a non-default index.
    y_values = np.asarray(y)

    y_test_pred_list,y_test_true_list = [],[]
    y_train_rmse_list,y_train_r2_list = [],[]
    for train_i, test_i in loo.split(X):
        x_train = X.iloc[train_i,:]
        y_train = y_values[train_i]
        x_test = X.iloc[test_i,:]
        y_test = y_values[test_i]

        # The same estimator object is re-fitted on every split.
        gbr.fit(x_train, y_train)
        y_test_pred = gbr.predict(x_test)
        y_train_pred = gbr.predict(x_train)
        y_test_pred_list.append(y_test_pred)
        y_test_true_list.append(y_test)

        # Metrics take (y_true, y_pred); R2 is not symmetric.
        y_train_rmse_list.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        y_train_r2_list.append(r2_score(y_train, y_train_pred))

    # Pool the left-out predictions before scoring the test side.
    y_test_pred_list = list(itertools.chain.from_iterable(y_test_pred_list))
    y_test_true_list = list(itertools.chain.from_iterable(y_test_true_list))
    test_rmse = np.sqrt(mean_squared_error(y_test_true_list, y_test_pred_list))
    test_R2 = r2_score(y_test_true_list, y_test_pred_list)
    train_rmse = np.mean(y_train_rmse_list)
    train_R2 = np.mean(y_train_r2_list)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test_true_list,y_test_pred_list,unit=unit)
    return [train_rmse,train_R2,test_rmse,test_R2]

def GBRT_DL_byCV(X,y,params,unit,fig=True,k=10):
    '''
    Evaluate a gradient-boosting regressor with shuffled k-fold cross-validation.

    Parameters
    ----------
    X : feature table; rows are selected with ``.iloc``.
    y : targets, indexed positionally in step with X.
    params : keyword arguments for GradientBoostingRegressor.
    unit : unit string inserted in the scatter-plot axis labels.
    fig : when true, draw the pooled true-vs-predicted scatter plot.
    k : number of folds.

    Returns
    -------
    [train_rmse, train_R2, test_rmse, test_R2]. The training scores are the
    mean over folds; the test scores are computed once on the pooled
    out-of-fold predictions.
    '''
    kfold = KFold(n_splits=k, shuffle=True, random_state=1234)
    gbr = GBR(**params)
    # Index targets by position so they stay aligned with X.iloc even when y
    # is a pandas Series with a non-default index.
    y_values = np.asarray(y)

    y_test_pred_list,y_test_true_list = [],[]
    y_train_rmse_list,y_train_r2_list = [],[]
    for train_i, test_i in kfold.split(X):
        x_train = X.iloc[train_i,:]
        y_train = y_values[train_i]
        x_test = X.iloc[test_i,:]

        # The same estimator object is re-fitted on every fold.
        gbr.fit(x_train, y_train)
        y_test_pred = gbr.predict(x_test)
        y_train_pred = gbr.predict(x_train)
        y_test_pred_list.append(y_test_pred)
        y_test_true_list.append(y_values[test_i])

        # Metrics take (y_true, y_pred); R2 is not symmetric.
        y_train_rmse_list.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        y_train_r2_list.append(r2_score(y_train, y_train_pred))

    # Pool the out-of-fold predictions before scoring the test side.
    y_test_pred_list = list(itertools.chain.from_iterable(y_test_pred_list))
    y_test_true_list = list(itertools.chain.from_iterable(y_test_true_list))
    test_rmse = np.sqrt(mean_squared_error(y_test_true_list, y_test_pred_list))
    test_R2 = r2_score(y_test_true_list, y_test_pred_list)
    train_rmse = np.mean(y_train_rmse_list)
    train_R2 = np.mean(y_train_r2_list)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test_true_list,y_test_pred_list,unit=unit)
    return [train_rmse,train_R2,test_rmse,test_R2]


def GBRT_DL_byTest(X_train,y_train,X_test,y_test,params,unit,fig=True):
    '''
    Fit a gradient-boosting regressor and score it on an independent test set.

    Parameters
    ----------
    X_train, X_test : training / test feature matrices.
    y_train, y_test : targets matching the two feature matrices.
    params : keyword arguments for GradientBoostingRegressor.
    unit : unit string inserted in the scatter-plot axis labels.
    fig : when true, draw the true-vs-predicted scatter for the test split.

    Returns
    -------
    [train_rmse, train_R2, test_rmse, test_R2].
    '''
    gbr = GBR(**params)
    gbr.fit(X_train,y_train)
    y_train_pred = gbr.predict(X_train)
    y_test_pred = gbr.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric, so the
    # order matters. RMSE is computed as sqrt(MSE) because the `squared` flag
    # is deprecated in recent scikit-learn releases.
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_R2 = r2_score(y_test, y_test_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_R2 = r2_score(y_train, y_train_pred)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test,y_test_pred,unit=unit)

    return [train_rmse,train_R2,test_rmse,test_R2]



def figAccScatter(y_test_true_list,y_test_pred_list,unit,s=70,color='#227C70'):
    '''
    Draw a scatter plot of predicted against reference values.

    The "boxy_light" aquarel theme is applied, the points are plotted with
    marker size `s` and colour `color`, and a dashed black y = x line is
    added across the range of the reference values. `unit` is inserted in
    both axis labels. The lengths of both inputs are printed before plotting.
    '''
    theme = load_theme("boxy_light")
    theme.apply()
    plt.rcParams['font.sans-serif']=['Times New Roman']
    plt.figure(figsize=[6,5],dpi=500)
    axes = plt.gca()

    # Print both sample counts so a length mismatch is visible.
    print(len(y_test_pred_list))
    print('--------------')
    print(len(y_test_true_list))

    plt.scatter(y_test_true_list,y_test_pred_list,s=s,color=color)
    plt.xticks(size=15)
    plt.yticks(size=15)

    plt.xlabel(f'DFT value ({unit})',fontdict={'size':20})
    plt.ylabel(f'ML predicted ({unit})',fontdict={'size':20})

    # Identity line sampled every 0.01 between the smallest and largest
    # reference value.
    diagonal = np.arange(np.min(y_test_true_list),np.max(y_test_true_list),0.01)
    plt.plot(diagonal,diagonal,'--',color='black',linewidth=1)

    # Give all four frame edges the same line width.
    frame_width = 1
    for side in ('bottom', 'left', 'top', 'right'):
        axes.spines[side].set_linewidth(frame_width)

    plt.show()
    theme.apply_transforms()
    
#绘图函数
def print_history(history):
    '''
    Plot the training and validation RMSE and loss curves against the epoch.

    `history` is the object returned by a Keras ``fit`` call; its ``history``
    mapping must contain the four series plotted below.
    '''
    plt.figure(figsize=[6,5],dpi=300)
    curves = (
        'root_mean_squared_error',
        'val_root_mean_squared_error',
        'loss',
        'val_loss',
    )
    # Plot in the same order as the legend entries below.
    for key in curves:
        plt.plot(history.history[key])
    plt.title('Model rmse&loss')
    plt.xlabel('Epoch')
    plt.legend(['Train_rmse', 'Val_rmse', 'Train_loss', 'Val_loss'])
    plt.show()

def build_model(x_train,
            hidden_layers=5,
            layer_0_size=100,
            layer_1_size=300,
            layer_2_size=300,
            layer_3_size=300,
             layer_4_size=100,
             layer_5_size=1024,
             layer_6_size=1024,
             layer_7_size=1024,
             layer_8_size=1024,
            learning_rate=0.00005,seed=123):
    '''
    Build and compile the fully connected regression network.

    The network is a Sequential stack of `hidden_layers` ReLU Dense layers
    (widths taken in order from layer_0_size ... layer_8_size) followed by a
    single-unit linear output. The input shape is read from
    ``x_train.shape[1:]``. TensorFlow's global random seed is set to `seed`
    before the layers are created. The model is compiled with MSE loss, an
    Adam optimizer at `learning_rate`, and an RMSE metric.
    '''
    widths = [layer_0_size, layer_1_size, layer_2_size,
              layer_3_size, layer_4_size, layer_5_size,
              layer_6_size, layer_7_size, layer_8_size]
    tf.random.set_seed(seed)

    net = keras.models.Sequential()
    # The first hidden layer also declares the input shape.
    net.add(keras.layers.Dense(widths[0], activation='relu',
                               input_shape=x_train.shape[1:]))
    # Remaining hidden layers use widths[1] .. widths[hidden_layers - 1].
    for idx in range(1, hidden_layers):
        net.add(keras.layers.Dense(widths[idx], activation='relu'))
    net.add(keras.layers.Dense(1))

    net.compile(loss='mse',
                optimizer=keras.optimizers.Adam(learning_rate),
                metrics=[keras.metrics.RootMeanSquaredError()])
    return net

In [None]:
#Ehull
import itertools
import numpy as np
import tensorflow as tf
from tensorflow import keras
from aquarel import load_theme
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model, load_model
from sklearn.ensemble import GradientBoostingClassifier as GBC

# 生成基模型
def getBaseModelClassifier(X_train,y_train,X_test,y_test,unit,epochs=120,learning_rate=0.00005,batch_size=32,k=10,fig=True):
    '''
    Train the base ANN classifier on a fixed train/test split and return it.

    Features are standardised with a StandardScaler fitted on the training
    features only. The test split is handed to Keras as validation data.

    NOTE(review): the printed scores are regression metrics (RMSE / R2)
    computed on the raw model outputs, although the model comes from
    build_modelClassifier. Confirm this is the intended report.

    Parameters
    ----------
    X_train, X_test : training / test feature matrices.
    y_train, y_test : targets matching the two feature matrices.
    unit : unit string inserted in the scatter-plot axis labels.
    epochs : number of training epochs.
    learning_rate : forwarded to build_modelClassifier.
    batch_size, k : kept for signature compatibility; not used here.
    fig : when true, draw the true-vs-predicted scatter for the test split.

    Returns
    -------
    The fitted model produced by build_modelClassifier.
    '''
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(X_train)
    x_test_scaled = scaler.transform(X_test)

    sklearn_model = build_modelClassifier(x_test_scaled,learning_rate=learning_rate)
    sklearn_model.summary()

    history = sklearn_model.fit(x_train_scaled, y_train,
                                epochs=epochs,
                                validation_data=(x_test_scaled, y_test),
                                verbose=0)
    # Plot the learning curves.
    print(history.params)
    print_historyClassifier(history)

    y_test_pred = sklearn_model.predict(x_test_scaled)
    y_train_pred = sklearn_model.predict(x_train_scaled)

    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric, so the
    # order matters. RMSE is computed as sqrt(MSE) because the `squared` flag
    # is deprecated in recent scikit-learn releases.
    print(f'Train_rmse: {np.sqrt(mean_squared_error(y_train, y_train_pred))}')
    print(f'Train_R2: {r2_score(y_train, y_train_pred)}')
    print(f'Test_rmse: {np.sqrt(mean_squared_error(y_test, y_test_pred))}')
    print(f'Test_R2: {r2_score(y_test, y_test_pred)}')
    print(f'Train Dataset: {len(x_train_scaled)}')
    if fig:
        figAccScatter(y_test,y_test_pred,unit=unit)

    return sklearn_model

def ANNClassifier_DL_byCV(X,y,unit,k=10,patience=80,learning_rate=0.00005,fig=True):
    '''
    Evaluate a freshly built ANN with shuffled k-fold cross-validation.

    For every fold a new model is built, the features are standardised with
    a scaler fitted on that fold's training rows, and the model is trained
    for up to 1000 epochs with early stopping on the held-out fold.

    NOTE(review): despite the "Classifier" name, each fold's model is created
    with build_model (MSE loss, RMSE metric) and scored with RMSE / R2, while
    the curves are drawn with print_historyClassifier. Confirm whether
    build_modelClassifier was intended here.

    Parameters
    ----------
    X : feature table; rows are selected with ``.iloc``.
    y : targets, indexed positionally in step with X.
    unit : unit string inserted in the scatter-plot axis labels.
    k : number of folds.
    patience : early-stopping patience in epochs.
    learning_rate : forwarded to build_model.
    fig : when true, draw the pooled true-vs-predicted scatter plot.

    Returns
    -------
    [train_rmse, train_R2, test_rmse, test_R2]. The training scores are the
    mean over folds; the test scores are computed once on the pooled
    out-of-fold predictions.
    '''
    kfold = KFold(n_splits=k, shuffle=True, random_state=1234)
    # Index targets by position so they stay aligned with X.iloc even when y
    # is a pandas Series with a non-default index.
    y_values = np.asarray(y)

    y_test_pred_list,y_test_true_list = [],[]
    y_train_rmse_list,y_train_r2_list = [],[]
    for count, (train_i, test_i) in enumerate(kfold.split(X), start=1):
        print(f'-------================== LOOP {count}/{k} ===================-------')
        x_train = X.iloc[train_i,:]
        y_train = y_values[train_i]
        x_test = X.iloc[test_i,:]
        y_test = y_values[test_i]
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        sklearn_model = build_model(x_test_scaled,learning_rate=learning_rate)
        sklearn_model.summary()
        callbacks = [keras.callbacks.EarlyStopping(patience=patience, min_delta=1e-2)]

        history = sklearn_model.fit(x_train_scaled, y_train,
                                    epochs=1000,
                                    validation_data=(x_test_scaled, y_test),
                                    callbacks=callbacks,
                                    verbose=0)
        # Plot the learning curves.
        print(history.params)
        print_historyClassifier(history)

        y_test_pred = sklearn_model.predict(x_test_scaled)
        y_train_pred = sklearn_model.predict(x_train_scaled)
        y_test_pred_list.append(y_test_pred)
        y_test_true_list.append(y_test)

        # Metrics take (y_true, y_pred); R2 is not symmetric.
        y_train_rmse_list.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        y_train_r2_list.append(r2_score(y_train, y_train_pred))

    # Pool the out-of-fold predictions before scoring the test side.
    y_test_pred_list = list(itertools.chain.from_iterable(y_test_pred_list))
    y_test_true_list = list(itertools.chain.from_iterable(y_test_true_list))
    test_rmse = np.sqrt(mean_squared_error(y_test_true_list, y_test_pred_list))
    test_R2 = r2_score(y_test_true_list, y_test_pred_list)
    train_rmse = np.mean(y_train_rmse_list)
    train_R2 = np.mean(y_train_r2_list)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test_true_list,y_test_pred_list,unit=unit)
    return [train_rmse,train_R2,test_rmse,test_R2]



def ANNClassifier_DL_byTest(X_train,y_train,X_test,y_test,unit,epochs=120,learning_rate=0.00005,batch_size=32,fig=True):
    '''
    Train an ANN classifier from scratch and score it on an independent test set.

    Features are standardised with a StandardScaler fitted on the training
    features only. The model trains for a fixed number of epochs (no early
    stopping), with the test split passed to Keras as validation data.

    Parameters
    ----------
    X_train, X_test : training / test feature matrices.
    y_train, y_test : targets matching the two feature matrices.
    unit : unit string inserted in the scatter-plot axis labels.
    epochs : number of training epochs.
    learning_rate : forwarded to build_modelClassifier.
    batch_size : kept for signature compatibility; not used here.
    fig : when true, draw the true-vs-predicted scatter for the test split.

    Returns
    -------
    (model, test_predictions): the fitted model and its raw ``predict``
    output on the scaled test features.
    '''
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(X_train)
    x_test_scaled = scaler.transform(X_test)

    sklearn_model = build_modelClassifier(x_test_scaled,learning_rate=learning_rate)
    sklearn_model.summary()

    history = sklearn_model.fit(x_train_scaled, y_train,
                                epochs=epochs,
                                validation_data=(x_test_scaled, y_test),
                                verbose=2)
    # Plot the learning curves.
    print(history.params)
    print_historyClassifier(history)

    # One prediction pass over the test set serves both the scatter plot and
    # the returned predictions.
    y_test_pred = sklearn_model.predict(x_test_scaled)

    # Unpacking into (loss, accuracy) assumes the model was compiled with
    # exactly one metric -- TODO confirm against build_modelClassifier.
    loss_test, accuracy_test = sklearn_model.evaluate(x_test_scaled, y_test)
    loss_train, accuracy_train = sklearn_model.evaluate(x_train_scaled, y_train)
    print(f'Train_accuracy: {accuracy_train}')
    print(f'Test_accuracy: {accuracy_test}')

    if fig:
        figAccScatter(y_test,y_test_pred,unit=unit)

    return sklearn_model,y_test_pred



# 迁移学习后观察交叉验证结果
def ANNClassifier_TL_byCV(X,y,importbaseModelPath,unit,patience=80,k=10,importbaseModel=None,learning_rate=0.00005,batch_size=32,fig=True):
    '''
    Evaluate transfer learning from a base ANN with shuffled k-fold cross-validation.

    In every fold the base network is obtained, its final layer is replaced by
    a new single-unit linear Dense layer attached to the second-to-last
    layer's output, and the whole network is re-compiled and trained for up
    to 1000 epochs with early stopping on the held-out fold.

    NOTE(review): despite the "Classifier" name, the file-based branch builds
    the network with build_model, the new head is linear, the loss is MSE and
    the scores are RMSE / R2, while the curves are drawn with
    print_historyClassifier. Confirm this is intended.

    Parameters
    ----------
    X : feature table; rows are selected with ``.iloc``.
    y : targets, indexed positionally in step with X.
    importbaseModelPath : weights file loaded into a fresh build_model
        network; only used when importbaseModel is None.
    unit : unit string inserted in the scatter-plot axis labels.
    patience : early-stopping patience in epochs.
    k : number of folds.
    importbaseModel : optional in-memory base model used instead of the file.
    learning_rate : learning rate of the Adam optimizer.
    batch_size : kept for signature compatibility; not used here.
    fig : when true, draw the pooled true-vs-predicted scatter plot.

    Returns
    -------
    [train_rmse, train_R2, test_rmse, test_R2]. The training scores are the
    mean over folds; the test scores are computed once on the pooled
    out-of-fold predictions.
    '''
    kfold = KFold(n_splits=k, shuffle=True, random_state=1234)
    # Index targets by position so they stay aligned with X.iloc even when y
    # is a pandas Series with a non-default index.
    y_values = np.asarray(y)

    y_test_pred_list,y_test_true_list = [],[]
    y_train_rmse_list,y_train_r2_list = [],[]
    for count, (train_i, test_i) in enumerate(kfold.split(X), start=1):
        x_train = X.iloc[train_i,:]
        y_train = y_values[train_i]
        x_test = X.iloc[test_i,:]
        y_test = y_values[test_i]
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        # Obtain a base network private to this fold.
        if importbaseModel is None:
            print(f'---------========== learning by import baseModel file {count}/{k}============-----------')
            base_model = build_model(x_test_scaled,learning_rate=learning_rate)
            base_model.load_weights(importbaseModelPath)
        else:
            print(f'---------========== learning by import baseModel variables(object) {count}/{k}===========------------')
            # Clone the supplied model: building directly on its layers would
            # share them across folds, so weights fine-tuned on one fold's
            # training rows would carry over into the next fold.
            base_model = keras.models.clone_model(importbaseModel)
            base_model.set_weights(importbaseModel.get_weights())

        # Swap the original output layer for a fresh linear head.
        new_head = Dense(1, activation=None)(base_model.layers[-2].output)
        sklearn_model = Model(inputs=base_model.input, outputs=[new_head])

        sklearn_model.summary()
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        sklearn_model.compile(loss='mse', optimizer=optimizer,
                              metrics=[keras.metrics.RootMeanSquaredError()])
        callbacks = [keras.callbacks.EarlyStopping(patience=patience, min_delta=1e-2)]

        history = sklearn_model.fit(x_train_scaled, y_train,
                                    epochs=1000,
                                    validation_data=(x_test_scaled, y_test),
                                    callbacks=callbacks,
                                    verbose=0)
        # Plot the learning curves.
        print(history.params)
        print_historyClassifier(history)

        y_test_pred = sklearn_model.predict(x_test_scaled)
        y_train_pred = sklearn_model.predict(x_train_scaled)
        y_test_pred_list.append(y_test_pred)
        y_test_true_list.append(y_test)

        # Metrics take (y_true, y_pred); R2 is not symmetric.
        y_train_rmse_list.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        y_train_r2_list.append(r2_score(y_train, y_train_pred))

    # Pool the out-of-fold predictions before scoring the test side.
    y_test_pred_list = list(itertools.chain.from_iterable(y_test_pred_list))
    y_test_true_list = list(itertools.chain.from_iterable(y_test_true_list))
    test_rmse = np.sqrt(mean_squared_error(y_test_true_list, y_test_pred_list))
    test_R2 = r2_score(y_test_true_list, y_test_pred_list)
    train_rmse = np.mean(y_train_rmse_list)
    train_R2 = np.mean(y_train_r2_list)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test_true_list,y_test_pred_list,unit=unit)
    return [train_rmse,train_R2,test_rmse,test_R2]




def ANNClassifier_TL_byTest(X_train,y_train,X_test,y_test,importbaseModelPath,unit,epochs=120,importbaseModel=None,learning_rate=0.00005,batch_size=32,fig=True):
    '''
    Fine-tune a transferred ANN as a binary classifier and score it on an
    independent test set.

    Features are standardised with a scaler fitted on ``X_train``. The base
    network is either rebuilt with ``build_model`` and loaded from
    ``importbaseModelPath`` (when ``importbaseModel`` is None) or taken from
    the in-memory ``importbaseModel``. A fresh single-unit sigmoid layer is
    attached to the output of the base network's second-to-last layer, and
    the resulting model is trained with binary cross-entropy.

    Parameters:
        X_train, y_train: training features and binary labels.
        X_test, y_test: test features and labels; also passed to ``fit`` as
            validation data.
        importbaseModelPath: weights file, used only when ``importbaseModel``
            is None.
        unit: forwarded to ``figAccScatter`` when ``fig`` is true.
        epochs: number of training epochs.
        importbaseModel: optional, already loaded Keras base model.
        learning_rate: learning rate of the Adam optimizer.
        batch_size: accepted for interface compatibility; it is not passed
            to ``fit``.
        fig: when true, call ``figAccScatter`` on the test predictions.

    Returns:
        (model, test_pred_res): the trained model and its predictions for
        the scaled test set.
    '''
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(X_train)
    x_test_scaled = scaler.transform(X_test)
    print(y_test)

    # Select the base network to transfer from.
    if importbaseModel is None:
        print(f'---------========== learning by import baseModel file ============-----------')
        base_model = build_model(x_test_scaled,learning_rate=learning_rate)
        base_model.load_weights(importbaseModelPath)
    else:
        print(f'---------========== learning by import baseModel variables(object) ===========------------')
        base_model = importbaseModel

    # The new head is a sigmoid in both branches: the model is compiled with
    # 'binary_crossentropy', which expects probabilities rather than the raw
    # linear outputs the file-based branch used to produce.
    new_output = Dense(1, activation='sigmoid')(base_model.layers[-2].output)
    sklearn_model = Model(inputs=base_model.input, outputs=[new_output])

    sklearn_model.summary()
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    sklearn_model.compile(loss='binary_crossentropy', optimizer=optimizer,metrics=['accuracy'])

    history = sklearn_model.fit(x_train_scaled, y_train,
                            epochs=epochs,
                            validation_data=(x_test_scaled, y_test),
                            verbose=2)

    print_historyClassifier(history)  # plot the learning curves

    # One prediction pass serves both the optional figure and the return value.
    test_pred_res = sklearn_model.predict(x_test_scaled)
    loss_test, accuracy_test = sklearn_model.evaluate(x_test_scaled, y_test)
    loss_train, accuracy_train = sklearn_model.evaluate(x_train_scaled, y_train)
    print(f'Train_accuracy: {accuracy_train}')
    print(f'Test_accuracy: {accuracy_test}')
    if fig:
        figAccScatter(y_test,test_pred_res,unit=unit)

    return sklearn_model,test_pred_res


def GBRTClassifier_DL_byCV(X,y,params,unit,fig=True,k=10):
    '''
    K-fold cross-validation of a gradient-boosting model.

    NOTE: despite "Classifier" in the name, this fits ``GBR`` and reports
    regression metrics (RMSE and R2).

    Parameters:
        X: feature table; rows are selected with ``X.iloc``.
        y: targets, indexed with the positional fold indices (``y[idx]``).
        params: keyword arguments forwarded to ``GBR``.
        unit: forwarded to ``figAccScatter`` when ``fig`` is true.
        fig: when true, draw the pooled true-vs-predicted scatter plot.
        k: number of folds (shuffled, random_state=1234).

    Returns:
        [train_rmse, train_R2, test_rmse, test_R2]. The training metrics are
        means over the folds; the test metrics are computed once on the
        pooled out-of-fold predictions.
    '''
    kfold = KFold(n_splits=k, shuffle=True, random_state=1234)
    gbr = GBR(**params)
    y_test_pred_list,y_test_true_list = [],[]
    y_train_rmse_list,y_train_r2_list = [],[]
    for train_i, test_i in kfold.split(X):
        X_fold_train, y_fold_train = X.iloc[train_i,:], y[train_i]
        gbr.fit(X_fold_train, y_fold_train)
        y_test_pred_list.append(gbr.predict(X.iloc[test_i,:]))
        y_test_true_list.append(y[test_i])

        y_train_pred = gbr.predict(X_fold_train)
        # np.sqrt(MSE) instead of squared=False, so this does not depend on
        # a keyword that newer scikit-learn releases no longer accept.
        y_train_rmse_list.append(np.sqrt(mean_squared_error(y_fold_train, y_train_pred)))
        # r2_score is not symmetric: the true values must come first.
        y_train_r2_list.append(r2_score(y_fold_train, y_train_pred))
    y_test_pred_list=list(itertools.chain(*y_test_pred_list))
    y_test_true_list=list(itertools.chain(*y_test_true_list))
    test_rmse = np.sqrt(mean_squared_error(y_test_true_list, y_test_pred_list))
    test_R2 = r2_score(y_test_true_list, y_test_pred_list)
    train_rmse = np.mean(y_train_rmse_list)
    train_R2 = np.mean(y_train_r2_list)
    print('Train rmse: ',train_rmse)
    print('Train R2: ',train_R2)
    print('Test rmse: ',test_rmse)
    print('Test R2: ',test_R2)
    if fig:
        figAccScatter(y_test_true_list,y_test_pred_list,unit=unit)
    return [train_rmse,train_R2,test_rmse,test_R2]


def GBRTClassifier_DL_byTest(X_train,y_train,X_test,y_test,params,unit,fig=True):
    '''
    Fit a ``GBC`` model on the training split and report its score on the
    training and test splits.

    Parameters:
        X_train, y_train: training features and labels.
        X_test, y_test: independent test features and labels.
        params: keyword arguments forwarded to ``GBC``.
        unit: forwarded to ``figAccScatter`` when ``fig`` is true.
        fig: when true, call ``figAccScatter`` on the test predictions.

    Returns:
        The fitted ``GBC`` estimator.
    '''
    gbc = GBC(**params)
    gbc.fit(X_train,y_train)

    # The values printed are whatever gbc.score returns for each split.
    accuracy_test = gbc.score(X_test, y_test)
    accuracy_train = gbc.score(X_train,y_train)
    print(f'Train_accuracy: {accuracy_train}')
    print(f'Test_accuracy: {accuracy_test}')
    if fig:
        # Test predictions are only needed for the figure.
        figAccScatter(y_test,gbc.predict(X_test),unit=unit)

    return gbc



def build_modelClassifier(x_train,
            hidden_layers=5,
            layer_0_size=100,
            layer_1_size=300,
            layer_2_size=300,
            layer_3_size=300,
             layer_4_size=100,
             layer_5_size=1024,
             layer_6_size=1024,
             layer_7_size=1024,
             layer_8_size=1024,
            learning_rate=0.00005,seed=123):
    '''
    Build and compile a fully connected binary classifier.

    The network is a Sequential stack of ``hidden_layers`` ReLU Dense layers
    whose widths are taken in order from ``layer_0_size`` ... ``layer_8_size``,
    followed by a single sigmoid unit. The input shape is ``x_train.shape[1:]``.
    TensorFlow's global seed is set to ``seed`` before the layers are created.
    The model is compiled with binary cross-entropy, Adam(learning_rate) and
    the accuracy metric.
    '''
    widths = (layer_0_size, layer_1_size, layer_2_size, layer_3_size,
              layer_4_size, layer_5_size, layer_6_size, layer_7_size,
              layer_8_size)
    tf.random.set_seed(seed)

    net = keras.models.Sequential()
    # First hidden layer also declares the input shape.
    net.add(keras.layers.Dense(widths[0], activation='relu',
                               input_shape=x_train.shape[1:]))
    # Remaining hidden layers use widths[1] .. widths[hidden_layers - 1].
    for width_index in range(1, hidden_layers):
        net.add(keras.layers.Dense(widths[width_index], activation='relu'))
    net.add(keras.layers.Dense(1,activation='sigmoid'))

    net.compile(loss='binary_crossentropy',
                optimizer=keras.optimizers.Adam(learning_rate),
                metrics=['accuracy'])
    return net

# Plotting helper
def print_historyClassifier(history):
    '''
    Plot the accuracy and loss curves (training and validation) stored in a
    Keras ``History`` object on one 6x5 inch, 300-dpi figure and show it.
    '''
    fig = plt.figure(figsize=[6,5],dpi=300)
    ax = plt.gca()
    # Curves are drawn in the same order as the legend entries below.
    for curve_key in ('accuracy', 'val_accuracy', 'loss', 'val_loss'):
        plt.plot(history.history[curve_key])
    plt.title('Model accuracy&loss')
    plt.xlabel('Epoch')
    plt.legend(['Train_accuracy', 'Val_accuracy', 'Train_loss', 'Val_loss'])
    plt.show()

In [None]:
# Train the base model on the Ef target. The training inputs are the
# concatenation of the D1 train and test splits; the test split alone is
# passed as the validation/test data.
BaseModel_B1_1 = getBaseModel(pd.concat([X_D1_train,X_D1_test],axis=0).reset_index(drop=True) ,pd.concat([y_Ef_D1_train,y_Ef_D1_test],axis=0).reset_index(drop=True) ,X_D1_test,y_Ef_D1_test,
                              unit='eV/atom',epochs=50,
                             learning_rate=0.00005
                             )

In [None]:
# Load a previously saved base model from disk; it is reused below as the
# source network for transfer learning.
base_model_B1 = keras.models.load_model('保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.0251_TLbest_20221231.h5')

In [None]:
#bandgap
## GBRT direct training, cross-validation
GBRT_DL_byCV(X_D1_train,
             y_Bg_D1_train,
             params = {'n_estimators': 200,'subsample': 0.9, 'max_depth':5,
            'max_features':5,'random_state':1},
             unit='eV',fig=True,k=5)
## GBRT direct training, independent test set
GBRT_DL_byTest(X_D1_train,y_Bg_D1_train,X_D1_test,y_Bg_D1_test,
             params = {'n_estimators': 200,'subsample': 0.9, 'max_depth':5,
            'max_features':5,'random_state':1},
             unit='eV',fig=True)
## ANN direct training, cross-validation
ANN_DL_byCV(X_D1_train,y_Bg_D1_train,unit='eV',learning_rate=0.0005,k=10,patience=80,fig=True)
## ANN direct training, independent test set
ANN_DL_byTest(X_D1_train,y_Bg_D1_train,X_D1_test,y_Bg_D1_test,unit='eV',epochs=120,learning_rate=0.0005,batch_size=32,fig=True)
#Transfer Learning
# The base model is passed as an object here, so the path argument is left empty.
ANN_TL_byCV(X_D1_train,y_Bg_D1_train,importbaseModelPath='',unit='eV',
            importbaseModel = base_model_B1,
            learning_rate=0.0003,)
model_Bg_D1_B1_1_TL = ANN_TL_byTest(X_D1_train,y_Bg_D1_train,X_D1_test,y_Bg_D1_test,importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.0251_TLbest_20221231.h5',
              unit='eV',epochs=100,
            importbaseModel=base_model_B1,
            learning_rate=0.0005,)
# NOTE(review): identical to the call above and rebinds the same variable --
# presumably a re-run of the same notebook cell; confirm it is still needed.
model_Bg_D1_B1_1_TL = ANN_TL_byTest(X_D1_train,y_Bg_D1_train,X_D1_test,y_Bg_D1_test,importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.0251_TLbest_20221231.h5',
              unit='eV',epochs=100,
            importbaseModel=base_model_B1,
            learning_rate=0.0005,)

In [None]:
#ehull
# All regression runs below keep only the samples whose ehull target is < 2.5
# and reset the index after filtering.
## GBRT direct training, cross-validation
GBRT_DL_byCV(X_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
             y_ehull_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
             params = {'n_estimators': 200,'subsample': 0.9, 'max_depth':5,
            'max_features':5,'random_state':1},
             unit='eV/atom',fig=True,k=10)
## GBRT direct training, independent test set
GBRT_DL_byTest(X_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
               y_ehull_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
               X_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
               y_ehull_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
                params = {'n_estimators': 200,'subsample': 0.9, 'max_depth':5,
               'max_features':5,'random_state':1},
                unit='eV/atom',fig=True)
## ANN direct training, cross-validation
ANN_DL_byCV(X_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
            y_ehull_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
            unit='eV/atom',learning_rate=0.0005,k=10,patience=80,fig=True)
## ANN direct training, independent test set
ANN_DL_byTest(X_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
            y_ehull_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
            X_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
            y_ehull_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),unit='eV/atom',epochs=120,
            learning_rate=0.0005,batch_size=32,fig=True)
## ANN direct training, independent test set (same call repeated)
ANN_DL_byTest(X_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
            y_ehull_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
            X_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
            y_ehull_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),unit='eV/atom',epochs=120,
            learning_rate=0.0005,batch_size=32,fig=True)
#TL
# Transfer learning: cross-validation first, then three independent-test-set
# runs that differ only in the learning rate (0.005, 0.0005, 0.005).
ANN_TL_byCV(X_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
            y_ehull_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
            importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.019_20221228.h5',unit='eV/atom',
            importbaseModel=base_model_B1,
            learning_rate=0.00005,)
model_ehull_D1_B1_1_TL = ANN_TL_byTest(X_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
                                       y_ehull_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
                                       X_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
                                       y_ehull_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
                                       importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.019_20221228.h5',
                                      unit='eV/atom',epochs=120,
                                    importbaseModel=base_model_B1,
                                    learning_rate=0.005,)
model_ehull_D1_B1_1_TL_2 = ANN_TL_byTest(X_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
                                       y_ehull_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
                                       X_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
                                       y_ehull_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
                                       importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.019_20221228.h5',
                                      unit='eV/atom',epochs=120,
                                    importbaseModel=base_model_B1,
                                    learning_rate=0.0005,)
model_ehull_D1_B1_1_TL_3 = ANN_TL_byTest(X_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
                                       y_ehull_D1_train[y_ehull_D1_train<2.5].reset_index(drop=True),
                                       X_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
                                       y_ehull_D1_test[y_ehull_D1_test<2.5].reset_index(drop=True),
                                       importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.019_20221228.h5',
                                      unit='eV/atom',epochs=120,
                                    importbaseModel=base_model_B1,
                                    learning_rate=0.005,)
#classification
# Base classifier: the Ef target is binarised with pd.cut, label 1 for the
# (-5, -2.0] bin and label 0 for the (-2.0, 10] bin.
base_modelClassifier_B1,test_pred_res_B1=ANNClassifier_DL_byTest(X_D1_train,
                        pd.cut(y_Ef_D1_train, bins=[-5, -2.0,10], labels=[1,0]),
                        X_D1_test,
                        pd.cut(y_Ef_D1_test, bins=[-5, -2.0,10], labels=[1,0]),
                        unit='eV/atom',epochs=120,
                        learning_rate=0.0005,batch_size=32,fig=False)
# GBRT classifier on the 'e_above_hull_class' column.
model_ehull_D1_GBRT_DL_1 = GBRTClassifier_DL_byTest(X_D1_train,
                df_D1_train['e_above_hull_class'],
                X_D1_test,
                df_D1_test['e_above_hull_class'],
                 params = {'n_estimators': 100,'subsample': 0.9, 'max_depth':5,
                   'max_features':4,'random_state':1},unit='eV/atom',fig=False)
# NOTE(review): identical to the call above and rebinds the same variable --
# presumably a re-run of the same notebook cell; confirm it is still needed.
model_ehull_D1_GBRT_DL_1 = GBRTClassifier_DL_byTest(X_D1_train,
                df_D1_train['e_above_hull_class'],
                X_D1_test,
                df_D1_test['e_above_hull_class'],
                 params = {'n_estimators': 100,'subsample': 0.9, 'max_depth':5,
                   'max_features':4,'random_state':1},unit='eV/atom',fig=False)
def classif_Fig(y, yp):
    '''
    Print and plot the confusion matrix of a set of predictions.

    y: true labels
    yp: predicted labels

    Side effect: sets matplotlib's global 'font.sans-serif' to Times New Roman.
    '''
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif']=['Times New Roman']
    fig = plt.figure(dpi=300)  # 300-dpi figure that will hold the matrix

    cm = confusion_matrix(y, yp)

    print('混淆矩阵为：\n',cm)

    # Draw into the figure created above: without fignum, matshow opens a
    # second figure and leaves the 300-dpi one blank.
    plt.matshow(cm, cmap=plt.cm.Blues, fignum=fig.number)
    plt.colorbar()

    # Write each count into its cell. Text coordinates are (column, row),
    # i.e. swapped relative to the matrix indices.
    for row in range(len(cm)):
        for col in range(len(cm)):
            plt.annotate(cm[row, col], xy=(col, row),verticalalignment='center',horizontalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()  # was `plt.show` without parentheses, which never rendered the figure
# Confusion matrix of the GBRT classifier on the D1 test split.
classif_Fig(df_D1_test['e_above_hull_class'], model_ehull_D1_GBRT_DL_1.predict(X_D1_test))
def classHot(test_pred_res):
    '''
    Threshold predicted scores into class labels.

    Each item of ``test_pred_res`` is indexed at position 0; a value >= 0.5
    maps to 1 and anything else to 0. Returns the labels as a list.
    '''
    return [1 if row[0] >= 0.5 else 0 for row in test_pred_res]

# Confusion matrices of the ANN classifiers.
# NOTE(review): test_pred_res and test_pred_res_TL_1 are only assigned by the
# two training calls further down in this block, so these lines cannot run
# top to bottom as written -- presumably the notebook cells were executed in
# a different order; confirm.
classif_Fig(df_D1_test['e_above_hull_class'], classHot(test_pred_res))
classif_Fig(pd.cut(y_Ef_D1_test, bins=[-5, -2.0,10], labels=[1,0]), classHot(test_pred_res_B1))
classif_Fig(df_D1_test['e_above_hull_class'], classHot(test_pred_res_TL_1))
# ANN classifier trained directly on 'e_above_hull_class'.
model_ehull_D1_ANN_DL_1,test_pred_res = ANNClassifier_DL_byTest(X_D1_train,
            df_D1_train['e_above_hull_class'],
            X_D1_test,
            df_D1_test['e_above_hull_class'],
            unit='eV/atom',epochs=100,
            learning_rate=0.005,batch_size=32,fig=False)
# Transfer-learning classifier; the base model is passed as an object
# (base_modelClassifier_B1) in addition to the weights path.
model_ehull_D1_ANN_TL_1,test_pred_res_TL_1=ANNClassifier_TL_byTest(X_D1_train,
            df_D1_train['e_above_hull_class'],
            X_D1_test,
            df_D1_test['e_above_hull_class'],
            importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.019_20221228.h5',
            importbaseModel=base_modelClassifier_B1,
            unit='eV/atom',epochs=120,
            learning_rate=0.005,batch_size=32,fig=False)

In [None]:
#bulk modulus
## GBRT direct training, cross-validation
GBRT_DL_byCV(X_D1_B,y_B_D1,
             params = {'n_estimators': 30,'subsample': 0.9, 'max_depth':4,
            'max_features':4,'random_state':1},
             unit='GPa',fig=True,k=10)
## GBRT direct training, leave-one-out
GBRT_DL_byLOO(X_D1_B,y_B_D1,
             params = {'n_estimators': 30,'subsample': 0.9, 'max_depth':4,
            'max_features':4,'random_state':1},
             unit='GPa',fig=True)
## GBRT direct training, independent test set
GBRT_DL_byTest(X_D1_B_train,y_B_D1_train,X_D1_B_test,y_B_D1_test,
               params = {'n_estimators': 30,'subsample': 0.9, 'max_depth':4,
                'max_features':4,'random_state':1},
               unit='GPa',fig=True)
## ANN direct training, cross-validation
ANN_DL_byCV(X_D1_B,y_B_D1,unit='GPa',k=10,patience=80,fig=True)
## ANN direct training, leave-one-out
ANN_DL_byLOO(X_D1_B,y_B_D1,unit='GPa',patience=80,fig=True)
## ANN direct training, independent test set
# NOTE(review): the next two calls use the *_G_* splits inside the
# bulk-modulus section -- confirm this is intended and not a copy-paste slip.
ANN_DL_byTest(X_D1_G_train,y_G_D1_train,X_D1_G_test,y_G_D1_test,unit='GPa',epochs=120,learning_rate=0.00005,batch_size=32,fig=True)
## ANN direct training, independent test set 20230108
ANN_DL_byTest(X_D1_G_train,y_G_D1_train,X_D1_G_test,y_G_D1_test,unit='GPa',
              epochs=120,learning_rate=0.0001,batch_size=32,fig=True)
##TL
## ANN transfer learning, cross-validation
ANN_TL_byCV(X_D1_B,y_B_D1,
            importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.0251_TLbest_20221231.h5',
            unit='GPa',
            learning_rate=0.00001,)
## ANN transfer learning, leave-one-out
ANN_TL_byLOO(X_D1_B,y_B_D1,
            importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.0251_TLbest_20221231.h5',
            unit='GPa',
            learning_rate=0.00001,)
## ANN transfer learning, independent test set 123 51 20230108
model_B_D1_B1_1_TL_5 = ANN_TL_byTest(X_D1_B_train,y_B_D1_train,X_D1_B_test,y_B_D1_test,importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.019_20221228.h5',
              unit='GPa',epochs=120,
#              importbaseModel=base_model_B1,
            learning_rate=0.00005,)

In [None]:
#shear modulus
## GBRT direct training, cross-validation
GBRT_DL_byCV(X_D1_G,y_G_D1,
             params = {'n_estimators': 30,'subsample': 0.9, 'max_depth':4,
            'max_features':4,'random_state':1},
             unit='GPa',fig=True,k=10)
## GBRT direct training, leave-one-out
GBRT_DL_byLOO(X_D1_G,y_G_D1,
             params = {'n_estimators': 30,'subsample': 0.9, 'max_depth':4,
            'max_features':4,'random_state':1},
             unit='GPa',fig=True)
## GBRT direct training, independent test set
GBRT_DL_byTest(X_D1_G_train,y_G_D1_train,X_D1_G_test,y_G_D1_test,
               params = {'n_estimators': 30,'subsample': 0.9, 'max_depth':4,
                'max_features':4,'random_state':1},
               unit='GPa',fig=True)
## ANN direct training, cross-validation
ANN_DL_byCV(X_D1_G,y_G_D1,unit='GPa',k=10,patience=80,fig=True)
## ANN direct training, leave-one-out
ANN_DL_byLOO(X_D1_G,y_G_D1,unit='GPa',patience=80,fig=True)
## ANN direct training, independent test set
ANN_DL_byTest(X_D1_G_train,y_G_D1_train,X_D1_G_test,y_G_D1_test,unit='GPa',epochs=120,learning_rate=0.00005,batch_size=32,fig=True)
import os
def set_seeds(seed=12):
    '''
    Seed TensorFlow's global random generator and request deterministic ops.

    seed: value passed to ``tf.random.set_seed`` (default 12). It used to be
        ignored in favour of a hard-coded 12; the default keeps the old
        behaviour for existing callers.
    '''
    tf.random.set_seed(seed)
    # Environment flag asking TensorFlow for deterministic op implementations.
    # NOTE(review): TensorFlow is already imported when this runs -- confirm
    # the flag is still honoured when set this late.
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Fix the TensorFlow seed before the transfer-learning runs.
set_seeds()
# Transfer learning, cross-validation.
ANN_TL_byCV(X_D1_G,y_G_D1,
            importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.0251_TLbest_20221231.h5',
            unit='GPa',
            importbaseModel=base_model_B1,
            learning_rate=0.00005,)
# Transfer learning, leave-one-out.
ANN_TL_byLOO(X_D1_G,y_G_D1,
            importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.0251_TLbest_20221231.h5',
            unit='GPa',
            importbaseModel=base_model_B1,
            learning_rate=0.00005,)
##continuous Transfer
# The source weights here come from a saved bulk-modulus transfer model
# (model_B_D1_B1_1_TL.h5) rather than from the original base model.
model_G_D1_B1_1_TL = ANN_TL_byTest(X_D1_G_train,y_G_D1_train,X_D1_G_test,y_G_D1_test,importbaseModelPath='保存模型/model_B_D1_B1_1_TL.h5',
              unit='GPa',epochs=30,
#             importbaseModel=model_B_D1_B1_1_TL_3,
            learning_rate=0.0005,)
# Persist the fine-tuned shear-modulus model.
model_G_D1_B1_1_TL.save('保存模型/model_G_D1_B1_1_TL_20230310.h5')
# Two ANN_TL_byTest calls were interleaved here (the second one had been
# pasted inside the argument list of the first), which is a syntax error.
# They are restored as two separate statements with their original arguments.

# Run 1: continuous transfer from the saved bulk-modulus model.
model_G_D1_B1_1_TL_2 = ANN_TL_byTest(X_D1_G_train,y_G_D1_train,X_D1_G_test,y_G_D1_test,importbaseModelPath='保存模型/model_B_D1_B1_1_TL.h5',
              unit='GPa',epochs=30,
#             importbaseModel=model_B_D1_B1_1_TL_3,
            learning_rate=0.0005,)
# Run 2: transfer from base_model_B1, trained on the concatenated D1 and D5
# training splits and evaluated on the D1 test split.
model_G_D1_B1_1_TL = ANN_TL_byTest(pd.concat([X_D1_G_train,X_D5_G_train]).reset_index(drop=True),
                                   pd.concat([y_G_D1_train,y_G_D5_train]).reset_index(drop=True),
                                   X_D1_G_test,
                                   y_G_D1_test,
                                   importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.025_20221228.h5',
                                  unit='GPa',epochs=100,
                                importbaseModel=base_model_B1,
                                learning_rate=0.00005,)
# Transfer learning for the shear modulus starting from an in-memory
# bulk-modulus transfer model (model_B_D1_B1_1_TL_6), passed as an object.
model_G_D1_B1_1_TL_5 = ANN_TL_byTest(X_D1_G_train,y_G_D1_train,X_D1_G_test,y_G_D1_test,importbaseModelPath='保存模型/halide_double_perovskite_Ef_132magpei_5layers_0.025_20221228.h5',
              unit='GPa',epochs=120,
            importbaseModel=model_B_D1_B1_1_TL_6,
            learning_rate=0.00005,)

In [None]:
#screening
# Load the candidate table and drop the 'Unnamed: 0' column.
df_predictSet = pd.read_excel('表数据/df_predictSet.xlsx')
df_predictSet = df_predictSet.drop(columns='Unnamed: 0')
df_predictSet
# Apply the tolerance-factor screen to the candidates.
df_D1_predict_t = screen_by_ToleranceFactor(df_predictSet.reset_index(drop=True))
df_D1_predict_t
def drop_TrainData_from_predictData(df,df_1):
    '''
    Return the rows of ``df_1`` whose 'formula' does not occur in ``df``.

    A formula counts as known when it equals an entry of
    ``df['pretty_formula']``, or such an entry with its second and third
    element groups swapped (only formed for formulas that split into exactly
    four element groups).

    df: table with a 'pretty_formula' column.
    df_1: table with a 'formula' column.
    '''
    import re
    # Element symbol followed by an optional count; compiled once.
    element_pattern = re.compile(r"([A-Z]{1}[a-z]*)(\d*)")
    # A set gives O(1) membership tests instead of rebuilding and scanning a
    # list for every candidate row.
    known_formulas = set(df['pretty_formula'])
    for formu in df['pretty_formula']:
        ele_list = element_pattern.findall(formu)
        if len(ele_list)==4:
            first, second, third, fourth = ele_list
            # Same formula with the two middle element groups exchanged.
            known_formulas.add(''.join(first + third + second + fourth))
    index_list = [index for index, formula in df_1['formula'].items()
                  if formula not in known_formulas]
    return df_1.loc[index_list,:]
# Remove candidates whose formula already occurs in df_D1, then display.
df_D1_predict_t_del=drop_TrainData_from_predictData(df_D1,df_D1_predict_t)
df_D1_predict_t_del
def predict_B_by_ANN(x_train,x_test,ANN_model):
    '''
    Standardise ``x_test`` with a StandardScaler fitted on ``x_train`` and
    return ``ANN_model.predict`` of the result as a DataFrame with the single
    column 'B_predict(GPa)'.
    '''
    feature_scaler = StandardScaler().fit(x_train)
    predictions = ANN_model.predict(feature_scaler.transform(x_test))
    return pd.DataFrame(predictions, columns=['B_predict(GPa)'])
# Spot check: B prediction for the rows whose formula is 'Cs2LiVCl6',
# using the feature columns between the two 'MagpieData ...' labels.
predict_B_by_ANN(X_D1_B_train,df_predictSet[df_predictSet['formula']=='Cs2LiVCl6'].loc[:,'MagpieData minimum Number':'MagpieData mode SpaceGroupNumber'],
                 model_B_D1_B1_1_TL_2 )
def predict_G_by_ANN(x_train,x_test,ANN_model):
    '''
    Standardise ``x_test`` with a StandardScaler fitted on ``x_train`` and
    return ``ANN_model.predict`` of the result as a DataFrame with the single
    column 'G_predict(GPa)'.
    '''
    feature_scaler = StandardScaler().fit(x_train)
    predictions = ANN_model.predict(feature_scaler.transform(x_test))
    return pd.DataFrame(predictions, columns=['G_predict(GPa)'])
# Spot check: G prediction for the rows whose formula is 'Cs2LiVCl6'.
predict_G_by_ANN(X_D1_G_train,df_predictSet[df_predictSet['formula']=='Cs2LiVCl6'].loc[:,'MagpieData minimum Number':'MagpieData mode SpaceGroupNumber'],
                 model_G_D1_B1_1_TL)
def predict_B_by_ANN(x_train,x_test,ANN_model):
    '''
    Standardise ``x_test`` with a StandardScaler fitted on ``x_train`` and
    return ``ANN_model.predict`` of the result as a DataFrame with the single
    column 'B_predict(GPa)'.
    '''
    feature_scaler = StandardScaler().fit(x_train)
    predictions = ANN_model.predict(feature_scaler.transform(x_test))
    return pd.DataFrame(predictions, columns=['B_predict(GPa)'])
# Append the B predictions (from a model reloaded from disk) as a new column.
# NOTE(review): the scaler for this B model is fitted on X_D1_G_train, not
# X_D1_B_train -- confirm this is intended.
df_D1_predict_B=pd.concat([ df_predictSet,predict_B_by_ANN(X_D1_G_train,df_predictSet.loc[:,'MagpieData minimum Number':'MagpieData mode SpaceGroupNumber'],
                 keras.models.load_model('保存模型/model_B_D1_B1_1_TL_diff=7_R2=0.94_20230310.h5') )],axis=1)
df_D1_predict_B
def predict_G_by_ANN(x_train,x_test,ANN_model):
    '''
    Standardise ``x_test`` with a StandardScaler fitted on ``x_train`` and
    return ``ANN_model.predict`` of the result as a DataFrame with the single
    column 'G_predict(GPa)'.
    '''
    feature_scaler = StandardScaler().fit(x_train)
    predictions = ANN_model.predict(feature_scaler.transform(x_test))
    return pd.DataFrame(predictions, columns=['G_predict(GPa)'])
# Append the G predictions (from a model reloaded from disk) next to the
# B predictions, display the table and export it to Excel.
df_D1_predict_B_G=pd.concat([ df_D1_predict_B,predict_G_by_ANN(X_D1_G_train,df_D1_predict_B.loc[:,'MagpieData minimum Number':'MagpieData mode SpaceGroupNumber'],
                 keras.models.load_model('保存模型/model_G_D1_B1_1_TL_20230310.h5') )],axis=1)
df_D1_predict_B_G
df_D1_predict_B_G.to_excel('表数据/18040体模量剪切模量预测值20230311.xlsx',index=False)
def drop_TrainData_from_predictData(df,df_1):
    '''
    Return the rows of ``df_1`` whose 'formula' does not occur in ``df``.

    A formula counts as known when it equals an entry of
    ``df['pretty_formula']``, or such an entry with its second and third
    element groups swapped (only formed for formulas that split into exactly
    four element groups).

    df: table with a 'pretty_formula' column.
    df_1: table with a 'formula' column.
    '''
    import re
    # Element symbol followed by an optional count; compiled once.
    element_pattern = re.compile(r"([A-Z]{1}[a-z]*)(\d*)")
    # A set gives O(1) membership tests instead of rebuilding and scanning a
    # list for every candidate row.
    known_formulas = set(df['pretty_formula'])
    for formu in df['pretty_formula']:
        ele_list = element_pattern.findall(formu)
        if len(ele_list)==4:
            first, second, third, fourth = ele_list
            # Same formula with the two middle element groups exchanged.
            known_formulas.add(''.join(first + third + second + fourth))
    index_list = [index for index, formula in df_1['formula'].items()
                  if formula not in known_formulas]
    return df_1.loc[index_list,:]
# Drop candidates already present in df_D1, apply the tolerance-factor
# screen to the remainder, and export the result to Excel.
df_D1_predict_B_G_del=drop_TrainData_from_predictData(df_D1,df_D1_predict_B_G)
df_D1_predict_B_G_del
df_D1_predict_B_G_t_del = screen_by_ToleranceFactor(df_D1_predict_B_G_del.reset_index(drop=True))
df_D1_predict_B_G_t_del
df_D1_predict_B_G_t_del.to_excel('表数据/16529体模量剪切模量预测值新t20230313.xlsx',index=False)


In [None]:
#ehull
## classification prediction
def predict_ehull_by_GBRT(x_test,gbr_model):
    '''
    Return ``gbr_model.predict(x_test)`` as a DataFrame with the single
    column 'Ehull_predict(eV/atom)'.
    '''
    predicted = gbr_model.predict(x_test)
    return pd.DataFrame(predicted, columns=['Ehull_predict(eV/atom)'])
# Append the GBRT classifier's output to the candidate table.
# The features are passed unscaled.
df_D1_predict_ehull = pd.concat([df_predictSet,predict_ehull_by_GBRT(df_predictSet.loc[:,'MagpieData minimum Number':'MagpieData mode SpaceGroupNumber'],
                     model_ehull_D1_GBRT_DL_1,)],axis=1)
df_D1_predict_ehull
def predict_Bg_by_ANN(x_train,x_test,ANN_model):
    '''
    Standardise ``x_test`` with a StandardScaler fitted on ``x_train`` and
    return ``ANN_model.predict`` of the result as a DataFrame with the single
    column 'Bg_predict(eV)'.
    '''
    feature_scaler = StandardScaler().fit(x_train)
    predictions = ANN_model.predict(feature_scaler.transform(x_test))
    return pd.DataFrame(predictions, columns=['Bg_predict(eV)'])
# Append the band-gap predictions, then apply the tolerance-factor screen.
df_D1_predict_ehull_Eg = pd.concat([df_D1_predict_ehull,predict_Bg_by_ANN(X_D1_train,
                        df_predictSet.loc[:,'MagpieData minimum Number':'MagpieData mode SpaceGroupNumber'],
                         model_Bg_D1_B1_1_TL,
                    )],axis=1)
df_D1_predict_ehull_Eg
df_D1_predict_ehull_Eg_t = screen_by_ToleranceFactor(df_D1_predict_ehull_Eg.reset_index(drop=True))
df_D1_predict_ehull_Eg_t
def predict_B_by_ANN(x_train,x_test,ANN_model):
    '''
    Standardise ``x_test`` with a StandardScaler fitted on ``x_train`` and
    return ``ANN_model.predict`` of the result as a DataFrame with the single
    column 'B_predict(GPa)'.
    '''
    feature_scaler = StandardScaler().fit(x_train)
    predictions = ANN_model.predict(feature_scaler.transform(x_test))
    return pd.DataFrame(predictions, columns=['B_predict(GPa)'])
# Append the B predictions to the screened table.
df_D1_predict_ehull_Eg_t_B=pd.concat([ df_D1_predict_ehull_Eg_t,predict_B_by_ANN(X_D1_B_train,df_D1_predict_ehull_Eg_t.loc[:,'MagpieData minimum Number':'MagpieData mode SpaceGroupNumber'],
                 model_B_D1_B1_1_TL_6 )],axis=1)
df_D1_predict_ehull_Eg_t_B
def predict_G_by_ANN(x_train,x_test,ANN_model):
    '''
    Standardise ``x_test`` with a StandardScaler fitted on ``x_train`` and
    return ``ANN_model.predict`` of the result as a DataFrame with the single
    column 'G_predict(GPa)'.
    '''
    feature_scaler = StandardScaler().fit(x_train)
    predictions = ANN_model.predict(feature_scaler.transform(x_test))
    return pd.DataFrame(predictions, columns=['G_predict(GPa)'])
# Append the G predictions to the screened table.
df_D1_predict_ehull_Eg_t_B_G=pd.concat([ df_D1_predict_ehull_Eg_t_B,predict_G_by_ANN(X_D1_G_train,df_D1_predict_ehull_Eg_t_B.loc[:,'MagpieData minimum Number':'MagpieData mode SpaceGroupNumber'],
                 model_G_D1_B1_1_TL_4 )],axis=1)
df_D1_predict_ehull_Eg_t_B_G
def drop_TrainData_from_predictData(df,df_1):
    '''
    Return the rows of ``df_1`` whose 'formula' does not occur in ``df``.

    A formula counts as known when it equals an entry of
    ``df['pretty_formula']``, or such an entry with its second and third
    element groups swapped (only formed for formulas that split into exactly
    four element groups).

    df: table with a 'pretty_formula' column.
    df_1: table with a 'formula' column.
    '''
    import re
    # Element symbol followed by an optional count; compiled once.
    element_pattern = re.compile(r"([A-Z]{1}[a-z]*)(\d*)")
    # A set gives O(1) membership tests instead of rebuilding and scanning a
    # list for every candidate row.
    known_formulas = set(df['pretty_formula'])
    for formu in df['pretty_formula']:
        ele_list = element_pattern.findall(formu)
        if len(ele_list)==4:
            first, second, third, fourth = ele_list
            # Same formula with the two middle element groups exchanged.
            known_formulas.add(''.join(first + third + second + fourth))
    index_list = [index for index, formula in df_1['formula'].items()
                  if formula not in known_formulas]
    return df_1.loc[index_list,:]
# Drop candidates already present in df_D1.
df_D1_predict_ehull_Eg_t_B_G_del=drop_TrainData_from_predictData(df_D1,df_D1_predict_ehull_Eg_t_B_G)
df_D1_predict_ehull_Eg_t_B_G_del
# Screen 1: predicted Ehull class == 1, rescaled band gap (1.21*Bg + 0.36)
# within [1, 1.7], 0 < new_t < 4.18, and G/B < 0.2.
# NOTE(review): the 1.21 / 0.36 coefficients are presumably a band-gap
# correction -- their origin is not shown here; confirm.
df_D1_predict_ehull_Eg_t_B_G_del[(df_D1_predict_ehull_Eg_t_B_G_del['Ehull_predict(eV/atom)']==1) & 
                         (df_D1_predict_ehull_Eg_t_B_G_del['Bg_predict(eV)']*1.21+0.36>=1) &
                         (df_D1_predict_ehull_Eg_t_B_G_del['Bg_predict(eV)']*1.21+0.36<=1.7) &
                        (df_D1_predict_ehull_Eg_t_B_G_del['new_t']<4.18) &
                         (df_D1_predict_ehull_Eg_t_B_G_del['new_t']>0) 
                         & (df_D1_predict_ehull_Eg_t_B_G_del['G_predict(GPa)']/df_D1_predict_ehull_Eg_t_B_G_del['B_predict(GPa)']<0.2)
                         
                        ]
# Screen 2: same as screen 1 but with the G/B threshold relaxed to 0.3.
df_D1_predict_ehull_Eg_t_B_G_del[(df_D1_predict_ehull_Eg_t_B_G_del['Ehull_predict(eV/atom)']==1) & 
                         (df_D1_predict_ehull_Eg_t_B_G_del['Bg_predict(eV)']*1.21+0.36>=1) &
                         (df_D1_predict_ehull_Eg_t_B_G_del['Bg_predict(eV)']*1.21+0.36<=1.7) &
                        (df_D1_predict_ehull_Eg_t_B_G_del['new_t']<4.18) &
                         (df_D1_predict_ehull_Eg_t_B_G_del['new_t']>0) 
                         & ((df_D1_predict_ehull_Eg_t_B_G_del['G_predict(GPa)']/df_D1_predict_ehull_Eg_t_B_G_del['B_predict(GPa)'])<0.3)
                         
                        ]
# Screen 3: same thresholds as screen 1, but filtering on column 't' instead
# of 'new_t', and showing only the formula and predicted band gap.
df_D1_predict_ehull_Eg_t_B_G_del[(df_D1_predict_ehull_Eg_t_B_G_del['Ehull_predict(eV/atom)']==1) & 
                         (df_D1_predict_ehull_Eg_t_B_G_del['Bg_predict(eV)']*1.21+0.36>=1) &
                         (df_D1_predict_ehull_Eg_t_B_G_del['Bg_predict(eV)']*1.21+0.36<=1.7) &
                        (df_D1_predict_ehull_Eg_t_B_G_del['t']<4.18) &
                         (df_D1_predict_ehull_Eg_t_B_G_del['t']>0) 
                         & (df_D1_predict_ehull_Eg_t_B_G_del['G_predict(GPa)']/df_D1_predict_ehull_Eg_t_B_G_del['B_predict(GPa)']<0.2)
                         
                        ].loc[:,['formula','Bg_predict(eV)']]
# Look up the predicted band gap of a single formula.
df_D1_predict_ehull_Eg_t_B_G[df_D1_predict_ehull_Eg_t_B_G['formula']=='Cs2AgVBr6'].loc[:,['formula','Bg_predict(eV)']]
# Export the tolerance-factor-screened table (the index is written too).
df_D1_predict_ehull_Eg_t.to_excel('表数据/卤素双钙ehull带隙结构因子筛选结果16529条20230108.xlsx')
# Candidates whose predicted Ehull class is 1.
df_D1_predict_ehull[df_D1_predict_ehull['Ehull_predict(eV/atom)']==1]

In [None]:
#bandgap
## regression prediction
def predict_Bg_by_ANN(x_train,x_test,ANN_model):
    '''
    Standardise ``x_test`` with a StandardScaler fitted on ``x_train`` and
    return ``ANN_model.predict`` of the result as a DataFrame with the single
    column 'Bg_predict(eV)'.
    '''
    feature_scaler = StandardScaler().fit(x_train)
    predictions = ANN_model.predict(feature_scaler.transform(x_test))
    return pd.DataFrame(predictions, columns=['Bg_predict(eV)'])
# Append the band-gap predictions to the candidate table.
df_D1_predict_Eg = pd.concat([df_predictSet,predict_Bg_by_ANN(X_D1_train,
                        df_predictSet.loc[:,'MagpieData minimum Number':'MagpieData mode SpaceGroupNumber'],
                         model_Bg_D1_B1_1_TL,
                    )],axis=1)
# Chained boolean filters: keep rows whose predicted Ehull class is 1 (mask
# taken from df_D1_predict_ehull), then Bg_predict < 2.5, then Bg_predict > 0.1.
df_D1_screen_ehull_to_Bg = df_D1_predict_Eg[df_D1_predict_ehull['Ehull_predict(eV/atom)']==1][df_D1_predict_Eg[df_D1_predict_ehull['Ehull_predict(eV/atom)']==1]['Bg_predict(eV)']<2.5][df_D1_predict_Eg[df_D1_predict_ehull['Ehull_predict(eV/atom)']==1][df_D1_predict_Eg[df_D1_predict_ehull['Ehull_predict(eV/atom)']==1]['Bg_predict(eV)']<2.5]['Bg_predict(eV)']>0.1]
df_D1_screen_ehull_to_Bg
def drop_TrainData_from_predictData():
    '''
    Return the rows of the global ``df_D1_screen_ehull_to_Bg`` whose
    'formula' does not appear in the global ``df_D1['pretty_formula']``.
    '''
    # Build the lookup once; the list was previously rebuilt and scanned for
    # every candidate row.
    known_formulas = set(df_D1['pretty_formula'])
    index_list = [index for index, formula in df_D1_screen_ehull_to_Bg['formula'].items()
                  if formula not in known_formulas]
    return df_D1_screen_ehull_to_Bg.loc[index_list,:]
                       
# Display the filtered candidates, then keep them in a variable.
drop_TrainData_from_predictData()    
df_drop_TrainData_from_predictData = drop_TrainData_from_predictData()

In [None]:
#structure
def screen_by_ToleranceFactor(df):
    """Append ionic-radius based structure descriptors to a table of formulas.

    Every entry of ``df['formula']`` is split into element tokens; the first,
    second and third tokens are used as the A, B and X sites. Their ionic
    radii are read from the local element-property CSV and combined into the
    tolerance factor ``t``, the modified tolerance factor ``new_t`` and the
    octahedral factor ``u``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'formula'. The descriptor columns are
        built on a fresh 0..n-1 index and joined with ``pd.concat(axis=1)``,
        so ``df`` needs a default integer index for rows to line up.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the columns 'new_t', 't', 'u', 'IRB-X', 'IRA-X',
        'IRA-B', 'IR_X', 'IR_B' and 'IR_A'.

    Raises
    ------
    IndexError
        If a formula yields fewer than three element tokens.
    KeyError
        If a symbol is missing from the element-property table.

    Notes
    -----
    Side effects: reads the CSV below, prints the parsed tokens of every
    formula, prints "成功！" and calls ``display`` on the radius table and on
    the final frame.

    NOTE(review): the X site is the *third* token. For a four-element formula
    such as 'Cs2AgSbBr6' that is 'Sb', not the halide, so t / u / new_t are
    then computed from the second metal's radius. Left unchanged here because
    fixing it alters the numerical results; confirm the intended site mapping.
    """
    import re

    import pandas as pd

    radii_csv = r"表数据/Supplementary Data 1-Elemental Properties of Atoms.csv"
    # Tokeniser used to enumerate the distinct symbols of the whole table.
    symbol_pattern = re.compile(r"([A-Z][a-z])|([A-Z])|(\(.*\))")
    # Tokeniser used per formula: (symbol, optional atom count).
    element_and_count_pattern = re.compile(r"([A-Z]{1}[a-z]*)(\d*)")

    def collect_symbols(formulas):
        """Return the distinct symbol tokens of ``formulas`` in sorted order."""
        found = {
            match.group(0)
            for formula in formulas
            for match in symbol_pattern.finditer(formula)
        }
        # Sorted so the displayed radius table is reproducible between runs.
        return sorted(found)

    def lookup_ionic_radii(symbols):
        """Return a frame with columns 'element' and 'IR' for ``symbols``."""
        properties = pd.read_csv(radii_csv)
        radius_by_symbol = properties.set_index("symbol")["ionic_radius"]
        table = pd.DataFrame(symbols, columns=['element'])
        table['IR'] = [radius_by_symbol.loc[symbol] for symbol in symbols]
        display(table)
        print("成功！")
        return table

    def radius_features(frame, radii_table):
        """Return ``frame`` with the ionic-radius descriptor columns appended."""
        radius_of = radii_table.set_index('element')['IR']

        a_site, b_site, x_site = [], [], []
        for formula in frame['formula']:
            parsed = element_and_count_pattern.findall(formula)
            print(parsed)
            # Atom counts (second tuple entry) are deliberately ignored.
            a_site.append(parsed[0][0])
            b_site.append(parsed[1][0])
            x_site.append(parsed[2][0])

        # Positional 0..n-1 index so the three radius series align row by row.
        A_feat = radius_of.loc[a_site].reset_index(drop=True)
        B_feat = radius_of.loc[b_site].reset_index(drop=True)
        X_feat = radius_of.loc[x_site].reset_index(drop=True)

        ratio_a_b = A_feat / B_feat
        # Modified tolerance factor: rX/rB + (rA/rB)/ln(rA/rB) - 1
        new_t = (X_feat / B_feat) + ratio_a_b / (ratio_a_b.apply(np.log)) - 1
        # Tolerance factor: t = (rA + rX) / (sqrt(2) * (rB + rX))
        t = (A_feat + X_feat) / ((2 ** (1 / 2)) * (B_feat + X_feat))
        # Octahedral factor: u = rB / rX
        u = B_feat / X_feat

        key = 'IR'
        features = pd.DataFrame({
            'new_t': new_t,
            't': t,
            'u': u,
            key + 'B-X': B_feat - X_feat,
            key + 'A-X': A_feat - X_feat,
            key + 'A-B': A_feat - B_feat,
            key + '_X': X_feat,
            key + '_B': B_feat,
            key + '_A': A_feat,
        })
        combined = pd.concat([frame, features], axis=1)
        display(combined)
        return combined

    radii_table = lookup_ionic_radii(collect_symbols(df['formula']))
    return radius_features(df, radii_table)

# df_screen_by_ToleranceFactor = screen_by_ToleranceFactor(df_drop_TrainData_from_predictData.reset_index(drop=True))
# Rows whose 't' value lies strictly between 0 and 4.18.
# NOTE(review): 4.18 resembles the cutoff usually quoted for the modified
# tolerance factor (column 'new_t') rather than for 't' - confirm the column.
df_screen_by_ToleranceFactor.loc[lambda frame: (frame['t'] > 0) & (frame['t'] < 4.18)]
# Linearly rescaled predicted band gap (x1.21 + 0.36) of the same rows.
# TODO: the original statement ended in a dangling `<` with no right-hand
# operand (a syntax error); only the complete left-hand side is kept, so the
# intended threshold comparison still has to be restored.
df_screen_by_ToleranceFactor.loc[
    lambda frame: (frame['t'] > 0) & (frame['t'] < 4.18),
    'Bg_predict(eV)',
] * 1.21 + 0.36
# Spot checks of individual compounds.
display(df_D1_predict_Eg.loc[df_D1_predict_Eg['formula'] == 'Cs2AgSbBr6', 'Bg_predict(eV)'])
display(df_D1.loc[df_D1['full_formula'] == 'K2Ag1Sb1Cl6', 'band_gap'])
df_D1_screen_ehull_to_Bg.loc[df_D1_screen_ehull_to_Bg['formula'] == 'Cs2KCoBr6']