In [None]:
import torch.nn as nn
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
torch.manual_seed(1)
np.random.seed(1)
random.seed(1)       

def weights_init(m):
    """Initialise the affine parameters of BatchNorm layers in place.

    Any module whose class name contains ``'BatchNorm'`` gets its weight
    drawn from a normal distribution (mean 1.0, std 0.02) and its bias set
    to 0. Every other module is left untouched.

    Args:
        m: the module to (possibly) initialise.
    """
    if 'BatchNorm' not in m.__class__.__name__:
        return

    # BatchNorm layers created with ``affine=False`` expose ``weight`` and
    # ``bias`` as None; skip them instead of raising AttributeError.
    weight = getattr(m, 'weight', None)
    bias = getattr(m, 'bias', None)
    if weight is not None:
        weight.data.normal_(1.0, 0.02)
    if bias is not None:
        bias.data.fill_(0)

class FeatureDataset(Dataset):
    '''
    Dataset that serves the rows of a feature matrix as float tensors.

    Args: x is a 2D numpy array [x_size, x_features]
    '''
    def __init__(self, x):
        self.x = x

    def __len__(self):
        # Number of samples == number of rows in the feature matrix.
        return self.x.shape[0]

    def __getitem__(self, idx):
        # One row, converted to a float32 tensor.
        return torch.FloatTensor(self.x[idx])

    def getBatch(self, idxs=()):
        '''
        Gather several rows into a single tensor.

        Args: idxs is an iterable of row indices (list, tuple, numpy array);
              None is passed straight through.
        Returns: None when idxs is None, an empty float tensor when idxs is
                 empty, otherwise a tensor with one row per index.
        '''
        # Identity check: ``== None`` is evaluated elementwise on numpy
        # arrays and makes the ``if`` ambiguous for index arrays.
        if idxs is None:
            return None

        rows = [self[i] for i in idxs]
        if not rows:
            return torch.FloatTensor([])
        # torch.stack joins the per-row tensors along a new leading axis.
        return torch.stack(rows)

def normalizing_data(data, seed=1):
    """Min-max scale the inputs, convert them to tensors and split 80/20.

    Args:
        data: DataFrame containing the composition columns ('Ba', 'Ca',
            'Sr', 'Ti', 'Zr', 'Sn', 'Hf'), the descriptor columns ('W',
            'EI', 'EA', 'μ') and the target column 'd33(pC/N)'.
        seed: value forwarded as ``random_state`` to ``train_test_split``.

    Returns:
        ``(x, y, train_features, test_features, train_labels, test_labels)``
        where ``x``/``y`` are the full scaled features and raw targets as
        float tensors (on CUDA when available) and the remaining four items
        are their train/test partitions.
    """
    composition_cols = ['Ba', 'Ca', 'Sr', 'Ti', 'Zr', 'Sn', 'Hf']
    descriptor_cols = ['W', 'EI', 'EA', 'μ']

    # Each column group is scaled to [0, 1] independently.
    # NOTE(review): the scaling is fitted on the whole table before the
    # split below, so test rows influence the scaling of the training rows.
    scaled_frames = []
    for group_cols in (composition_cols, descriptor_cols):
        group = data[group_cols]
        scaled = MinMaxScaler().fit_transform(group)
        scaled_frames.append(pd.DataFrame(scaled, columns=group.columns))

    features_df = pd.concat(scaled_frames, axis=1)
    print(features_df)

    target_df = data[['d33(pC/N)']]
    print(target_df)

    x = torch.FloatTensor(features_df.values)
    y = torch.FloatTensor(target_df.values)
    if torch.cuda.is_available():
        x, y = x.cuda(), y.cuda()

    split = train_test_split(x, y, test_size=0.2, random_state=seed)
    train_features, test_features, train_labels, test_labels = split
    print(y)
    return x, y, train_features, test_features, train_labels, test_labels

In [None]:
import os
import time
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import numpy as np
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import random
import torch
from sklearn.metrics import mean_absolute_percentage_error


def set_random_seed(seed):
    """Seed every random number generator used here and pin cuDNN behaviour.

    Args:
        seed: seed applied to torch (CPU and CUDA), NumPy's global generator
            and Python's ``random`` module.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs before any stochastic work, then start the wall-clock timer.
set_random_seed(1)
starttime = datetime.datetime.now()

t = time.localtime()
model_name = 'd33_inference_GBDT'
file_name = f'{model_name}.xlsx'
data = pd.read_excel('data-1.xlsx')

# Scale and split the data, then move every partition to NumPy on the CPU.
# Labels are flattened to 1-D vectors.
x_all, y_all, train_features, test_features, train_labels, test_labels = normalizing_data(data, seed=1)
train_features = train_features.cpu().data.numpy()
test_features = test_features.cpu().data.numpy()
train_labels = train_labels.cpu().data.numpy().reshape(-1)
test_labels = test_labels.cpu().data.numpy().reshape(-1)


def train_model(num_leaves,
                min_child_samples,
                learning_rate,
                n_estimators,
                max_bin,
                colsample_bytree,
                subsample,
                max_depth,
                reg_alpha,
                reg_lambda,
                min_split_gain,
                min_child_weight
               ):
    """Objective function: fit an LGBMRegressor and score it.

    All arguments are raw (continuous) hyper-parameter values. Count-like
    ones are rounded to integers, the two sampling fractions are clipped to
    [0, 1] and the two regularisation strengths are floored at 0 before
    being handed to LightGBM.

    The model is fitted on the module-level ``train_features`` /
    ``train_labels`` and the train and test MAPE are printed.

    Returns:
        The negated mean absolute percentage error on ``test_features`` /
        ``test_labels`` (negated so that a maximiser minimises the error).

    NOTE(review): the returned score is computed on the test split, so any
    search driven by it tunes hyper-parameters against the test data.
    NOTE(review): ``subsample`` may have no effect unless ``subsample_freq``
    is also set - verify against the LightGBM parameter documentation.
    """
    def as_int(value):
        return int(round(value))

    def clip_unit(value):
        return max(min(value, 1), 0)

    params = dict(
        num_leaves=as_int(num_leaves),
        min_child_samples=as_int(min_child_samples),
        learning_rate=learning_rate,
        n_estimators=as_int(n_estimators),
        max_bin=as_int(max_bin),
        colsample_bytree=clip_unit(colsample_bytree),
        subsample=clip_unit(subsample),
        max_depth=as_int(max_depth),
        reg_alpha=max(reg_alpha, 0),
        reg_lambda=max(reg_lambda, 0),
        min_split_gain=min_split_gain,
        min_child_weight=min_child_weight,
        verbose=-1,
    )

    regressor = LGBMRegressor(**params)
    regressor.fit(train_features, train_labels)

    train_pred = regressor.predict(train_features)
    test_pred = regressor.predict(test_features)

    train_mape = mean_absolute_percentage_error(train_labels, train_pred)
    test_mape = mean_absolute_percentage_error(test_labels, test_pred)
    print("train_mape:", train_mape)
    print("test_mape:", test_mape)

    # Hand-computed MAPE on the test split, negated for maximisation.
    relative_errors = np.abs((test_labels - test_pred) / test_labels)
    return -np.mean(relative_errors)


# Search interval (low, high) for every hyper-parameter of train_model.
bounds = dict(
    num_leaves=(20, 150),
    min_child_samples=(5, 50),
    learning_rate=(0.01, 0.1),
    n_estimators=(200, 1000),
    max_bin=(50, 300),
    colsample_bytree=(0.4, 1),
    subsample=(0.3, 1.0),
    max_depth=(1, 15),
    reg_alpha=(0, 10),
    reg_lambda=(0.1, 20),
    min_split_gain=(0, 0.5),
    min_child_weight=(1, 50),
)

optimizer = BayesianOptimization(f=train_model, pbounds=bounds, random_state=1)

# 100 initial points followed by 150 optimisation iterations.
optimizer.maximize(init_points=100, n_iter=150)


def _result_row(target, params):
    """Return a one-row DataFrame: the target followed by the hyper-parameters."""
    ordered_keys = (
        'colsample_bytree', 'learning_rate', 'max_bin', 'max_depth',
        'min_child_samples', 'min_child_weight', 'min_split_gain',
        'n_estimators', 'num_leaves', 'reg_alpha', 'reg_lambda', 'subsample',
    )
    row = {'target': [target]}
    for key in ordered_keys:
        row[key] = [params[key]]
    return pd.DataFrame(row)


# One row per evaluated point, in evaluation order.
result_list = [_result_row(res['target'], res['params']) for res in optimizer.res]
table = pd.concat(result_list, ignore_index=True)

# The best point is appended once more as the final row.
best_result = _result_row(optimizer.max['target'], optimizer.max['params'])
table = pd.concat([table, best_result], ignore_index=True)

table.insert(0, 'Index', range(0, len(table)))
table.to_excel(file_name, index=False)

endtime = datetime.datetime.now()
print('Running time: {}'.format(endtime - starttime))
print(table)

In [None]:
import os
import time
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
import lightgbm as lgb
import joblib


# Rebuild the train/test partitions (seed=1) as NumPy arrays; labels are 1-D.
x_all, y_all, train_features, test_features, train_labels, test_labels = normalizing_data(data, seed=1)
train_features, test_features = train_features.cpu().data.numpy(), test_features.cpu().data.numpy()
train_labels, test_labels = train_labels.cpu().data.numpy(), test_labels.cpu().data.numpy()
train_labels, test_labels = train_labels.reshape(-1), test_labels.reshape(-1)

# Output locations are loop-invariant: create them once, up front.
loss_dir = 'Results/STU_GBDT_BO(100+150)_1'
figures_dir = os.path.join(loss_dir, 'Figures')
os.makedirs(figures_dir, exist_ok=True)

# Hyper-parameter table, read once instead of once per iteration.
target = pd.read_excel('d33_inference_GBDT.xlsx')

results_columns = ['Iteration', 'target', 'R2_Score_test', 'R2_Score_train',
                   'Train_Loss', 'Test_Loss', 'Figure_Path_Train',
                   'Figure_Path_Test', 'Figure_Path_All', 'Loss_Path',
                   'Train_Prediction_Path', 'Test_Prediction_Path']
# Summary rows are collected as plain dicts and turned into a DataFrame once
# at the end, rather than concatenating onto a DataFrame inside the loop.
result_rows = []


def _save_pred_vs_actual(groups, title, r2, text_color, path, with_legend=False):
    """Save a predicted-vs-actual regression plot annotated with R².

    Args:
        groups: list of ``(predicted, actual, color, label)`` tuples, one per
            data partition drawn on the figure (``label`` may be None).
        title: figure title.
        r2: R² value written onto the figure.
        text_color: colour of the R² annotation.
        path: destination PNG path.
        with_legend: draw a legend when True.
    """
    plt.figure()
    for predicted, actual, color, label in groups:
        sns.regplot(x=predicted, y=actual, color=color, label=label)
    if with_legend:
        plt.legend()
    # Anchor the annotation at (smallest prediction, largest actual value).
    x_anchor = min(predicted.min() for predicted, _, _, _ in groups)
    y_anchor = max(actual.max() for _, actual, _, _ in groups)
    plt.text(x_anchor, y_anchor, f'R²={r2:.4f}', color=text_color)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(path, format='png', dpi=300)
    plt.close()


set_random_seed(1)

# One retraining run per row of the workbook (previously a hard-coded 251).
for mm in range(len(target)):
    # Re-seed so every run starts from the same RNG state.
    set_random_seed(1)

    tg = target.at[mm, 'target']
    colsample_bytree = target.at[mm, 'colsample_bytree']
    learning_rate = target.at[mm, 'learning_rate']
    max_bin = target.at[mm, 'max_bin']
    max_depth = target.at[mm, 'max_depth']
    min_child_samples = target.at[mm, 'min_child_samples']
    min_child_weight = target.at[mm, 'min_child_weight']
    min_split_gain = target.at[mm, 'min_split_gain']
    n_estimators = target.at[mm, 'n_estimators']
    num_leaves = target.at[mm, 'num_leaves']
    reg_alpha = target.at[mm, 'reg_alpha']
    reg_lambda = target.at[mm, 'reg_lambda']
    subsample = target.at[mm, 'subsample']

    # Same rounding / clipping as applied when the parameters were searched.
    params = {
        "num_leaves": int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'learning_rate': learning_rate,
        'n_estimators': int(round(n_estimators)),
        'max_bin': int(round(max_bin)),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'subsample': max(min(subsample, 1), 0),
        'max_depth': int(round(max_depth)),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0),
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
        'objective': 'regression',
        'verbose': -1
    }

    # Fit while recording the per-iteration MAPE on both partitions.
    evals_result = {}
    model = LGBMRegressor(metric='mape', **params)
    model.fit(train_features, train_labels, eval_set=[(test_features, test_labels), (train_features, train_labels)],
              eval_names=['test', 'train'], eval_metric='mape', callbacks=[lgb.record_evaluation(evals_result)])

    model_save_path = f'{loss_dir}/{mm}-seed_1.pkl'
    joblib.dump(model, model_save_path)

    train_losses = evals_result['train']['mape']
    test_losses = evals_result['test']['mape']
    final_train_loss = train_losses[-1]
    final_test_loss = test_losses[-1]

    # Training-history figure, annotated with the stored optimisation target.
    plt.figure()
    plt.plot(train_losses, label='Train MAPE')
    plt.plot(test_losses, label='Test MAPE')
    plt.title('MAPE during Training')
    plt.xlabel('Iterations')
    plt.ylabel('MAPE')
    plt.legend()
    plt.text(len(train_losses) - 1, final_test_loss,
             f'Target={tg:.4f}', fontsize=12, color='red', ha='right')
    plt.savefig(f'{figures_dir}/{mm}_GBDT_training_history.png', format='png', dpi=300)
    plt.close()

    # Per-iteration loss curves.
    epochs = range(1, len(train_losses) + 1)
    df_losses = pd.DataFrame({
        'Epoch': epochs,
        'Train Loss': train_losses,
        'Test Loss': test_losses
    })
    excel_path = f'{loss_dir}/{mm}_GBDT_loss_data.xlsx'
    df_losses.to_excel(excel_path, index=False)

    # Predictions and R² on both partitions.
    predict_test = model.predict(test_features)
    current_r2_test = r2_score(test_labels, predict_test)

    predict_train = model.predict(train_features)
    current_r2_train = r2_score(train_labels, predict_train)

    train_prediction_df = pd.DataFrame({
        'Actual': train_labels,
        'Predicted': predict_train
    })
    train_prediction_path = f'{loss_dir}/{mm}_train_predictions.xlsx'
    train_prediction_df.to_excel(train_prediction_path, index=False)

    test_prediction_df = pd.DataFrame({
        'Actual': test_labels,
        'Predicted': predict_test
    })
    test_prediction_path = f'{loss_dir}/{mm}_test_predictions.xlsx'
    test_prediction_df.to_excel(test_prediction_path, index=False)

    # Predicted-vs-actual figures: train only, test only, and both together.
    fig_name_train = f'{figures_dir}/{mm}_GBDT_experiment_vs_pred_train.png'
    _save_pred_vs_actual([(predict_train, train_labels, 'blue', None)],
                         'Train Prediction vs Actual', current_r2_train, 'blue', fig_name_train)

    fig_name_test = f'{figures_dir}/{mm}_GBDT_experiment_vs_pred_test.png'
    _save_pred_vs_actual([(predict_test, test_labels, 'red', None)],
                         'Test Prediction vs Actual', current_r2_test, 'red', fig_name_test)

    fig_name_all = f'{figures_dir}/{mm}_GBDT_experiment_vs_pred_all.png'
    current_r2_all = r2_score(np.concatenate([train_labels, test_labels]), np.concatenate([predict_train, predict_test]))
    _save_pred_vs_actual([(predict_train, train_labels, 'blue', 'Train'),
                          (predict_test, test_labels, 'red', 'Test')],
                         'All Prediction vs Actual', current_r2_all, 'green', fig_name_all,
                         with_legend=True)

    result_rows.append({
        'Iteration': mm,
        'target': tg,
        'R2_Score_test': current_r2_test,
        'R2_Score_train': current_r2_train,
        'Train_Loss': final_train_loss,
        'Test_Loss': final_test_loss,
        'Figure_Path_Train': fig_name_train,
        'Figure_Path_Test': fig_name_test,
        'Figure_Path_All': fig_name_all,
        'Loss_Path': excel_path,
        'Train_Prediction_Path': train_prediction_path,
        'Test_Prediction_Path': test_prediction_path
    })


results_df = pd.DataFrame(result_rows, columns=results_columns)
results_df.to_csv(f'{loss_dir}/results_summary_GBDT.csv', index=False)
