In [None]:
import torch.nn as nn
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Reproducibility: pin the standard-library, NumPy and PyTorch random number
# generators to the same seed before anything stochastic runs.
_GLOBAL_SEED = 1
random.seed(_GLOBAL_SEED)
np.random.seed(_GLOBAL_SEED)
torch.manual_seed(_GLOBAL_SEED)

def weights_init(m):
    '''
    Re-initialise the affine parameters of a batch-normalisation layer in place.

    Any module whose class name contains 'BatchNorm' gets its weight drawn from
    a normal distribution (mean 1.0, std 0.02) and its bias set to 0. Every
    other module is left untouched.

    Args: m is the module to (possibly) re-initialise.
    '''
    classname = m.__class__.__name__
    if 'BatchNorm' not in classname:
        return

    # BatchNorm layers created with affine=False carry weight/bias == None;
    # skip them instead of failing on `None.data`.
    weight = getattr(m, 'weight', None)
    bias = getattr(m, 'bias', None)
    if weight is not None:
        weight.data.normal_(1.0, 0.02)
    if bias is not None:
        bias.data.fill_(0)

class FeatureDataset(Dataset):
    '''
    Dataset that serves the rows of a feature matrix as float tensors.

    Args: x is a 2D numpy array [x_size, x_features]
    '''
    def __init__(self, x):
        self.x = x

    def __len__(self):
        # Number of samples = number of rows of the feature matrix.
        return self.x.shape[0]

    def __getitem__(self, idx):
        # One row of the matrix, converted to a float32 tensor.
        return torch.FloatTensor(self.x[idx])

    def getBatch(self, idxs=()):
        '''
        Gather several rows into one tensor.

        Args: idxs is an iterable of row indices (list, tuple, numpy array...).
              None is passed straight through.
        Returns: None when idxs is None, an empty float tensor when idxs is
                 empty, otherwise a tensor of shape [len(idxs), x_features].
        '''
        # `is None` (not `== None`) so that numpy index arrays do not trigger
        # an element-wise comparison with an ambiguous truth value.
        if idxs is None:
            return None

        rows = [self[i] for i in idxs]
        if not rows:
            return torch.FloatTensor([])
        # torch.FloatTensor(list_of_tensors) fails for multi-element rows;
        # stacking is the supported way to build the batch dimension.
        return torch.stack(rows)

def normalizing_data(data, seed=1):
    '''
    Min-max scale the input columns, extract the d33 target and split 80/20.

    Args:
        data: pandas DataFrame holding the columns 'Ba', 'Ca', 'Sr', 'Ti',
              'Zr', 'Sn', 'Hf', 'W', 'EI', 'EA', 'μ' and 'd33(pC/N)'.
        seed: value forwarded as random_state to train_test_split.

    Returns:
        x, y, train_features, test_features, train_labels, test_labels
        where x / y are float tensors built from all rows (moved to the GPU
        when CUDA is available) and the remaining four are the pieces
        returned by train_test_split(x, y, test_size=0.2).

    Side effects: prints the scaled feature table, the raw target table and
    the target tensor.
    '''
    composition_cols = ['Ba', 'Ca', 'Sr', 'Ti', 'Zr','Sn', 'Hf']
    descriptor_cols = ['W', 'EI', 'EA', 'μ']
    raw_blocks = [data[composition_cols], data[descriptor_cols]]

    # Each column group is fitted and transformed on its own; the scaler is
    # simply re-fitted for the second group.
    scaler = MinMaxScaler()
    scaled_blocks = []
    for block in raw_blocks:
        scaled_values = scaler.fit_transform(block)
        scaled_blocks.append(pd.DataFrame(scaled_values, columns=block.columns))

    features_df = pd.concat(scaled_blocks, axis=1)
    print(features_df)

    target_df = data[['d33(pC/N)']]
    print(target_df)

    x = torch.FloatTensor(features_df.values)
    y = torch.FloatTensor(target_df.values)
    if torch.cuda.is_available():
        x, y = x.cuda(), y.cuda()

    split = train_test_split(x, y, test_size=0.2, random_state=seed)
    train_features, test_features, train_labels, test_labels = split
    print(y)
    return x, y, train_features, test_features, train_labels, test_labels

In [None]:
import os
import time
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import random
import torch

def set_random_seed(seed):
    '''
    Seed every random number generator used in the notebook.

    Seeds the standard-library `random` module, NumPy and PyTorch (CPU plus
    the current and all CUDA devices), then sets
    torch.backends.cudnn.deterministic = True and
    torch.backends.cudnn.benchmark = False.

    Args: seed is the integer seed applied to all generators.
    '''
    # Python / NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

# Seed all RNGs and start the wall-clock timer for this cell.
set_random_seed(1)
starttime = datetime.datetime.now()

t = time.localtime()

# Output workbook for the optimisation log; named after the model.
model_name = 'd33_inference_RandomForest'
file_name = f'{model_name}.xlsx'

# Raw data table.
data = pd.read_excel('data-1.xlsx')

# normalizing_data returns torch tensors; the random forest needs NumPy
# arrays, and the labels are flattened to 1-D.
x_all, y_all, train_features, test_features, train_labels, test_labels = normalizing_data(data, seed=1)
train_features = train_features.cpu().data.numpy()
test_features = test_features.cpu().data.numpy()
train_labels = train_labels.cpu().data.numpy().reshape(-1)
test_labels = test_labels.cpu().data.numpy().reshape(-1)


def mean_absolute_percentage_error(y_true, y_pred):
    '''
    Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    Args:
        y_true: reference values (array-like).
        y_pred: predicted values (array-like).

    Returns: mean of |(y_true - y_pred) / y_true| over all elements.

    Note: zeros in y_true produce inf/nan terms; this is not masked here.
    '''
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Same number of elements but different layout (e.g. (n, 1) labels vs.
    # (n,) predictions) would broadcast to an (n, n) matrix and silently give
    # a wrong score, so compare element by element instead.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()

    return np.mean(np.abs((y_true - y_pred) / y_true))


def train_model(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features):
    '''
    Objective function for the Bayesian optimiser.

    Fits a RandomForestRegressor on the module-level train_features /
    train_labels, prints the train and test MAPE, and returns the negated
    test MAPE (the optimiser maximises, so a smaller error scores higher).

    Args:
        n_estimators, max_depth, min_samples_split, min_samples_leaf:
            continuous proposals, rounded to the nearest integer.
        max_features: continuous proposal, clamped to [0, 1] and passed as a
            float so it is interpreted as a fraction of the features.

    Returns: -test_mape

    NOTE(review): the score is computed on test_features / test_labels, so the
    hyper-parameters are tuned on the test split itself.
    '''
    params = {
        "n_estimators": int(round(n_estimators)),
        "max_depth": int(round(max_depth)),
        "min_samples_split": int(round(min_samples_split)),
        "min_samples_leaf": int(round(min_samples_leaf)),
        # Clamp against float bounds and cast to float: an int 1 would mean
        # "one feature per split" to scikit-learn, whereas 1.0 means "all
        # features", which is what the upper bound is meant to express.
        "max_features": float(min(max(max_features, 0.0), 1.0)),
        "random_state": 1
    }
    model = RandomForestRegressor(**params)
    model.fit(train_features, train_labels)

    y_pred_train = model.predict(train_features)
    y_pred_test = model.predict(test_features)
    train_mape = mean_absolute_percentage_error(train_labels, y_pred_train)
    test_mape = mean_absolute_percentage_error(test_labels, y_pred_test)
    print("train_mapre:", train_mape)
    print("test_mapre:", test_mape)

    error = -test_mape
    return error


# Search space handed to the optimiser: (lower, upper) bound per hyper-parameter.
bounds = {
    'n_estimators': (200, 1000), 
    'max_depth': (20, 100), 
    'min_samples_split': (2, 10), 
    'min_samples_leaf': (1, 5), 
    'max_features': (0.3, 1) 
}

optimizer = BayesianOptimization(
    f=train_model,
    pbounds=bounds,
    random_state=1,
)

# init_points / n_iter are passed straight to bayes_opt; every evaluation
# fits a full random forest, so this call dominates the cell's running time.
optimizer.maximize(init_points=100, n_iter=150)

# Column order of the exported log.
result_columns = ['target', 'n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features']


def _result_row(res):
    '''Flatten one optimiser result ({'target': ..., 'params': {...}}) into a flat dict.'''
    row = {'target': res['target']}
    row.update({name: res['params'][name] for name in result_columns[1:]})
    return row


# One record per evaluated point, in evaluation order; the frame is built
# once instead of wrapping every result in its own one-row DataFrame.
result_list = [_result_row(res) for res in optimizer.res]

# The best point is appended once more as the final row of the table.
best_result = pd.DataFrame([_result_row(optimizer.max)], columns=result_columns)
table = pd.concat([pd.DataFrame(result_list, columns=result_columns), best_result], ignore_index=True)

table.to_excel(file_name)

endtime = datetime.datetime.now()
print('Running time: {}'.format(endtime - starttime))
print(table)

In [None]:
import os
import time
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import torch
import joblib
import seaborn as sns


def set_random_seed(seed):
    '''
    Seed PyTorch (CPU and CUDA), NumPy and the standard-library `random`
    module with the same value, then set torch.backends.cudnn.benchmark to
    False and torch.backends.cudnn.deterministic to True.

    NOTE(review): this is the same definition as the one in the preceding
    cell; consider keeping a single copy.

    Args: seed is the integer seed applied to all generators.
    '''
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(1)

# Output locations: one folder for tables/models, a sub-folder for figures.
folder_dir_results = 'Results/STU_RF_BO(100+150)_1'
folder_dir_figures = os.path.join(folder_dir_results, 'Figures')
# exist_ok=True replaces the separate "exists?" check, so re-running the cell
# is safe and there is no window between the check and the creation.
os.makedirs(folder_dir_results, exist_ok=True)
os.makedirs(folder_dir_figures, exist_ok=True)

# NOTE(review): `data` and `normalizing_data` are defined in earlier cells;
# this cell only runs after those have been executed.
x_all, y_all, train_features, test_features, train_labels, test_labels = normalizing_data(data, seed=1)
# Tensors -> NumPy arrays for scikit-learn; labels flattened to 1-D.
train_features, test_features = train_features.cpu().data.numpy(), test_features.cpu().data.numpy()
train_labels, test_labels = train_labels.cpu().data.numpy(), test_labels.cpu().data.numpy()
train_labels, test_labels = train_labels.reshape(-1), test_labels.reshape(-1)

# Column order of the per-iteration summary written at the end of the cell.
summary_columns = ['Iteration', 'target', 'R2_Score_test', 'R2_Score_train', 'Train_MAPE', 'Test_MAPE', 'Figure_Path_test', 'Figure_Path_train', 'Figure_Path_all', 'Loss_Path', 'Prediction_Train_Path', 'Prediction_Test_Path']


def _save_parity_plot(groups, r2_value, r2_color, title, out_path, with_legend=False):
    '''
    Draw predicted-vs-actual regression plot(s) and save them as a 300-dpi PNG.

    Args:
        groups: list of (predicted, actual, color, label) tuples; one
                seaborn regplot is drawn per tuple, in list order.
        r2_value: R² value written into the figure.
        r2_color: colour of the R² annotation.
        title: figure title.
        out_path: destination file path.
        with_legend: add a legend built from the labels in `groups`.
    '''
    plt.figure()
    for predicted, actual, color, label in groups:
        sns.regplot(x=predicted, y=actual, color=color, label=label)
    if with_legend:
        plt.legend()
    # Anchor the annotation at (smallest prediction, largest actual value).
    x_anchor = min(np.concatenate([group[0] for group in groups]))
    y_anchor = max(np.concatenate([group[1] for group in groups]))
    plt.text(x_anchor, y_anchor, f'R²={r2_value:.4f}', color=r2_color)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(out_path, format='png', dpi=300)
    plt.close()


# The optimisation log does not change while this cell runs, so it is read
# once here instead of once per iteration.
target = pd.read_excel('d33_inference_RandomForest.xlsx')

# Summary rows are collected as plain dicts and turned into a DataFrame once,
# after the loop, rather than growing a DataFrame with concat every iteration.
summary_rows = []

# One pass per row of the log (the row count is taken from the file itself).
# NOTE(review): relies on mean_absolute_percentage_error from an earlier cell.
for mm in range(len(target)):

    set_random_seed(1)
    tg = target.at[mm, 'target']

    params = {
        'n_estimators': int(round(target.at[mm, 'n_estimators'])),
        'max_depth': int(round(target.at[mm, 'max_depth'])),
        'min_samples_split': int(round(target.at[mm, 'min_samples_split'])),
        'min_samples_leaf': int(round(target.at[mm, 'min_samples_leaf'])),
        # Clamp to [0, 1] and keep it a float: scikit-learn reads an int 1 as
        # "one feature per split", a float 1.0 as "all features".
        'max_features': float(min(max(target.at[mm, 'max_features'], 0.0), 1.0)),
        'random_state': 1
    }

    model = RandomForestRegressor(**params)
    model.fit(train_features, train_labels)

    # Persist the fitted model next to the other results of this run.
    model_save_path = f'{folder_dir_results}/{mm}-seed_1.joblib'
    joblib.dump(model, model_save_path)

    predict_train = model.predict(train_features)
    train_mape = mean_absolute_percentage_error(train_labels, predict_train)

    predict_test = model.predict(test_features)
    test_mape = mean_absolute_percentage_error(test_labels, predict_test)

    # MAPE values, stored in a one-row "loss" table.
    loss_data = pd.DataFrame({'Epoch': [1], 'Train Loss': [train_mape], 'Test Loss': [test_mape]})
    loss_file_path = f'{folder_dir_results}/RF_loss_data_{mm}_seed_1.xlsx'
    loss_data.to_excel(loss_file_path, index=False)

    # Predictions next to the ground truth, one workbook per split.
    df_prediction_train = pd.DataFrame({'Predicted': predict_train, 'Actual': train_labels})
    prediction_train_path = f'{folder_dir_results}/RF_prediction_train_{mm}_seed_1.xlsx'
    df_prediction_train.to_excel(prediction_train_path, index=False)

    df_prediction_test = pd.DataFrame({'Predicted': predict_test, 'Actual': test_labels})
    prediction_test_path = f'{folder_dir_results}/RF_prediction_test_{mm}_seed_1.xlsx'
    df_prediction_test.to_excel(prediction_test_path, index=False)

    # Single-point "loss curve": train and test MAPE plus the logged target.
    plt.figure()
    plt.plot([1], [train_mape], 'bo-', label="Train MAPE")
    plt.plot([1], [test_mape], 'ro-', label="Test MAPE")
    plt.legend()
    plt.title('MAPE during Training')
    plt.xlabel('Epoch')
    plt.ylabel('MAPE')
    plt.text(1, test_mape, f'Target Loss={tg:.4f}', fontdict={'size': 12, 'color': 'red'})
    mape_curve_filename = f'{folder_dir_figures}/{mm}_RF_training_loss_seed_1.png'
    plt.savefig(mape_curve_filename, format='png', dpi=300)
    plt.close()

    # Predicted-vs-actual figures: train only, test only, both together.
    fig_name_2_train = f'{folder_dir_figures}/{mm}_RF_experiment_vs_pred_train_seed_1.png'
    current_r2_train = r2_score(train_labels, predict_train)
    _save_parity_plot([(predict_train, train_labels, 'blue', None)],
                      current_r2_train, 'blue', 'Train Prediction vs Actual', fig_name_2_train)

    fig_name_2_test = f'{folder_dir_figures}/{mm}_RF_experiment_vs_pred_test_seed_1.png'
    current_r2_test = r2_score(test_labels, predict_test)
    _save_parity_plot([(predict_test, test_labels, 'red', None)],
                      current_r2_test, 'red', 'Test Prediction vs Actual', fig_name_2_test)

    fig_name_2_all = f'{folder_dir_figures}/{mm}_RF_experiment_vs_pred_all_seed_1.png'
    current_r2_all = r2_score(np.concatenate([train_labels, test_labels]), np.concatenate([predict_train, predict_test]))
    _save_parity_plot([(predict_train, train_labels, 'blue', "Train"),
                       (predict_test, test_labels, 'red', "Test")],
                      current_r2_all, 'green', 'All Prediction vs Actual', fig_name_2_all,
                      with_legend=True)

    # Side-by-side table: train rows first (test columns NaN), then test rows
    # (train columns NaN).
    train_padding = np.full(len(train_labels), np.nan)
    test_padding = np.full(len(test_labels), np.nan)
    df_prediction_comparison_all = pd.DataFrame({
        'Predicted (Train)': np.concatenate([predict_train, test_padding]),
        'Actual (Train)': np.concatenate([train_labels, test_padding]),
        'Predicted (Test)': np.concatenate([train_padding, predict_test]),
        'Actual (Test)': np.concatenate([train_padding, test_labels])
    })
    prediction_comparison_all_filename = f'{folder_dir_results}/{mm}_RF_experiments_and_prediction_comparison_all_seed_1.xlsx'
    df_prediction_comparison_all.to_excel(prediction_comparison_all_filename, index=False)

    summary_rows.append({
        'Iteration': mm,
        'target': tg,
        'R2_Score_test': current_r2_test,
        'R2_Score_train': current_r2_train,
        'Train_MAPE': train_mape,
        'Test_MAPE': test_mape,
        'Figure_Path_test': fig_name_2_test,
        'Figure_Path_train': fig_name_2_train,
        'Figure_Path_all': fig_name_2_all,
        'Loss_Path': mape_curve_filename,
        'Prediction_Train_Path': prediction_train_path,
        'Prediction_Test_Path': prediction_test_path
    })


results_df = pd.DataFrame(summary_rows, columns=summary_columns)
results_summary_filename = f'{folder_dir_results}/results_summary_RF.csv'
results_df.to_csv(results_summary_filename, index=False)