GBDT+NN

In [None]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import torch
import torch.nn as nn
import torch.nn.functional as F
import joblib
import os
import numpy as np


# Candidate table shared by Tree() and NN() as the module-level ``data``.
# The CSV is read with GBK encoding.
data = pd.read_csv(
    filepath_or_buffer='filtered_decoded_comps.csv',
    encoding='GBK',
)
# Echo the column index so the expected feature columns can be checked by eye.
print("Data columns:", data.columns)

def normalizing_data(df_all):
    """Scale the feature columns of ``df_all`` with the scalers saved on disk.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain the composition columns 'Ba', 'Ca', 'Sr', 'Ti', 'Zr',
        'Sn', 'Hf' and the descriptor columns 'W', 'EI', 'EA', 'μ'.

    Returns
    -------
    x : pandas.DataFrame
        Transformed composition columns followed by the transformed
        descriptor columns, under their original column names.
    y : pandas.Series or None
        The 'd33(pC/N)' column of ``df_all``, or ``None`` when that column
        is absent.
    """
    composition_names = ['Ba', 'Ca', 'Sr', 'Ti', 'Zr', 'Sn', 'Hf']
    descriptor_names = ['W', 'EI', 'EA', 'μ']
    raw_blocks = (df_all[composition_names], df_all[descriptor_names])

    # Each feature group has its own scaler, loaded from the working directory.
    scalers = [
        joblib.load(scaler_file)
        for scaler_file in ('composition_scaler.joblib', 'descriptors_scaler.joblib')
    ]

    scaled_blocks = [
        pd.DataFrame(scaler.transform(raw), columns=raw.columns)
        for scaler, raw in zip(scalers, raw_blocks)
    ]
    features = pd.concat(scaled_blocks, axis=1)

    # .get() yields None instead of raising when the target column is missing.
    target = df_all.get('d33(pC/N)')
    return features, target


def Tree(n):
    """Return LightGBM predictions for the module-level ``data``.

    If a saved model exists at
    ``Results/STU_GBDT_BO(100+150)_1/{n}-seed_1.pkl`` it is loaded;
    otherwise a new ``LGBMRegressor`` is trained on ``data`` using the
    hyper-parameters in row ``n`` of ``d33_inference_GBDT.xlsx`` and saved
    to that path.

    Parameters
    ----------
    n : int
        Row label in ``d33_inference_GBDT.xlsx`` and index used in the
        model file name.

    Returns
    -------
    numpy.ndarray
        Output of ``model.predict`` on the normalized features of ``data``.

    Raises
    ------
    ValueError
        If no saved model exists and ``data`` has no 'd33(pC/N)' column to
        train on.
    """
    model_path = f'Results/STU_GBDT_BO(100+150)_1/{n}-seed_1.pkl'

    # Normalize once; the same features serve both training and prediction.
    x_all, y_all = normalizing_data(data)

    if os.path.exists(model_path):
        print(f"Loading pre-trained GBDT model from {model_path}")
        # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
        model = joblib.load(model_path)
    else:
        if y_all is None:
            raise ValueError(
                f"No saved GBDT model at {model_path} and the data has no "
                "'d33(pC/N)' column to train a new one."
            )

        target = pd.read_excel('d33_inference_GBDT.xlsx')

        # The spreadsheet may store integer hyper-parameters as floats, so
        # they are rounded before being handed to LightGBM.
        integer_names = (
            'num_leaves', 'min_child_samples', 'n_estimators',
            'max_bin', 'max_depth',
        )
        float_names = (
            'learning_rate', 'colsample_bytree', 'subsample', 'reg_lambda',
            'reg_alpha', 'min_split_gain', 'min_child_weight',
        )
        params = {name: int(round(target.at[n, name])) for name in integer_names}
        params.update({name: target.at[n, name] for name in float_names})
        params.update({'objective': 'regression', 'verbose': -1})

        print(f"Training new GBDT model for index {n}")
        model = LGBMRegressor(**params)
        model.fit(x_all, y_all)

        # Make sure the target directory exists before saving the model.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        joblib.dump(model, model_path)

    return model.predict(x_all)


class Net(nn.Module):
    """Fully connected regressor: input layer, ``w`` hidden layers, output layer.

    Every linear layer except the last is followed by ReLU and then dropout
    (p=0.1). The output layer is linear with no activation.
    """

    def __init__(self, n_feature, n_hidden, n_output, w):
        """Create the layers.

        Parameters
        ----------
        n_feature : int
            Size of the input features.
        n_hidden : int
            Width of every hidden layer.
        n_output : int
            Size of the output.
        w : int
            Number of additional hidden-to-hidden layers after the first.
        """
        super().__init__()
        # Attribute names define the state_dict keys and must stay stable so
        # saved weight files keep loading.
        self.hidden1 = nn.Linear(n_feature, n_hidden)
        self.hiddens = nn.ModuleList(
            nn.Linear(n_hidden, n_hidden) for _ in range(w)
        )
        self.dropout = nn.Dropout(p=0.1)
        self.predict = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Run the network on ``x`` and return the output-layer activations."""
        # The input layer and the stacked hidden layers share the same
        # linear -> ReLU -> dropout pattern.
        for layer in (self.hidden1, *self.hiddens):
            x = self.dropout(F.relu(layer(x)))
        return self.predict(x)


def NN(n):
    """Return neural-network predictions for the module-level ``data``.

    The layer width and depth are read from row ``n`` of
    ``d33_inference_NN.xlsx`` and the weights are loaded from
    ``Results/STU_NN_BO(100+150)_1/{n}-seed_1.pt``.

    Parameters
    ----------
    n : int
        Row label in ``d33_inference_NN.xlsx`` and index used in the weight
        file name.

    Returns
    -------
    numpy.ndarray
        Network output for every row of ``data``, with one row per sample
        and one output column.
    """
    target = pd.read_excel('d33_inference_NN.xlsx')
    module__n_hidden = int(target.at[n, 'module__n_hidden'])
    module__w = int(target.at[n, 'module__w'])

    x_all, _ = normalizing_data(data)
    x_tensor = torch.as_tensor(x_all.to_numpy(), dtype=torch.float32)

    # Take the input width from the feature matrix so it cannot drift from
    # the column list used by normalizing_data.
    net = Net(
        n_feature=x_tensor.shape[1],
        n_hidden=module__n_hidden,
        n_output=1,
        w=module__w,
    )

    weight_file = f'Results/STU_NN_BO(100+150)_1/{n}-seed_1.pt'
    print(f"Loading NN weights from file: {weight_file}")
    # map_location lets weights saved on a GPU load on a CPU-only machine.
    # weights_only restricts unpickling to tensor data; it needs a torch
    # version that supports this keyword.
    state_dict = torch.load(weight_file, map_location='cpu', weights_only=True)
    net.load_state_dict(state_dict)
    net.eval()  # disable dropout for inference

    # No gradients are needed for inference.
    with torch.no_grad():
        return net(x_tensor).numpy()


def ensemble_prediction():
    """Run the GBDT/NN ensemble and write the predictions to Excel.

    Six GBDT models and six NN models (indices hard-coded below) each
    predict on the candidate data. The row-wise mean and standard deviation
    over all twelve predictions are added to the candidate table as
    'pred_Z_mean' and 'pred_Z_std'.

    Writes
    ------
    Results/First_round_ensemble_comp_Gh.xlsx
        Candidate table with the two summary columns.
    Results/First_round_ensemble_Comp_total_Gh.xlsx
        The individual model predictions, one column per model.
    """
    i_values = [135, 171, 196, 55, 99, 175]      # indices passed to NN()
    m_values = [198, 210, 245, 235, 208, 213]    # indices passed to Tree()

    predictions = {}
    for idx, (i, m) in enumerate(zip(i_values, m_values)):
        print(f"\n==== Round {idx}: GBDT and NN Prediction ====")
        # Flatten so every column is 1-D; NN() returns an (N, 1) array.
        predictions[f'pred_Z_Tree_{idx}'] = np.ravel(Tree(m))
        predictions[f'pred_Z_NN_{idx}'] = np.ravel(NN(i))

    # Build the frame once instead of inserting columns one at a time.
    First_round_Comp_total = pd.DataFrame(predictions)

    pred_mean = First_round_Comp_total.mean(axis=1)
    pred_std = First_round_Comp_total.std(axis=1)

    Comp = pd.read_csv('filtered_decoded_comps.csv', encoding='GBK')
    # Drop stale summary columns from a previous run; errors='ignore' avoids
    # a KeyError when only one of the two is present.
    Comp = Comp.drop(columns=['pred_Z_mean', 'pred_Z_std'], errors='ignore')

    Comp['pred_Z_mean'] = pred_mean
    Comp['pred_Z_std'] = pred_std

    os.makedirs('Results', exist_ok=True)
    Comp.to_excel('Results/First_round_ensemble_comp_Gh.xlsx', index=False)
    First_round_Comp_total.to_excel(
        'Results/First_round_ensemble_Comp_total_Gh.xlsx', index=False
    )


# Run the ensemble inference and write both Excel outputs when this cell executes.
ensemble_prediction()

Data columns: Index(['Ba', 'Ca', 'Sr', 'Ti', 'Zr', 'Sn', 'Hf', 'W', 'EI', 'EA', 'μ'], dtype='object')

==== Round 0: GBDT and NN Prediction ====
Loading pre-trained GBDT model from Results/STU_GBDT_BO(100+150)_1/198-seed_1.pkl
Loading NN weights from file: Results/STU_NN_BO(100+150)_1/135-seed_1.pt


  net.load_state_dict(torch.load(weight_file))



==== Round 1: GBDT and NN Prediction ====
Loading pre-trained GBDT model from Results/STU_GBDT_BO(100+150)_1/210-seed_1.pkl
Loading NN weights from file: Results/STU_NN_BO(100+150)_1/171-seed_1.pt

==== Round 2: GBDT and NN Prediction ====
Loading pre-trained GBDT model from Results/STU_GBDT_BO(100+150)_1/245-seed_1.pkl
Loading NN weights from file: Results/STU_NN_BO(100+150)_1/196-seed_1.pt


  net.load_state_dict(torch.load(weight_file))
  net.load_state_dict(torch.load(weight_file))



==== Round 3: GBDT and NN Prediction ====
Loading pre-trained GBDT model from Results/STU_GBDT_BO(100+150)_1/235-seed_1.pkl
Loading NN weights from file: Results/STU_NN_BO(100+150)_1/55-seed_1.pt

==== Round 4: GBDT and NN Prediction ====
Loading pre-trained GBDT model from Results/STU_GBDT_BO(100+150)_1/208-seed_1.pkl
Loading NN weights from file: Results/STU_NN_BO(100+150)_1/99-seed_1.pt


  net.load_state_dict(torch.load(weight_file))
  net.load_state_dict(torch.load(weight_file))



==== Round 5: GBDT and NN Prediction ====
Loading pre-trained GBDT model from Results/STU_GBDT_BO(100+150)_1/213-seed_1.pkl
Loading NN weights from file: Results/STU_NN_BO(100+150)_1/175-seed_1.pt


  net.load_state_dict(torch.load(weight_file))
