In [None]:
!pip install --quiet --upgrade \
    dgl==1.1.2 dgllife==0.3.2 rdkit-pypi==2022.9.5 lightgbm

# Optuna
!pip install optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m4.3/6.0 MB[0m [31m109.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.9/6.0 MB[0m [31m110.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.1/226.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import dgl
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import Descriptors
from dgllife.model import GINPredictor
from dgllife.utils import mol_to_bigraph, PretrainAtomFeaturizer

In [None]:
# --- Configuration ---
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# Mount Google Drive at /content/drive (the data directory used below lives under it).

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 1. Load data
# Both CSV files are read from the same Drive folder.
data_dir = '/content/drive/MyDrive/DACON/CYP3A4inh/'

train, test = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
import os
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F

from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, MACCSkeys, Lipinski
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem.EState import Fingerprinter

from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.pipeline import Pipeline

import optuna
from optuna.pruners import MedianPruner
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from transformers import AutoTokenizer, AutoModel

import dgl
from dgl.nn import GINConv, SumPooling
from dgllife.model import GINPredictor
from dgllife.utils import mol_to_bigraph, PretrainAtomFeaturizer, CanonicalAtomFeaturizer

# Data directory; created if it does not exist yet.
data_dir = '/content/drive/MyDrive/DACON/CYP3A4inh/'
os.makedirs(data_dir, exist_ok=True)

In [None]:
# --- SMILES Augmentation ---
# Every parseable training row is kept once as-is and then repeated
# `augment_ratio` times with a randomized SMILES string for the same molecule.
# `orig_id` records which source row each augmented row came from.
train['orig_id'] = np.arange(len(train))
augment_ratio = 20
augmented = []
for _, row in train.iterrows():
    mol = Chem.MolFromSmiles(row['Canonical_Smiles'])
    if mol is not None:  # rows RDKit cannot parse are dropped
        base_dict = row.to_dict()
        augmented.append(base_dict)
        augmented.extend(
            {**base_dict, 'Canonical_Smiles': Chem.MolToSmiles(mol, doRandom=True)}
            for _ in range(augment_ratio)
        )
train = pd.DataFrame(augmented).reset_index(drop=True)
groups = train['orig_id'].values  # for GroupKFold

# --- Extra Descriptors ---
# Each descriptor function becomes one column named after the function.
extra_descs = [Descriptors.HeavyAtomCount, Descriptors.RingCount,
               Descriptors.FractionCSP3, Descriptors.NumRotatableBonds]

# Parse every SMILES once and reuse the molecule for all descriptors,
# instead of re-parsing it once per descriptor.
train_mols = [Chem.MolFromSmiles(s) for s in train['Canonical_Smiles']]
test_mols = [Chem.MolFromSmiles(s) for s in test['Canonical_Smiles']]

for f in extra_descs:
    # An unparseable SMILES yields NaN rather than crashing the whole cell.
    train[f.__name__] = [f(m) if m is not None else np.nan for m in train_mols]
    test[f.__name__] = [f(m) if m is not None else np.nan for m in test_mols]

In [None]:
# --- ECFP ---
def get_ecfp(s, radius=2, n_bits=2048):
    """Return the Morgan fingerprint of SMILES `s` as a list of `n_bits` ints.

    A list of `n_bits` zeros is returned when the SMILES cannot be parsed
    or when fingerprinting raises an exception.
    """
    try:
        mol = Chem.MolFromSmiles(s)
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(
                mol, radius=radius, nBits=n_bits
            )
            return list(map(int, fp))
    except Exception:
        # Best effort: fall through to the all-zero vector below.
        pass
    return [0] * n_bits

# One column per fingerprint bit: FP_0 ... FP_2047.
fp_columns = [f'FP_{i}' for i in range(2048)]
train_ecfp = pd.DataFrame(
    [get_ecfp(smi) for smi in train['Canonical_Smiles']], columns=fp_columns
)
test_ecfp = pd.DataFrame(
    [get_ecfp(smi) for smi in test['Canonical_Smiles']], columns=list(fp_columns)
)

# Append the fingerprint columns to the right of the existing frames.
train = pd.concat([train, train_ecfp], axis=1)
test = pd.concat([test, test_ecfp], axis=1)

# --- 분자 특성 추가 ---
def get_molecule_descriptors(smiles, fallback_len=2300):
    """Build one flat feature list for a SMILES string.

    The row concatenates, in order: basic RDKit descriptors, the PEOE_VSA
    values, EState output, two topological indices, a 2048-bit Morgan
    fingerprint and the MACCS keys.

    Parameters
    ----------
    smiles : str
        SMILES string to featurize.
    fallback_len : int, default 2300
        Length of the all-zero row returned when the SMILES cannot be parsed
        or a descriptor call fails. It should match the length of a
        successfully computed row.

    Returns
    -------
    list
        The feature row, or ``[0] * fallback_len`` on failure.

    Notes
    -----
    A failure is reported through ``warnings.warn`` instead of being
    swallowed by a bare ``except``. A systematic problem, such as a
    descriptor function missing from the installed RDKit, is then visible
    instead of silently turning every row into zeros.
    """
    import warnings

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return [0] * fallback_len

        # 1) Basic descriptors
        basic = [
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.TPSA(mol),
            Descriptors.NumRotatableBonds(mol),
            Descriptors.NumAromaticRings(mol),
            Descriptors.NumHeteroatoms(mol),
            Descriptors.FractionCSP3(mol),
            Descriptors.NumAliphaticRings(mol),
            Lipinski.NumAromaticHeterocycles(mol),
            Lipinski.NumSaturatedHeterocycles(mol),
            Lipinski.NumAliphaticHeterocycles(mol),
            Descriptors.HeavyAtomCount(mol),
            Descriptors.RingCount(mol),
            Descriptors.NOCount(mol),
            Descriptors.NHOHCount(mol),
            Descriptors.NumRadicalElectrons(mol),
        ]

        # 2) PEOE_VSA (original note: 14 dimensions)
        # NOTE(review): confirm rdMolDescriptors exposes PEOE_VSA_Fingerprint in the
        # pinned RDKit; if it does not, every molecule falls into the except branch.
        vsa = list(rdMolDescriptors.PEOE_VSA_Fingerprint(mol))

        # 3) EState indices
        # NOTE(review): the return value is wrapped with list() as-is; if FingerprintMol
        # returns a pair of arrays rather than a flat sequence, the row is not flat. Confirm.
        estate = list(Fingerprinter.FingerprintMol(mol))

        # 4) Topological descriptors
        # NOTE(review): confirm CalcBalabanJ / CalcWienerIndex exist on rdMolDescriptors.
        balaban = [rdMolDescriptors.CalcBalabanJ(mol)]
        wiener = [rdMolDescriptors.CalcWienerIndex(mol)]

        # 5) Morgan + MACCS, expanded bit by bit from their bit strings
        morgan_bits = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        morgan = [int(b) for b in morgan_bits.ToBitString()]
        maccs_bits = MACCSkeys.GenMACCSKeys(mol)
        maccs = [int(b) for b in maccs_bits.ToBitString()]

        # Concatenate everything into a single row
        return basic + vsa + estate + balaban + wiener + morgan + maccs

    except Exception as exc:
        # The SMILES is left out of the message so repeated identical failures
        # collapse into a single warning under the default warning filter.
        warnings.warn(
            f"get_molecule_descriptors failed ({type(exc).__name__}: {exc}); "
            f"returning {fallback_len} zeros",
            RuntimeWarning,
        )
        return [0] * fallback_len

print("get_molecule_descriptors 추출 중...")
# One descriptor row per SMILES, stacked into an array for each split.
train_desc = np.array([get_molecule_descriptors(smi) for smi in train['Canonical_Smiles']])
test_desc = np.array([get_molecule_descriptors(smi) for smi in test['Canonical_Smiles']])

get_molecule_descriptors 추출 중...


In [None]:
# Cache both descriptor matrices in the data directory.
for split_name, desc_matrix in (('train', train_desc), ('test', test_desc)):
    np.save(os.path.join(data_dir, f'{split_name}_desc.npy'), desc_matrix)

In [None]:
# Reload the cached descriptor matrices from the data directory.
train_desc, test_desc = (
    np.load(os.path.join(data_dir, f'{split_name}_desc.npy'))
    for split_name in ('train', 'test')
)

In [None]:
# --- ChemBERTa Embedding ---
tokenizer = AutoTokenizer.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')
model = AutoModel.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')
# Run on the device chosen in the configuration cell; it was previously unused,
# so the encoder always ran on the CPU.
model.to(device)
model.eval()

def compute_smiles_embedding(smiles_list, batch_size=32, max_length=512):
    """Embed SMILES strings with the module-level ChemBERTa `model`.

    Parameters
    ----------
    smiles_list : list of str
        SMILES strings to embed; consumed in slices of `batch_size`.
    batch_size : int, default 32
        Number of strings tokenized and encoded per forward pass.
    max_length : int, default 512
        Token sequences longer than this are truncated.
        NOTE(review): 512 is assumed to be within the checkpoint's position
        limit; confirm against ``model.config.max_position_embeddings``.

    Returns
    -------
    numpy.ndarray
        One row per input string: the hidden state at token position 0 of the
        final layer. An empty input yields an array with zero rows.
    """
    # Send the inputs to wherever the model currently lives, so this function
    # works whether or not the model has been moved to a GPU.
    model_device = next(model.parameters()).device
    embs = []
    with torch.no_grad():
        for i in range(0, len(smiles_list), batch_size):
            toks = tokenizer(
                smiles_list[i:i+batch_size],
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=max_length,
            )
            toks = {key: tensor.to(model_device) for key, tensor in toks.items()}
            out = model(**toks)
            embs.append(out.last_hidden_state[:, 0, :].cpu().numpy())
    if not embs:
        # np.vstack raises on an empty list; return an empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.vstack(embs)

# Collect the SMILES strings of both splits up front.
train_smiles, test_smiles = (
    frame['Canonical_Smiles'].tolist() for frame in (train, test)
)

# Embed the training split and cache the matrix.
train_emb = compute_smiles_embedding(train_smiles)
np.save(os.path.join(data_dir, 'train_emb.npy'), train_emb)
print('Train embeddings saved.')

# Same for the test split.
test_emb = compute_smiles_embedding(test_smiles)
np.save(os.path.join(data_dir, 'test_emb.npy'), test_emb)
print('Test embeddings saved.')


KeyboardInterrupt: 

In [None]:
# --- GNN Embedding ---
# Atom featurizer shared by every graph built below.
node_featurizer = CanonicalAtomFeaturizer()

def smiles_to_graph(smiles):
    """Return the graph `mol_to_bigraph` builds for `smiles`, or None if RDKit cannot parse it."""
    parsed = Chem.MolFromSmiles(smiles)
    return None if parsed is None else mol_to_bigraph(parsed, node_featurizer=node_featurizer)

print("GNN 임베딩 추출 중...")
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear."""

    def __init__(self, input_dim, hidden_dim=256, output_dim=128):
        super().__init__()
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass `x` through the layer stack and return the result."""
        projected = self.layers(x)
        return projected

class GINBackbone(nn.Module):
    """Stack of GINConv layers followed by sum pooling.

    One GINConv layer is built per entry of `hidden_dims`. Each wraps a
    Linear -> ReLU -> Linear block and uses the 'sum' aggregator.
    """

    def __init__(self, input_dim, hidden_dims):
        super().__init__()
        # Consecutive (in, out) widths: input_dim -> hidden_dims[0] -> hidden_dims[1] -> ...
        widths = [input_dim, *hidden_dims]
        self.layers = nn.ModuleList()
        for in_dim, out_dim in zip(widths[:-1], widths[1:]):
            update_fn = nn.Sequential(
                nn.Linear(in_dim, out_dim),
                nn.ReLU(),
                nn.Linear(out_dim, out_dim),
            )
            self.layers.append(GINConv(update_fn, 'sum'))
        self.pool = SumPooling()

    def forward(self, g, feats):
        """Run `feats` through every conv layer on graph `g`, then pool."""
        hidden = feats
        for conv in self.layers:
            hidden = conv(g, hidden)
        return self.pool(g, hidden)

# No training step for these networks appears in this notebook, so the features
# they produce depend entirely on the random weight initialisation. Seed PyTorch
# first so the embeddings are reproducible across sessions and therefore
# consistent with any cached copies.
torch.manual_seed(42)

gnn_backbone = GINBackbone(input_dim=node_featurizer.feat_size(), hidden_dims=[128,128,128])
mlp_predictor = MLP(input_dim=128, hidden_dim=256, output_dim=128)
# The Sequential wrapper is used only to switch both modules to eval mode together.
model_gnn = nn.Sequential(gnn_backbone, mlp_predictor)
model_gnn.eval()

def get_gnn_features(df_smiles):
    """Return one 128-wide feature row per SMILES in `df_smiles`.

    Each SMILES is converted to a graph and passed through the module-level
    `gnn_backbone` and `mlp_predictor` without gradient tracking. A SMILES
    that cannot be converted to a graph contributes a row of zeros.
    """
    graphs = [smiles_to_graph(smi) for smi in df_smiles]
    rows = []
    for graph in graphs:
        if graph is None:
            rows.append(np.zeros(128))
            continue
        with torch.no_grad():
            node_feats = graph.ndata['h'].float()
            pooled = gnn_backbone(graph, node_feats)
            rows.append(mlp_predictor(pooled).numpy())
    return np.vstack(rows)

train_gnn = get_gnn_features(train['Canonical_Smiles'])
test_gnn = get_gnn_features(test['Canonical_Smiles'])

# Persist the matrices: the following cell reloads 'train_gnn.npy' and
# 'test_gnn.npy', and nothing else in the notebook writes those files.
np.save(os.path.join(data_dir, 'train_gnn.npy'), train_gnn)
np.save(os.path.join(data_dir, 'test_gnn.npy'), test_gnn)

In [None]:
# Reload the cached GNN feature matrices from the data directory.
train_gnn, test_gnn = (
    np.load(os.path.join(data_dir, f'{split_name}_gnn.npy'))
    for split_name in ('train', 'test')
)

In [None]:
# --- Scaling and Stacking ---
# Every scaler is fitted on the training split only and then applied to test.

# 1) Descriptors: MinMaxScaler (original rationale: there may be many outliers)
desc_scaler = MinMaxScaler()
train_desc = desc_scaler.fit_transform(train_desc)
test_desc = desc_scaler.transform(test_desc)

# 2) ChemBERTa embeddings: StandardScaler (original rationale: fairly uniform distribution)
emb_scaler = StandardScaler()
train_emb = emb_scaler.fit_transform(train_emb)
test_emb = emb_scaler.transform(test_emb)

# 3) GNN embeddings: StandardScaler as well
gnn_scaler = StandardScaler()
train_gnn = gnn_scaler.fit_transform(train_gnn)
test_gnn = gnn_scaler.transform(test_gnn)

X_train = np.hstack([train_desc, train_emb, train_gnn])
# Fix: the test matrix must use the *test* embeddings. The original stacked
# `train_emb` here, mixing training rows into the test features.
X_test = np.hstack([test_desc, test_emb, test_gnn])

# Train and test must end up with the same feature layout.
assert X_train.shape[1] == X_test.shape[1], "train/test feature width mismatch"

In [None]:
# # PCA 학습 및 train 임베딩 차원 축소
# pca = PCA(n_components=100)
# train_emb_pca = pca.fit_transform(train_emb)

# # PCA 모델 저장
# with open(os.path.join(data_dir, 'pca_model.pkl'), 'wb') as f:
#     pickle.dump(pca, f)

# # train 임베딩 PCA 결과 저장
# np.save(os.path.join(data_dir, 'train_emb_pca.npy'), train_emb_pca)
# print('PCA model and train_emb_pca saved.')

# # test 임베딩 로드 및 PCA 변환 후 저장
# test_emb = np.load(os.path.join(data_dir, 'test_emb.npy'))
# test_emb_pca = pca.transform(test_emb)
# np.save(os.path.join(data_dir, 'test_emb_pca.npy'), test_emb_pca)
# print('Test embeddings PCA transformed and saved.')

In [None]:
# -- Scaffold computation --
def get_scaffold(smiles):
    """Return the Murcko scaffold of `smiles` as a SMILES string, or "" if it cannot be parsed."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))
    return ""

train['scaffold'] = train['Canonical_Smiles'].map(get_scaffold)

In [None]:
# Cache the final stacked feature matrices.
for target_path, matrix in (
    (f"{data_dir}/X_train_final.npy", X_train),
    (f"{data_dir}/X_test_final.npy", X_test),
):
    np.save(target_path, matrix)

In [None]:
# Reload the cached final feature matrices.
X_train, X_test = (
    np.load(cached_path)
    for cached_path in (f"{data_dir}/X_train_final.npy", f"{data_dir}/X_test_final.npy")
)

In [None]:
# Regression target, and the key used to group rows in GroupKFold.
y_train = train['Inhibition'].to_numpy()
# groups = train['orig_id'].values
groups = train['scaffold'].to_numpy()

# --- Evaluation metrics ---
def nrmse(y_true, y_pred):
    """Root-mean-squared error divided by the range (max - min) of `y_true`.

    Both inputs are flattened to 1-D float arrays and must have equal length.
    When `y_true` has zero range the ratio is undefined; 0.0 is returned for a
    perfect fit and ``inf`` otherwise, instead of dividing by zero.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same length")
    rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
    span = float(y_true.max() - y_true.min())
    if span == 0:
        return 0.0 if rmse == 0 else float('inf')
    return rmse / span

def final_score(y_true, y_pred):
    """Return ``0.5 * (1 - min(NRMSE, 1)) + 0.5 * Pearson r``.

    Pearson correlation is undefined when either input is constant, for
    example when a heavily regularised model predicts a single value. The
    correlation term is then taken as 0 so the score stays finite rather
    than becoming NaN.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    A = nrmse(y_true, y_pred)
    if np.ptp(y_true) == 0 or np.ptp(y_pred) == 0:
        B = 0.0
    else:
        B, _ = pearsonr(y_true, y_pred)
        if not np.isfinite(B):
            B = 0.0
    return 0.5 * (1 - min(A, 1)) + 0.5 * float(B)

# --- Optuna + XGB ---
# Seeded TPE sampler with a median pruner; the study maximizes the objective's return value.
sampler = optuna.samplers.TPESampler(seed=42)
pruner = MedianPruner(n_warmup_steps=1, n_startup_trials=1)
study = optuna.create_study(sampler=sampler, pruner=pruner, direction='maximize')

def objective(trial):
    """Optuna objective: grouped 3-fold out-of-fold `final_score` of an XGBRegressor.

    Hyper-parameters are suggested by `trial`. The model is refit on each
    GroupKFold training split of the module-level `X_train` / `y_train` /
    `groups`, and the held-out predictions are scored together.
    """
    # The suggest_* calls are kept in this exact order.
    params = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 12),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        random_state=42,
        n_jobs=-1,
    )

    splitter = GroupKFold(n_splits=3)
    oof = np.zeros(len(y_train))
    for fit_rows, held_rows in splitter.split(X_train, y_train, groups):
        X_held, y_held = X_train[held_rows], y_train[held_rows]
        booster = XGBRegressor(**params)
        booster.fit(
            X_train[fit_rows], y_train[fit_rows],
            eval_set=[(X_held, y_held)],
            verbose=False
        )
        oof[held_rows] = booster.predict(X_held)

    return final_score(y_train, oof)

# Run the optimization (10 trials, or 10800 s, whichever limit is hit first)
study.optimize(objective, n_trials=10, timeout=10800)
best_trial = study.best_trial
print('Best params:', best_trial.params)
print('Best Score:', study.best_value)

In [None]:
# # --- Optuna + LGB ---
# sampler = optuna.samplers.TPESampler(seed=42)
# pruner = MedianPruner(n_startup_trials=1, n_warmup_steps=1)
# study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)

# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 50, 200),
#         'learning_rate': trial.suggest_float('lr', 1e-3, 5e-2, log=True),
#         'num_leaves': trial.suggest_int('leaves', 16, 64),
#         'min_child_samples': trial.suggest_int('min_child', 1, 10),
#         'min_split_gain': trial.suggest_float('gain', 0.0, 0.2),
#         'max_depth': -1
#     }
#     oof = np.zeros(len(y_train))
#     cv = GroupKFold(n_splits=3)
#     for tr_idx, vl_idx in cv.split(X_train, y_train, groups):
#         model = LGBMRegressor(**params, random_state=42, n_jobs=-1, verbose=-1)
#         model.fit(X_train[tr_idx], y_train[tr_idx])
#         oof[vl_idx] = model.predict(X_train[vl_idx])
#     return final_score(y_train, oof)

# study.optimize(objective, n_trials=10, timeout=10800)
# print('Best params:', study.best_trial.params)
# print('Best Score:', study.best_value)
# best_params = study.best_trial.params

In [None]:
# Persist the best trial's parameters as a pickle in the data directory.
save_path = os.path.join(data_dir, 'best_params.pkl')
best_params = study.best_trial.params

with open(save_path, 'wb') as params_file:
    params_file.write(pickle.dumps(best_params))

print(f"Best params saved to {save_path}")

Best params saved to /content/drive/MyDrive/DACON/CYP3A4inh/best_params.pkl


In [None]:
import pickle

load_path = os.path.join(data_dir, 'best_params.pkl')

# Unpickling can execute arbitrary code; only load files this notebook wrote itself.
with open(load_path, 'rb') as params_file:
    best_params = pickle.loads(params_file.read())

print("Loaded best params:", best_params)

Loaded best params: {'n_estimators': 106, 'lr': 0.04123206532618727, 'leaves': 51, 'min_child': 6, 'gain': 0.031203728088487304}


In [None]:
# param_fix = {
#     'lr': 'learning_rate',
#     'leaves': 'num_leaves',
#     'min_child': 'min_child_samples',
#     'gain': 'min_split_gain'
# }
# best_params_fixed = {param_fix.get(k, k): v for k, v in best_params.items()}

In [None]:
# from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
# from sklearn.model_selection import GroupKFold
# from lightgbm import LGBMRegressor
# from sklearn.metrics import mean_squared_error

# oof_preds = np.zeros(len(y_train))
# gkf = GroupKFold(n_splits=3)

# for fold, (tr_idx, val_idx) in enumerate(gkf.split(X_train, y_train, groups)):
#     X_tr, X_vl = X_train[tr_idx], X_train[val_idx]
#     y_tr, y_vl = y_train[tr_idx], y_train[val_idx]

#     base_learners = [
#         ('lgb', LGBMRegressor(**best_params_fixed, random_state=fold)),
#         ('gbr', GradientBoostingRegressor(n_estimators=200, random_state=fold))
#     ]

#     stack = StackingRegressor(
#         estimators=base_learners,
#         final_estimator=LGBMRegressor(n_estimators=200, random_state=fold)
#     )

#     stack.fit(X_tr, y_tr)
#     oof_preds[val_idx] = stack.predict(X_vl)

#     print(f"Fold {fold} RMSE: {np.sqrt(mean_squared_error(y_vl, oof_preds[val_idx])):.4f}")
#     print(f"Fold {fold} Score: {final_score(y_vl, oof_preds[val_idx]):.4f}")

In [None]:
# stack.fit(X_train, y_train)
# test_preds = stack.predict(X_test)

# submission = pd.DataFrame({'ID': test['ID'], 'Inhibition': test_preds})
# submission.to_csv(os.path.join(data_dir, 'submission_0714_04.csv'), index=False)
# print("Submission saved.")

### Ensemble 방식 적용

In [None]:
from sklearn import set_config
set_config(enable_metadata_routing=False)

import time
from tqdm.auto import tqdm
import json
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import BayesianRidge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# ── Base learner definitions ──
# The loaded `best_params` may use the Optuna trial nicknames ('lr', 'leaves',
# 'min_child', 'gain') rather than LightGBM's own argument names. Translate them
# here so the tuned values are actually applied. Keys that are already valid
# estimator arguments pass through unchanged.
_LGB_TRIAL_ALIASES = {
    'lr': 'learning_rate',
    'leaves': 'num_leaves',
    'min_child': 'min_child_samples',
    'gain': 'min_split_gain',
}
lgb_params = {_LGB_TRIAL_ALIASES.get(key, key): value for key, value in best_params.items()}

base_learners = [
    ('lgb', LGBMRegressor(**lgb_params, random_state=0)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=0)),
    ('gbr', GradientBoostingRegressor(n_estimators=200, random_state=0)),
    ('knn', KNeighborsRegressor(n_neighbors=10)),
    ('svr', SVR(kernel='rbf', C=1.0)),
    ('br', BayesianRidge())
]

# ── OOF feature builder (cross_val_predict) ──
def make_oof_features(estimators, X, y, groups):
    """Return an (n_samples, n_estimators) matrix of out-of-fold predictions.

    Column i holds the `cross_val_predict` output of the i-th ``(name, estimator)``
    pair under a 3-split GroupKFold on `groups`. The wall-clock time taken by
    each estimator is printed.
    """
    splitter = GroupKFold(n_splits=3)
    oof_feats = np.zeros((X.shape[0], len(estimators)))

    for col, (name, est) in enumerate(estimators):
        t0 = time.time()
        oof_feats[:, col] = cross_val_predict(
            est, X, y,
            groups=groups,
            cv=splitter,
            n_jobs=-1,
            method='predict',
        )
        print(f"  • [{name}] done in {time.time() - t0:.1f}s")
    return oof_feats

# ── Evaluation metric ──
def final_score(y_true, y_pred):
    """Return ``0.5 * (1 - min(NRMSE, 1)) + 0.5 * Pearson r``.

    NRMSE is the RMSE divided by the range of `y_true`. Two degenerate cases
    are handled so the score stays finite:

    * zero range in `y_true`: NRMSE is taken as 0 for a perfect fit, otherwise
      as the worst value (1 after clipping);
    * constant `y_true` or `y_pred`: the correlation is undefined and is taken
      as 0, where ``np.corrcoef`` would yield NaN.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same length")

    rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
    span = float(y_true.max() - y_true.min())
    if span == 0:
        A = 0.0 if rmse == 0 else 1.0
    else:
        A = rmse / span

    if np.ptp(y_true) == 0 or np.ptp(y_pred) == 0:
        B = 0.0
    else:
        B = float(np.corrcoef(y_true, y_pred)[0, 1])
        if not np.isfinite(B):
            B = 0.0
    return 0.5 * (1 - min(A, 1)) + 0.5 * B

# ── Optuna meta-learner tuning ──
# Out-of-fold predictions of every base learner are the meta-learner's inputs.
X_oof = make_oof_features(base_learners, X_train, y_train, groups)

def meta_objective(trial):
    """Optuna objective: grouped 3-fold `final_score` of an ElasticNet fitted on `X_oof`.

    The original author also tried wrapping the ElasticNet in a log1p/expm1
    TransformedTargetRegressor and noted that it scored worse, concluding the
    target is not skewed.
    """
    alpha = trial.suggest_float('meta_alpha', 1e-4, 10.0, log=True)
    l1_ratio = trial.suggest_float('meta_l1_ratio', 0.0, 1.0)
    meta = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=10_000, random_state=42)

    # Validate the meta-learner with out-of-fold CV
    oof_meta = np.zeros(len(y_train))
    for fit_rows, held_rows in GroupKFold(n_splits=3).split(X_oof, y_train, groups):
        fitted = meta.fit(X_oof[fit_rows], y_train[fit_rows])
        oof_meta[held_rows] = fitted.predict(X_oof[held_rows])
    return final_score(y_train, oof_meta)

# Short seeded search over the meta-learner's hyper-parameters.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=42), direction='maximize')
study.optimize(meta_objective, n_trials=5, timeout=3600, show_progress_bar=True)

# ── Save results ──
best_meta = study.best_trial.params
to_save = {
    'base_best_params': best_params,
    **{key: best_meta[key] for key in ('meta_alpha', 'meta_l1_ratio')},
}
with open('ensemble_params.json', 'w') as params_file:
    params_file.write(json.dumps(to_save, indent=2))

  • [lgb] done in 47.8s




  • [rf] done in 941.6s
  • [gbr] done in 1113.1s
  • [knn] done in 24.2s
  • [svr] done in 714.4s


[I 2025-07-15 09:36:49,049] A new study created in memory with name: no-name-0024be1d-1d9d-4e6d-8b3f-ce0b55f3e2e1


  • [br] done in 456.2s


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-07-15 09:36:49,163] Trial 0 finished with value: 0.5681665016297952 and parameters: {'meta_alpha': 0.0074593432857265485, 'meta_l1_ratio': 0.9507143064099162}. Best is trial 0 with value: 0.5681665016297952.
[I 2025-07-15 09:36:49,213] Trial 1 finished with value: 0.5725889198198411 and parameters: {'meta_alpha': 0.4570563099801455, 'meta_l1_ratio': 0.5986584841970366}. Best is trial 1 with value: 0.5725889198198411.
[I 2025-07-15 09:36:49,254] Trial 2 finished with value: 0.5679017794474708 and parameters: {'meta_alpha': 0.0006026889128682511, 'meta_l1_ratio': 0.15599452033620265}. Best is trial 1 with value: 0.5725889198198411.
[I 2025-07-15 09:36:49,289] Trial 3 finished with value: 0.5679040634746204 and parameters: {'meta_alpha': 0.00019517224641449495, 'meta_l1_ratio': 0.8661761457749352}. Best is trial 1 with value: 0.5725889198198411.
[I 2025-07-15 09:36:49,327] Trial 4 finished with value: 0.5704055285738808 and parameters: {'meta_alpha': 0.10129197956845731, 'meta_l1_

In [None]:
def meta_objective(trial):
    """Optuna objective: grouped 3-fold `final_score` of an ElasticNet fitted on `X_oof`.

    `trial` supplies `meta_alpha` (log-uniform in [1e-4, 10]) and
    `meta_l1_ratio` (uniform in [0, 1]).
    """
    alpha = trial.suggest_float('meta_alpha', 1e-4, 10.0, log=True)
    l1_ratio = trial.suggest_float('meta_l1_ratio', 0.0, 1.0)
    meta = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=10_000, random_state=42)

    # Validate the meta-learner with out-of-fold CV
    oof_meta = np.zeros(len(y_train))
    for fit_rows, held_rows in GroupKFold(n_splits=3).split(X_oof, y_train, groups):
        fitted = meta.fit(X_oof[fit_rows], y_train[fit_rows])
        oof_meta[held_rows] = fitted.predict(X_oof[held_rows])
    return final_score(y_train, oof_meta)

# Longer seeded search: up to 200 trials or 10800 s.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=42), direction='maximize')
study.optimize(meta_objective, n_trials=200, timeout=10800, show_progress_bar=True)

# ── Save results ──
best_meta = study.best_trial.params
to_save = {
    'base_best_params': best_params,
    **{key: best_meta[key] for key in ('meta_alpha', 'meta_l1_ratio')},
}
with open('ensemble_params.json', 'w') as params_file:
    params_file.write(json.dumps(to_save, indent=2))

[I 2025-07-15 09:39:45,306] A new study created in memory with name: no-name-271382d2-2fa1-43fd-bbd8-e2d3699749b6


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-07-15 09:39:45,381] Trial 0 finished with value: 0.5887278474761072 and parameters: {'meta_alpha': 0.0074593432857265485, 'meta_l1_ratio': 0.9507143064099162}. Best is trial 0 with value: 0.5887278474761072.
[I 2025-07-15 09:39:45,432] Trial 1 finished with value: 0.5888631333058753 and parameters: {'meta_alpha': 0.4570563099801455, 'meta_l1_ratio': 0.5986584841970366}. Best is trial 1 with value: 0.5888631333058753.
[I 2025-07-15 09:39:45,483] Trial 2 finished with value: 0.588727002602847 and parameters: {'meta_alpha': 0.0006026889128682511, 'meta_l1_ratio': 0.15599452033620265}. Best is trial 1 with value: 0.5888631333058753.
[I 2025-07-15 09:39:45,525] Trial 3 finished with value: 0.5887267363214199 and parameters: {'meta_alpha': 0.00019517224641449495, 'meta_l1_ratio': 0.8661761457749352}. Best is trial 1 with value: 0.5888631333058753.
[I 2025-07-15 09:39:45,567] Trial 4 finished with value: 0.5887526824257934 and parameters: {'meta_alpha': 0.10129197956845731, 'meta_l1_r

In [None]:
# Read the persisted ensemble settings back and show them.
with open('ensemble_params.json', 'r') as params_file:
    saved = json.loads(params_file.read())
print(saved)

{'base_best_params': {'n_estimators': 106, 'lr': 0.04123206532618727, 'leaves': 51, 'min_child': 6, 'gain': 0.031203728088487304}, 'meta_alpha': 9.968535139653175, 'meta_l1_ratio': 0.004616628626512666}


In [None]:
# 1) Load saved params
with open('ensemble_params.json', 'r') as f:
    saved = json.load(f)

# The saved LightGBM parameters use the Optuna trial nicknames. Translate each
# one to the estimator argument it was tuned for. 'gain' is the nickname of
# `min_split_gain` (see the LGB objective and `param_fix` mapping kept in the
# commented-out cells), so it is mapped rather than discarded; dropping it
# would silently lose a tuned value.
raw_params = saved['base_best_params']
param_mapping = {
    'leaves': 'num_leaves',
    'lr': 'learning_rate',
    'min_child': 'min_child_samples',
    'gain': 'min_split_gain',
}
corrected_params = {param_mapping.get(key, key): value for key, value in raw_params.items()}


# 2) Reconstruct base learners
# Same six estimators as before; LightGBM now receives the corrected parameter names.
base_learners = [
    ('lgb', LGBMRegressor(**corrected_params, random_state=0)),
    ('rf',  RandomForestRegressor(random_state=0, n_estimators=100)),
    ('gbr', GradientBoostingRegressor(random_state=0, n_estimators=200)),
    ('knn', KNeighborsRegressor(n_neighbors=10)),
    ('svr', SVR(C=1.0, kernel='rbf')),
    ('br',  BayesianRidge()),
]

# 3) Reconstruct meta-learner from the tuned alpha / l1_ratio
meta = ElasticNet(alpha=saved['meta_alpha'], l1_ratio=saved['meta_l1_ratio'],
                  max_iter=10000, random_state=42)

# 4) Fit base learners on full training data (these fitted models predict the test set later)
for _, est in base_learners:
    est.fit(X_train, y_train)

# 5) Generate OOF-features (reuse the function from Cell 1)
X_oof = make_oof_features(base_learners, X_train, y_train, groups)

# 6) Final OOF evaluation of the meta-learner.
# The meta-learner is scored on rows it was not fitted on (grouped 3-fold CV),
# so the printed number is an out-of-fold score. Scoring predictions from a
# model fitted on the same rows would report an in-sample fit under the same
# label. cross_val_predict works on clones, so `meta` itself is still unfitted here.
oof_preds = cross_val_predict(
    meta, X_oof, y_train, groups=groups, cv=GroupKFold(n_splits=3)
)
print(f"Final OOF Score: {final_score(y_train, oof_preds):.4f}")

# 7) Train the meta-learner on all OOF-features for test-time prediction
meta.fit(X_oof, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.573436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 72286
[LightGBM] [Info] Number of data points in the train set: 5043, number of used features: 4485
[LightGBM] [Info] Start training from score 33.221831
  • [lgb] done in 48.1s
  • [rf] done in 529.8s
  • [gbr] done in 538.2s
  • [knn] done in 6.9s
  • [svr] done in 191.4s
  • [br] done in 143.3s
Final OOF Score: 0.5904


In [None]:
meta

In [None]:
# One column of test predictions per fitted base learner, in `base_learners` order.
test_columns = []
for _, fitted in base_learners:
    test_columns.append(fitted.predict(X_test))
X_test_oof = np.column_stack(test_columns)

# 3. Fit the meta model on the OOF features (it must be trained on OOF-based inputs)
meta.fit(X_oof, y_train)

# 4. Predict test
test_preds = meta.predict(X_test_oof)

# Write the submission file with the test IDs and predicted values.
submission = pd.DataFrame({'ID': test['ID'], 'Inhibition': test_preds})
submission_path = os.path.join(data_dir, 'submission_0715_03.csv')
submission.to_csv(submission_path, index=False)
print("Submission saved.")



Submission saved.


In [None]:
test_preds

array([40.90410818, 35.3782997 , 31.32129982, 38.20618554, 37.63725646,
       26.22450996, 33.25893206, 24.0843206 , 34.96050264, 19.6827928 ,
       18.47347827, 25.36793227, 26.04433582, 23.4568672 , 25.4774488 ,
       20.41936059, 26.16882413, 51.5264106 , 43.4185765 , 28.60267098,
       33.08508428, 33.58853819, 38.13592569, 35.53305165, 33.93249271,
       40.21294551, 32.02588557, 32.04831481, 28.75610032, 28.72729213,
       42.82861752, 30.0668528 , 25.43542533, 42.19277086, 28.1090521 ,
       23.53799586, 58.82139459, 25.31951967, 25.36711289, 27.57170033,
       38.59130293, 41.0815515 , 23.42359877, 33.60359157, 31.12940712,
       47.48002059, 46.08671849, 51.98373976, 34.45073204, 30.76498685,
       36.6486429 , 54.83776136, 45.21187885, 25.92299236, 28.48678542,
       60.96293502, 30.01805172, 19.60141438, 43.24069935, 34.65434862,
       15.15459514, 29.88779834, 23.43479511, 31.6730739 , 38.28251407,
       24.84495354, 37.83541608, 37.94069079, 34.6876126 , 24.62