### Import

In [17]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Descriptors, Lipinski
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
import optuna
import os
import random
from sklearn.model_selection import train_test_split

In [19]:
CFG = {
    'NBITS': 2048,
    'SEED': 42,
    'N_SPLITS': 5,
    'N_TRIALS': 50 
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(CFG['SEED'])

# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

In [7]:
def IC50_to_pIC50(ic50_nM):
    ic50_nM = np.clip(ic50_nM, 1e-10, None)
    return 9 - np.log10(ic50_nM)

In [8]:
def pIC50_to_IC50(pIC50):
    return 10 ** (9 - pIC50)

In [21]:
def get_score(y_true_ic50, y_pred_ic50, y_true_pic50, y_pred_pic50):
    rmse = mean_squared_error(y_true_ic50, y_pred_ic50) ** 0.5
    nrmse = rmse / (np.max(y_true_ic50) - np.min(y_true_ic50))
    A = 1 - min(nrmse, 1)
    B = r2_score(y_true_pic50, y_pred_pic50)
    score = 0.4 * A + 0.6 * B
    return score

### Data loading & Molcular descriptor

In [9]:
IC50_dataset = pd.read_csv("C:/Users/user/Desktop/dacon_drug_development/IC50_dataset.csv")
IC50_dataset['Fingerprint'] = IC50_dataset['smiles'].apply(smiles_to_fingerprint)



In [10]:
def descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    f = {}
    # 1. 화학기 존재 여부
    # Tetrazole ring
    tetrazole_smarts = ['c1nn[n+](n1)[O-]', 'c1[nH]nnn1']
    f['has_tetrazole'] = any(mol.HasSubstructMatch(Chem.MolFromSmarts(s)) for s in tetrazole_smarts)
    # Triazole ring
    f['has_triazole'] = mol.HasSubstructMatch(Chem.MolFromSmarts('c1nnc(n1)'))
    # Sulfoxide group
    f['has_sulfoxide'] = mol.HasSubstructMatch(Chem.MolFromSmarts('S(=O)(C)'))
    # Amide carbonyl
    amide_smarts = ['C(=O)N', 'NC(=O)']
    f['has_amide'] = any(mol.HasSubstructMatch(Chem.MolFromSmarts(s)) for s in amide_smarts)
    # Sulfonamide group
    f['has_sulfonamide'] = mol.HasSubstructMatch(Chem.MolFromSmarts('S(=O)(=O)N'))
    
    # 2. 분자량
    mw = Descriptors.MolWt(mol)
    f['mw'] = mw
    f['mw_300_500'] = 300<=mw<=500
    
    # 3. logP
    logp = Descriptors.MolLogP(mol)
    f['logP'] = logp
    f['logP_2_4'] = 2<=logp<=4
    
    # 4. TPSA
    tpsa = Descriptors.TPSA(mol)
    f['TPSA'] = tpsa
    f['TPSA_60_120'] = 60<=tpsa<=120
    
    # 5. rotatable bonds (IC50 값 높임_bad)
    rot = Lipinski.NumRotatableBonds(mol)
    f['rotatable'] = rot
    f['rot_gt7'] = rot>7

    # 6. ring (+aromatic)
    # ring 2개 이상
    num_rings = mol.GetRingInfo().NumRings()
    f['num_rings'] = num_rings
    f['ring_count_ge_2'] = num_rings >= 2
    # aromatic ring 존재 여부
    ssr = Chem.GetSymmSSSR(mol)
    aromatic_ring = any(all(mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring) for ring in ssr)
    f['has_aromatic_ring'] = aromatic_ring
    # Ring 2개 이상 + 그 중 적어도 하나 aromatic인가?
    f['ring_ge2_and_aromatic'] = f['ring_count_ge_2'] and f['has_aromatic_ring']

    return f

In [13]:
features_df = pd.DataFrame([descriptors(s) for s in IC50_dataset['smiles']])
final_dataset = pd.concat([IC50_dataset, features_df], axis=1)

### Train/Validation data split

In [14]:
# XGBoost에 넣으려면, Fingerprint를 실제 숫자 벡터 컬럼으로 풀어줘야 함.
# 예를 들어, [0, 1, 0, 1] → fp_0=0, fp_1=1, fp_2=0, fp_3=1 이렇게 컬럼별 숫자로 변환.

# 1. Fingerprint 컬럼을 여러 개 숫자 컬럼으로 변환
fp_df = pd.DataFrame(final_dataset['Fingerprint'].tolist(), index=final_dataset.index)

# 2. 컬럼 이름 부여 (fp_0, fp_1, ...)
fp_df.columns = [f'fp_{i}' for i in range(fp_df.shape[1])]

# 3. 기존 데이터에서 Fingerprint 컬럼 제거 후 합치기
final_dataset = pd.concat(
    [final_dataset.drop(columns=['Fingerprint']), fp_df],
    axis=1
)

In [43]:
# Define X (features) and y (target)
X = final_dataset.drop(['smiles', 'IC50_nM', 'pIC50'], axis=1)
y = final_dataset['pIC50']

In [44]:
# Split the data into training and testing sets
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Building LGBM model

In [47]:
def objective(trial, X, y):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbose': -1,
        'n_jobs': -1,
        'boosting_type': 'gbdt',
        'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'seed': 42
    }

    kf = KFold(n_splits=CFG['N_SPLITS'], shuffle=True, random_state=CFG['SEED'])
    oof_preds = np.zeros(len(X_trainval))

    for train_idx, val_idx in kf.split(X_trainval, y_trainval):
        X_train, X_val = X_trainval.iloc[train_idx], X_trainval.iloc[val_idx]
        y_train, y_val = y_trainval.iloc[train_idx], y_trainval.iloc[val_idx]
        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_train, y_train, 
            eval_set=[(X_val, y_val)],
            eval_metric='rmse', 
            callbacks=[lgb.early_stopping(100, verbose=False)]
        )
        oof_preds[val_idx] = model.predict(X_val)

    y_ic50_true = pIC50_to_IC50(y_trainval)
    oof_ic50_preds = pIC50_to_IC50(oof_preds)
    score = get_score(y_ic50_true, oof_ic50_preds, y_trainval, oof_preds)
    return score

In [48]:
study = optuna.create_study(direction='maximize', study_name='lgbm_tuning')
study.optimize(lambda trial: objective(trial, X_trainval, y_trainval), n_trials=CFG['N_TRIALS'])


[I 2025-08-09 03:07:59,973] A new study created in memory with name: lgbm_tuning
[I 2025-08-09 03:08:02,007] Trial 0 finished with value: 0.5616939499761449 and parameters: {'learning_rate': 0.05602081339073197, 'num_leaves': 93, 'max_depth': 9, 'feature_fraction': 0.9534772343715444, 'bagging_fraction': 0.8633877911794047, 'bagging_freq': 2, 'min_child_samples': 23}. Best is trial 0 with value: 0.5616939499761449.
[I 2025-08-09 03:08:03,459] Trial 1 finished with value: 0.5669105068069786 and parameters: {'learning_rate': 0.09299335492050682, 'num_leaves': 49, 'max_depth': 8, 'feature_fraction': 0.954420391598969, 'bagging_fraction': 0.9015226149181798, 'bagging_freq': 6, 'min_child_samples': 23}. Best is trial 1 with value: 0.5669105068069786.
[I 2025-08-09 03:08:04,844] Trial 2 finished with value: 0.5696399812866394 and parameters: {'learning_rate': 0.05366852254565003, 'num_leaves': 43, 'max_depth': 4, 'feature_fraction': 0.7120952397599377, 'bagging_fraction': 0.9808223536588276,

In [52]:
best_params = study.best_params
print("Best Parameters:", best_params)

Best Parameters: {'learning_rate': 0.07918643767549895, 'num_leaves': 45, 'max_depth': 4, 'feature_fraction': 0.7041345371704105, 'bagging_fraction': 0.7933775781398964, 'bagging_freq': 6, 'min_child_samples': 12}


In [53]:
best_params.update({
    'objective': 'regression',
    'metric': 'rmse',
    'verbose': -1,
    'n_jobs': -1,
    'seed': CFG['SEED'],
    'boosting_type': 'gbdt',
    'n_estimators': 2000
})

In [51]:
final_model = lgb.LGBMRegressor(**best_params)
final_model.fit(X_trainval, y_trainval)

0,1,2
,boosting_type,'gbdt'
,num_leaves,45
,max_depth,4
,learning_rate,0.07918643767549895
,n_estimators,2000
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


### Test

In [54]:
test = pd.read_csv("C:/Users/user/Desktop/dacon_drug_development/dataset/test.csv")
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)
test = test[test['Fingerprint'].notnull()]

test_features_df = pd.DataFrame([descriptors(s) for s in test['Smiles']])
test_final_dataset = pd.concat([test, test_features_df], axis=1)
fp_df = pd.DataFrame(test_final_dataset['Fingerprint'].tolist(), index=test_final_dataset.index)
fp_df.columns = [f'fp_{i}' for i in range(fp_df.shape[1])]
test_final_dataset = pd.concat(
    [test_final_dataset.drop(columns=['Fingerprint']), fp_df],
    axis=1
)

X_test = test_final_dataset.drop(['ID', 'Smiles'], axis=1)



In [55]:
test['pIC50_pred'] = final_model.predict(X_test)
test['ASK1_IC50_nM'] = pIC50_to_IC50(test['pIC50_pred'])

### Submission

In [56]:
submission = pd.read_csv('C:/Users/user/Desktop/dacon_drug_development/sample_submission.csv') 
submission['ASK1_IC50_nM'] = test['ASK1_IC50_nM']

In [57]:
submission

Unnamed: 0,ID,ASK1_IC50_nM
0,TEST_000,4.241814
1,TEST_001,1.775264
2,TEST_002,3.941394
3,TEST_003,1.735691
4,TEST_004,6.991902
...,...,...
122,TEST_122,7.678414
123,TEST_123,23.460561
124,TEST_124,13.079676
125,TEST_125,15.508065


In [58]:
submission.to_csv("C:/Users/user/Desktop/dacon_drug_development/LightGBM_2_submission.csv", index=False)