In [None]:
#0.6836

import numpy as np
import pandas as pd
import xgboost as xgb
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, rdFingerprintGenerator
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

In [2]:
def extract_rdkit_features(smiles):
    """Compute a fixed-length list of RDKit descriptors for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    list
        One value per entry of ``descriptor_fns`` (13 values), in that order.
        If the SMILES cannot be parsed, or any descriptor raises, a list of
        zeros of the *same* length is returned so that per-molecule vectors
        can always be stacked into a rectangular 2-D array.
    """
    # Single source of truth for the feature layout. The fallback length is
    # derived from this tuple so it cannot drift from the success path
    # (previously the fallback had 11 entries while 13 descriptors were computed).
    descriptor_fns = (
        Descriptors.MolWt,                # molecular weight
        Descriptors.MolLogP,              # hydrophobicity (logP)
        Descriptors.TPSA,                 # topological polar surface area
        Descriptors.NumHDonors,           # number of hydrogen-bond donors
        Descriptors.NumHAcceptors,        # number of hydrogen-bond acceptors
        Descriptors.NumRotatableBonds,    # number of rotatable bonds
        Descriptors.FractionCSP3,         # fraction of sp3 carbons
        Descriptors.RingCount,            # number of rings
        Descriptors.HeavyAtomCount,       # number of non-hydrogen atoms
        Descriptors.MolMR,                # molar refractivity
        Descriptors.NumValenceElectrons,  # number of valence electrons
        Descriptors.NumHeteroatoms,       # number of heteroatoms (atoms other than C and H)
        # NOTE(review): may yield NaN for some molecules — confirm downstream handling.
        Descriptors.MaxPartialCharge,     # maximum partial charge
    )
    fallback = [0] * len(descriptor_fns)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return fallback
    try:
        return [fn(mol) for fn in descriptor_fns]
    except Exception:
        # Deliberate best-effort: a molecule whose descriptors cannot be
        # computed gets the all-zero vector. `Exception` (not a bare except)
        # so that KeyboardInterrupt / SystemExit still propagate.
        return fallback

# Shared Morgan fingerprint generator (radius 2, 2048 bits), created once and
# used as the default generator of morgan_fp().
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

def morgan_fp(smiles, generator=morgan_generator):
    """Return the Morgan bit fingerprint of a SMILES string as a 1-D numpy array.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    generator : RDKit fingerprint generator, optional
        Generator used to compute the fingerprint; defaults to the
        module-level ``morgan_generator``.

    Returns
    -------
    numpy.ndarray
        The fingerprint bits as an array. If the SMILES cannot be parsed, an
        all-zero array whose length equals the generator's configured
        fingerprint size is returned instead.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # The fingerprint size is stored on the generator's options object;
        # the generator itself does not expose GetNumBits() (that method
        # belongs to the bit-vector it produces).
        return np.zeros(generator.GetOptions().fpSize)
    return np.array(generator.GetFingerprint(mol))

In [3]:
# Load the training set and keep only the SMILES input and the regression target.
# .copy() makes `train` an independent frame, so the column assignments below
# do not operate on a slice of the original frame (SettingWithCopyWarning).
train = pd.read_csv("data/train.csv")
train = train[['Canonical_Smiles', 'Inhibition']].copy()

# One feature vector per molecule: RDKit descriptors and Morgan fingerprint bits.
train['RDKit_Features'] = train['Canonical_Smiles'].apply(extract_rdkit_features)
train['Morgan_FP'] = train['Canonical_Smiles'].apply(morgan_fp)

# Stack the per-row vectors into 2-D arrays and concatenate them column-wise:
# descriptor columns first, fingerprint columns after.
rdkit_feat = np.stack(train['RDKit_Features'].values)
morgan_feat = np.stack(train['Morgan_FP'].values)
train_x = np.hstack((rdkit_feat, morgan_feat))
train_y = train['Inhibition'].astype(float).values

In [4]:
# Learn per-column scaling statistics from the training features, then apply
# them. The fitted `scaler` is kept so the test features can be transformed
# with the same statistics.
scaler = StandardScaler()
scaler.fit(train_x)
train_x_scaled = scaler.transform(train_x)

In [5]:
# Hyper-parameter search space: 3 * 3 * 3 * 2 * 2 * 3 * 3 = 972 combinations.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[0, 0.1, 1],
)

# Base regressor: fixed seed, silent, configured for GPU 0.
# NOTE(review): 'gpu_hist', `gpu_id` and `predictor` are reported as deprecated
# in XGBoost >= 2.0 in favour of device="cuda" with tree_method="hist" —
# confirm against the installed XGBoost version.
xgb_model = xgb.XGBRegressor(
    random_state=42,
    verbosity=0,
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
)

In [6]:
# Exhaustive 3-fold grid search over `param_grid`, scored by negated RMSE
# (scikit-learn maximises scores, hence the "neg_" metric).
grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
)
grid.fit(train_x_scaled, train_y)

# Keep the estimator built with the best-scoring parameter combination.
best_model = grid.best_estimator_
print("Best parameters found: ", grid.best_params_)

Fitting 3 folds for each of 972 candidates, totalling 2916 fits
Best parameters found:  {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.8}


In [7]:
# Re-evaluate the selected model with a shuffled, seeded 5-fold split.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=best_model,
    X=train_x_scaled,
    y=train_y,
    scoring='neg_root_mean_squared_error',
    cv=cv,
)
# Scores are negated RMSE values; flip the sign of their mean to report RMSE.
print("Cross-validated RMSE: ", -scores.mean())

Cross-validated RMSE:  23.86899250107938


In [8]:
# Load the test set and keep only the identifier and the SMILES input.
# .copy() makes `test` an independent frame, so the column assignments below
# do not operate on a slice of the original frame (SettingWithCopyWarning).
test = pd.read_csv("data/test.csv")
test = test[['ID', 'Canonical_Smiles']].copy()
test['RDKit_Features'] = test['Canonical_Smiles'].apply(extract_rdkit_features)
test['MorganFP'] = test['Canonical_Smiles'].apply(morgan_fp)

# Same column layout as the training matrix (descriptors, then fingerprint
# bits), transformed with the scaler that was fitted on the training features.
test_rdkit_feat = np.stack(test['RDKit_Features'].values)
test_morgan_feat = np.stack(test['MorganFP'].values)
test_x = np.hstack([test_rdkit_feat, test_morgan_feat])
test_x_scaled = scaler.transform(test_x)

In [9]:
# Predict the target for every test molecule with the tuned model, using the
# test features transformed by the scaler fitted on the training features.
test_pred = best_model.predict(test_x_scaled)

In [10]:
# Fill the sample-submission template with the predictions and write it out.
# NOTE(review): predictions are assigned by position — presumably the template
# rows are in the same order as data/test.csv; confirm.
output_path = "enhance_v2_submit.csv"
submit = pd.read_csv("data/sample_submission.csv").assign(Inhibition=test_pred)
submit.to_csv(output_path, index=False)
print(f"Submission saved to '{output_path}'")

Submission saved to 'enhance_v2_submit.csv'
