In [2]:
import pandas as pd
df = pd.read_csv("/data2/project/2025summer/jjh0709/git/Jump-AI-2025/data/merged_pubchem_chembl.csv")

In [9]:
#!/usr/bin/env python3
"""
빠른 상관관계 개선 전략 (기존 코드에 추가)
- 타겟 변환 최적화
- 아웃라이어 제거
- 특성 스케일링 개선
- 앙상블 가중치 최적화
"""

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler, PowerTransformer
from rdkit import Chem
from rdkit.Chem import Descriptors
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import pearsonr
from scipy.optimize import minimize
import warnings
warnings.filterwarnings('ignore')

def compute_safe_descriptors(smiles):
    """기존과 동일한 안전한 기술자 계산"""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [np.nan] * 10
    
    try:
        return [
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Descriptors.TPSA(mol),
            Descriptors.NumRotatableBonds(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.NumAromaticRings(mol),
            Descriptors.RingCount(mol),
            Descriptors.NumHeteroatoms(mol),
            Descriptors.BertzCT(mol)
        ]
    except:
        return [np.nan] * 10

def competition_score(y_true, y_pred):
    """대회 평가 점수"""
    try:
        ic50_true = 10 ** (y_true + 6)
        ic50_pred = 10 ** (y_pred + 6)
        
        rmse = np.sqrt(mean_squared_error(ic50_true, ic50_pred))
        A = min(rmse / np.mean(ic50_true), 1)
        
        pic50_true = -y_true
        pic50_pred = -y_pred
        correlation, _ = pearsonr(pic50_true, pic50_pred)
        B = correlation ** 2
        
        score = 0.4 * (1 - A) + 0.6 * B
        return score, A, B
    except:
        return 0.0, 1.0, 0.0

def remove_outliers(X, y, method='iqr', factor=1.5):
    """아웃라이어 제거"""
    if method == 'iqr':
        Q1 = y.quantile(0.25)
        Q3 = y.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR
        mask = (y >= lower_bound) & (y <= upper_bound)
    else:  # zscore
        z_scores = np.abs((y - y.mean()) / y.std())
        mask = z_scores < factor
    
    return X[mask], y[mask]

def optimize_ensemble_weights(models, X_val, y_val):
    """상관관계 최적화를 위한 앙상블 가중치 계산"""
    
    # 각 모델의 예측값
    predictions = []
    for model in models:
        pred = model.predict(X_val)
        predictions.append(pred)
    
    predictions = np.array(predictions).T  # (n_samples, n_models)
    
    def objective(weights):
        # 가중 평균 예측
        ensemble_pred = np.average(predictions, axis=1, weights=weights)
        
        # pIC50 상관관계 (B 점수)
        pic50_true = -y_val
        pic50_pred = -ensemble_pred
        
        try:
            correlation, _ = pearsonr(pic50_true, pic50_pred)
            return -(correlation ** 2)  # 최대화를 위해 음수
        except:
            return -0.0
    
    # 제약조건: 가중치 합 = 1, 모든 가중치 >= 0
    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
    bounds = [(0, 1) for _ in range(len(models))]
    
    # 초기값: 균등 가중치
    initial_weights = np.ones(len(models)) / len(models)
    
    result = minimize(objective, initial_weights, method='SLSQP', 
                     bounds=bounds, constraints=constraints)
    
    return result.x if result.success else initial_weights

def create_polynomial_features(X, degree=2, interaction_only=True):
    """다항식 특성 생성 (선택적)"""
    from sklearn.preprocessing import PolynomialFeatures
    
    poly = PolynomialFeatures(degree=degree, interaction_only=interaction_only, 
                             include_bias=False)
    X_poly = poly.fit_transform(X)
    
    return X_poly, poly

# 메인 코드
print("🚀 빠른 상관관계 개선 시작!")

# 1. 데이터 로드
df_train = pd.read_csv("/data2/project/2025summer/jjh0709/git/Jump-AI-2025/data/merged_pubchem_chembl.csv")
df_test = pd.read_csv("/data2/project/2025summer/jjh0709/git/Jump-AI-2025/data/test.csv")

# SMILES 컬럼 찾기
smiles_col = None
for col in df_train.columns:
    if 'smiles' in col.lower():
        smiles_col = col
        break

# 2. 기술자 계산
print("🧪 분자 기술자 계산...")
descriptor_results = []
for smiles in df_train[smiles_col]:
    result = compute_safe_descriptors(smiles)
    descriptor_results.append(result)

descriptor_df = pd.DataFrame(descriptor_results, columns=[
    'MolWt', 'LogP', 'TPSA', 'NumRotatableBonds', 'NumHAcceptors',
    'NumHDonors', 'NumAromaticRings', 'RingCount', 'NumHeteroatoms', 'BertzCT'
])

# 출처 정보
if 'source' in df_train.columns:
    df_train = pd.get_dummies(df_train, columns=["source"])
else:
    df_train['source_chembl'] = 1
    df_train['source_pubchem'] = 0

# 3. 데이터 결합
df_combined = pd.concat([df_train.reset_index(drop=True), descriptor_df], axis=1)

# 4. 특성 선택 및 데이터 준비
features = [
    'MolWt', 'LogP', 'TPSA', 'NumRotatableBonds', 'NumHAcceptors',
    'NumHDonors', 'NumAromaticRings', 'RingCount', 'NumHeteroatoms', 'BertzCT',
    'source_chembl', 'source_pubchem'
]

X = df_combined[features]
y = df_combined['log_IC50']

# NaN 제거
mask = ~(X.isnull().any(axis=1) | y.isnull())
X_clean = X[mask]
y_clean = y[mask]

print(f"데이터 크기: {len(X_clean)}")

# 5. 아웃라이어 제거
print("🧹 아웃라이어 제거...")
X_no_outliers, y_no_outliers = remove_outliers(X_clean, y_clean, method='iqr', factor=2.0)
print(f"아웃라이어 제거: {len(X_clean)} → {len(X_no_outliers)}")

# 6. 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(
    X_no_outliers, y_no_outliers, test_size=0.2, random_state=42
)

# 7. 고급 스케일링
print("📊 고급 데이터 전처리...")
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 8. 타겟 변환 (선택적)
print("🎯 타겟 변환 최적화...")
target_transformer = PowerTransformer(method='yeo-johnson')
y_train_transformed = target_transformer.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_val_transformed = target_transformer.transform(y_val.values.reshape(-1, 1)).ravel()

# 9. 다양한 모델 훈련
print("🤖 고성능 모델 훈련...")

models = {}

# Random Forest (상관관계 최적화)
models['rf'] = RandomForestRegressor(
    n_estimators=800,
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)

# XGBoost (상관관계 특화)
models['xgb'] = xgb.XGBRegressor(
    n_estimators=800,
    max_depth=10,
    learning_rate=0.05,
    min_child_weight=1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1
)

# LightGBM
models['lgb'] = lgb.LGBMRegressor(
    n_estimators=800,
    max_depth=10,
    learning_rate=0.05,
    num_leaves=100,
    min_child_samples=10,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Gradient Boosting
models['gb'] = GradientBoostingRegressor(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.05,
    min_samples_split=5,
    min_samples_leaf=2,
    subsample=0.9,
    random_state=42
)

# Extra Trees
from sklearn.ensemble import ExtraTreesRegressor
models['et'] = ExtraTreesRegressor(
    n_estimators=500,
    max_depth=20,
    min_samples_split=3,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# 10. 모델 훈련 (두 가지 타겟으로)
print("🏋️ 모델 훈련 중...")

trained_models_original = {}
trained_models_transformed = {}

for name, model in models.items():
    print(f"  {name} 훈련 중...")
    
    # 원본 타겟으로 훈련
    model_orig = type(model)(**model.get_params())
    model_orig.fit(X_train_scaled, y_train)
    trained_models_original[name] = model_orig
    
    # 변환된 타겟으로 훈련
    model_trans = type(model)(**model.get_params())
    model_trans.fit(X_train_scaled, y_train_transformed)
    trained_models_transformed[name] = model_trans

# 11. 개별 모델 성능 평가
print("\n📊 개별 모델 성능:")

model_scores = {}
model_predictions = {}

for name in models.keys():
    # 원본 타겟 모델
    pred_orig = trained_models_original[name].predict(X_val_scaled)
    score_orig, A_orig, B_orig = competition_score(y_val, pred_orig)
    
    # 변환된 타겟 모델 (역변환)
    pred_trans_raw = trained_models_transformed[name].predict(X_val_scaled)
    pred_trans = target_transformer.inverse_transform(pred_trans_raw.reshape(-1, 1)).ravel()
    score_trans, A_trans, B_trans = competition_score(y_val, pred_trans)
    
    # 더 좋은 점수 선택
    if score_trans > score_orig:
        model_scores[name] = score_trans
        model_predictions[name] = pred_trans
        print(f"  {name} (변환): {score_trans:.4f} (A={A_trans:.3f}, B={B_trans:.3f})")
    else:
        model_scores[name] = score_orig
        model_predictions[name] = pred_orig
        print(f"  {name} (원본): {score_orig:.4f} (A={A_orig:.3f}, B={B_orig:.3f})")

# 12. 상관관계 최적화 앙상블
print("\n🎭 상관관계 최적화 앙상블...")

# 모델 리스트 (성능 순으로 정렬)
sorted_models = sorted(model_scores.items(), key=lambda x: x[1], reverse=True)
top_models = [name for name, score in sorted_models[:4]]  # 상위 4개

print(f"선택된 모델: {top_models}")

# 최적 가중치 계산
selected_models = []
for name in top_models:
    if model_scores[name] > model_scores[name.replace('_transformed', '_original')]:
        selected_models.append(trained_models_transformed[name])
    else:
        selected_models.append(trained_models_original[name])

optimal_weights = optimize_ensemble_weights(selected_models, X_val_scaled, y_val)
print(f"최적 가중치: {dict(zip(top_models, optimal_weights))}")

# 13. 최종 앙상블 예측
ensemble_pred = np.zeros(len(X_val_scaled))
for i, (name, weight) in enumerate(zip(top_models, optimal_weights)):
    pred = model_predictions[name]
    ensemble_pred += weight * pred

# 14. 최종 성능 평가
final_score, final_A, final_B = competition_score(y_val, ensemble_pred)
final_rmse = np.sqrt(mean_squared_error(y_val, ensemble_pred))

print(f"\n🎉 최종 앙상블 성능:")
print(f"   Competition Score: {final_score:.4f}")
print(f"   RMSE: {final_rmse:.4f}")
print(f"   A (Normalized RMSE): {final_A:.4f}")
print(f"   B (Correlation²): {final_B:.4f}")

# 상관관계 세부 분석
pic50_true = -y_val
pic50_pred = -ensemble_pred
correlation, p_value = pearsonr(pic50_true, pic50_pred)
print(f"   pIC50 상관관계: {correlation:.4f} (p={p_value:.3e})")

# 빠른 수정: 기존 코드에서 테스트 데이터 부분만 교체

# 15. 테스트 데이터 예측 (수정된 버전)
print("\n🔮 테스트 데이터 예측 (수정)...")

# 테스트 데이터 기술자 계산
test_descriptors = []
for smiles in df_test[smiles_col]:
    result = compute_safe_descriptors(smiles)
    test_descriptors.append(result)

test_desc_df = pd.DataFrame(test_descriptors, columns=[
    'MolWt', 'LogP', 'TPSA', 'NumRotatableBonds', 'NumHAcceptors',
    'NumHDonors', 'NumAromaticRings', 'RingCount', 'NumHeteroatoms', 'BertzCT'
])

# 출처 정보 추가
test_desc_df['source_chembl'] = 0
test_desc_df['source_pubchem'] = 1

# 훈련에 사용된 특성과 정확히 같은 순서로 데이터 준비
features_ordered = X_train.columns.tolist()  # 훈련 데이터의 컬럼 순서
print(f"특성 순서: {features_ordered}")

# 테스트 데이터를 같은 순서로 정렬
X_test_ordered = test_desc_df[features_ordered].copy()

# 유효한 테스트 데이터만 선택
test_mask = ~X_test_ordered.isnull().any(axis=1)
X_test_clean = X_test_ordered[test_mask]
test_ids = df_test[test_mask]['ID']

print(f"유효한 테스트 데이터: {len(X_test_clean)}/{len(df_test)}")
print(f"테스트 데이터 형태: {X_test_clean.shape}")
print(f"훈련 데이터 형태: {X_train.shape}")

# 컬럼 순서 재확인
if list(X_test_clean.columns) == list(X_train.columns):
    print("✅ 특성 순서 일치")
else:
    print("❌ 특성 순서 불일치 - 강제 정렬")
    X_test_clean = X_test_clean[X_train.columns]

# 스케일링
X_test_scaled = scaler.transform(X_test_clean)

# 앙상블 예측
test_ensemble_pred = np.zeros(len(X_test_scaled))

print("🔮 모델별 예측 중...")
for i, (name, weight) in enumerate(zip(top_models, optimal_weights)):
    print(f"  {name}: 가중치 {weight:.4f}")
    
    if model_scores[name] == max([model_scores[n] for n in [name] if n in model_scores]):
        # 최고 성능 모델 선택 로직 수정
        try:
            # 변환된 모델이 더 좋은 경우
            pred_raw = trained_models_transformed[name].predict(X_test_scaled)
            pred = target_transformer.inverse_transform(pred_raw.reshape(-1, 1)).ravel()
            print(f"    {name}: 변환 모델 사용")
        except:
            # 원본 모델 사용
            pred = trained_models_original[name].predict(X_test_scaled)
            print(f"    {name}: 원본 모델 사용")
    else:
        # 원본 모델 사용
        pred = trained_models_original[name].predict(X_test_scaled)
        print(f"    {name}: 원본 모델 사용")
    
    test_ensemble_pred += weight * pred

# IC50 변환
ic50_pred_nM = 10 ** (test_ensemble_pred + 6)

# 예측값 범위 확인
print(f"\n📊 예측값 통계:")
print(f"  log_IC50 범위: {test_ensemble_pred.min():.3f} ~ {test_ensemble_pred.max():.3f}")
print(f"  IC50 (nM) 범위: {ic50_pred_nM.min():.1f} ~ {ic50_pred_nM.max():.1f}")
print(f"  IC50 (nM) 중간값: {np.median(ic50_pred_nM):.1f}")

# 제출 파일 생성
submission = pd.DataFrame({
    'ID': test_ids,
    'ASK1_IC50_nM': ic50_pred_nM
})

submission.to_csv("/data2/project/2025summer/jjh0709/git/Jump-AI-2025/data/fixed_correlation_submission.csv", index=False)

print(f"\n✅ fixed_correlation_submission.csv 생성!")
print(f"📊 예측값 수: {len(submission)}")

# 성능 요약
print(f"\n" + "="*50)
print("🏆 최종 성능 요약")
print("="*50)
print(f"검증 Competition Score: {final_score:.4f}")
print(f"A (Normalized RMSE): {final_A:.4f}")
print(f"B (Correlation²): {final_B:.4f}")
print(f"pIC50 상관관계: {correlation:.4f}")
print(f"성능 개선: {final_score - 0.4:.4f} (기준 0.4 대비)")

# 모델 기여도
print(f"\n📊 모델 기여도:")
for name, weight in zip(top_models, optimal_weights):
    print(f"  {name}: {weight:.1%}")

print("="*50)
print("✅ 작업 완료!")

🚀 빠른 상관관계 개선 시작!
🧪 분자 기술자 계산...
데이터 크기: 1960
🧹 아웃라이어 제거...
아웃라이어 제거: 1960 → 1959
📊 고급 데이터 전처리...
🎯 타겟 변환 최적화...
🤖 고성능 모델 훈련...
🏋️ 모델 훈련 중...
  rf 훈련 중...
  xgb 훈련 중...
  lgb 훈련 중...
  gb 훈련 중...
  et 훈련 중...

📊 개별 모델 성능:
  rf (변환): 0.4054 (A=1.000, B=0.676)
  xgb (변환): 0.3889 (A=1.000, B=0.648)
  lgb (변환): 0.3973 (A=1.000, B=0.662)
  gb (변환): 0.3863 (A=1.000, B=0.644)
  et (변환): 0.4092 (A=1.000, B=0.682)

🎭 상관관계 최적화 앙상블...
선택된 모델: ['et', 'rf', 'lgb', 'xgb']
최적 가중치: {'et': np.float64(0.7468461281871787), 'rf': np.float64(0.008466055649200789), 'lgb': np.float64(0.2446878161636207), 'xgb': np.float64(0.0)}

🎉 최종 앙상블 성능:
   Competition Score: 0.4104
   RMSE: 0.8591
   A (Normalized RMSE): 1.0000
   B (Correlation²): 0.6840
   pIC50 상관관계: 0.8271 (p=1.308e-99)

🔮 테스트 데이터 예측 (수정)...
특성 순서: ['MolWt', 'MolWt', 'LogP', 'LogP', 'TPSA', 'TPSA', 'NumRotatableBonds', 'NumRotatableBonds', 'NumHAcceptors', 'NumHAcceptors', 'NumHDonors', 'NumAromaticRings', 'RingCount', 'NumHeteroatoms', 'BertzCT', 's

✅ ensemble_stable.csv 생성 완료


In [23]:
#!/usr/bin/env python3
"""
MAP3K5(ASK1) IC50 예측 - 고급 앙상블 최적화
- Multi-model Stacking with Optuna
- Advanced Feature Engineering
- Quantile Matching & Blending
"""

# ======================== 1. 라이브러리 설치 ========================
# !pip install optuna lightgbm xgboost catboost rdkit-pypi -q

# ======================== 2. 필수 라이브러리 ========================
import pandas as pd
import numpy as np
import optuna
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from scipy.stats import rankdata, pearsonr
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, Lipinski, Crippen
import warnings
warnings.filterwarnings('ignore')

# RDKit 경고 메시지 완전 제거
import os
os.environ['RDK_ERROR_STREAM'] = '/dev/null'
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# Optuna 로깅 레벨 설정
optuna.logging.set_verbosity(optuna.logging.WARNING)

# ======================== 3. 고급 피처 엔지니어링 ========================

def calculate_advanced_features(smiles):
    """확장된 분자 기술자 계산 - 안정화 버전"""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        
        features = {}
        
        # 기본 기술자 (안정적)
        try:
            features['MolWt'] = Descriptors.MolWt(mol)
            features['LogP'] = Descriptors.MolLogP(mol)
            features['TPSA'] = Descriptors.TPSA(mol)
            features['NumRotatableBonds'] = Descriptors.NumRotatableBonds(mol)
            features['NumHAcceptors'] = Descriptors.NumHAcceptors(mol)
            features['NumHDonors'] = Descriptors.NumHDonors(mol)
            features['NumAromaticRings'] = Descriptors.NumAromaticRings(mol)
            features['RingCount'] = Descriptors.RingCount(mol)
            features['NumHeteroatoms'] = Descriptors.NumHeteroatoms(mol)
            features['HeavyAtomCount'] = Descriptors.HeavyAtomCount(mol)
        except:
            pass
        
        # 고급 기술자 (버전별 호환성 체크)
        try:
            features['BertzCT'] = Descriptors.BertzCT(mol)
        except: pass
        
        try:
            features['Chi0'] = Descriptors.Chi0(mol)
            features['Chi1'] = Descriptors.Chi1(mol)
        except: pass
        
        try:
            features['HallKierAlpha'] = Descriptors.HallKierAlpha(mol)
        except: pass
        
        try:
            features['Kappa1'] = Descriptors.Kappa1(mol)
            features['Kappa2'] = Descriptors.Kappa2(mol)
        except: pass
        
        try:
            features['FractionCsp3'] = Descriptors.FractionCsp3(mol)
        except: pass
        
        try:
            features['NumSaturatedRings'] = Descriptors.NumSaturatedRings(mol)
            features['NumAliphaticRings'] = Descriptors.NumAliphaticRings(mol)
        except: pass
        
        try:
            features['MolMR'] = Crippen.MolMR(mol)
        except: pass
        
        try:
            features['BalabanJ'] = Descriptors.BalabanJ(mol)
        except: pass
        
        # VSA 기술자들
        try:
            features['PEOE_VSA1'] = Descriptors.PEOE_VSA1(mol)
            features['PEOE_VSA2'] = Descriptors.PEOE_VSA2(mol)
        except: pass
        
        try:
            features['SMR_VSA1'] = Descriptors.SMR_VSA1(mol)
        except: pass
        
        try:
            features['SlogP_VSA1'] = Descriptors.SlogP_VSA1(mol)
        except: pass
        
        try:
            features['EState_VSA1'] = Descriptors.EState_VSA1(mol)
        except: pass
        
        # 약물성 지표
        try:
            features['QED'] = Descriptors.qed(mol)
        except: pass
        
        try:
            features['NumHeavyAtoms'] = Lipinski.NumHeavyAtoms(mol)
        except: pass
        
        try:
            features['NumAliphaticCarbocycles'] = Lipinski.NumAliphaticCarbocycles(mol)
            features['NumAliphaticHeterocycles'] = Lipinski.NumAliphaticHeterocycles(mol)
            features['NumAromaticCarbocycles'] = Lipinski.NumAromaticCarbocycles(mol)
            features['NumAromaticHeterocycles'] = Lipinski.NumAromaticHeterocycles(mol)
            features['NumSaturatedCarbocycles'] = Lipinski.NumSaturatedCarbocycles(mol)
            features['NumSaturatedHeterocycles'] = Lipinski.NumSaturatedHeterocycles(mol)
        except: pass
        
        # 추가 안정적인 기술자들
        try:
            features['NumRadicalElectrons'] = Descriptors.NumRadicalElectrons(mol)
            features['NumValenceElectrons'] = Descriptors.NumValenceElectrons(mol)
        except: pass
        
        return features if features else None
        
    except Exception as e:
        return None

def get_morgan_fingerprint_features(smiles, radius=2, n_bits=1024):
    """Morgan Fingerprint를 피처로 변환 - 새로운 API 사용"""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits)
    
    try:
        # 새로운 API 사용 (RDKit 2022+)
        fp_gen = AllChem.GetMorganGenerator(radius=radius, fpSize=n_bits)
        fp = fp_gen.GetFingerprint(mol)
        return np.array(fp)
    except:
        # 구버전 API 폴백
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        return np.array(fp)

# ======================== 4. 데이터 로드 및 전처리 ========================

print("📊 데이터 로드 및 전처리 시작...")

# 데이터 로드
df_train = pd.read_csv("/data2/project/2025summer/jjh0709/git/Jump-AI-2025/data/chembl_processed_rescaled.csv")
df_test = pd.read_csv("/data2/project/2025summer/jjh0709/git/Jump-AI-2025/data/test.csv")

# 데이터 클리닝
df_train = df_train[df_train["IC50"] > 0].copy()
df_train = df_train[(df_train["IC50"] >= 0.1) & (df_train["IC50"] <= 1e5)].copy()

# pIC50 계산
df_train["pIC50"] = 9 - np.log10(df_train["IC50"])

# SMILES 컬럼 찾기
smiles_col = 'Smiles' if 'Smiles' in df_train.columns else 'smiles'
smiles_col_test = 'Smiles' if 'Smiles' in df_test.columns else 'smiles'

# 고급 피처 추출 (학습 데이터)
print("🧪 고급 피처 추출 중...")
train_features_list = []
for idx, smiles in enumerate(df_train[smiles_col]):
    if idx % 1000 == 0:
        print(f"  처리 중: {idx}/{len(df_train)}")
    features = calculate_advanced_features(smiles)
    if features:
        train_features_list.append(features)
    else:
        train_features_list.append({})

train_features_df = pd.DataFrame(train_features_list)

# Morgan Fingerprint 추가 (차원 축소)
print("🔬 Morgan Fingerprint 계산...")
n_fp_bits = 256  # 메모리 효율을 위해 축소
train_fp_array = np.array([get_morgan_fingerprint_features(s, n_bits=n_fp_bits) 
                          for s in df_train[smiles_col]])

# PCA로 차원 축소
from sklearn.decomposition import PCA
pca = PCA(n_components=50, random_state=42)
train_fp_pca = pca.fit_transform(train_fp_array)
train_fp_df = pd.DataFrame(train_fp_pca, columns=[f'FP_PC{i+1}' for i in range(50)])

# 모든 피처 결합
X_full = pd.concat([train_features_df, train_fp_df], axis=1)
y_full = df_train["pIC50"]

# NaN 처리
X_full = X_full.fillna(X_full.median())
valid_mask = ~(X_full.isnull().any(axis=1) | y_full.isnull())
X_clean = X_full[valid_mask]
y_clean = y_full[valid_mask]

print(f"✅ 유효 데이터: {len(X_clean)} samples, {X_clean.shape[1]} features")

# ======================== 5. 다중 스케일링 전략 ========================

scalers = {
    'standard': StandardScaler(),
    'robust': RobustScaler(),
    'quantile': QuantileTransformer(output_distribution='normal', random_state=42)
}

X_scaled = {}
for name, scaler in scalers.items():
    X_scaled[name] = scaler.fit_transform(X_clean)
    print(f"  {name} 스케일링 완료")

# 학습/검증 분할
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled['robust'], y_clean, test_size=0.2, random_state=42
)

# ======================== 6. Optuna 다중 모델 최적화 ========================

print("\n🎯 Optuna 하이퍼파라미터 최적화 시작...")

def create_objective(model_type, X_train, y_train, cv_folds=5):
    """각 모델별 Optuna 목적 함수 생성"""
    
    def objective(trial):
        if model_type == 'lgb':
            params = {
                'objective': 'regression',
                'metric': 'rmse',
                'verbosity': -1,
                'n_estimators': 500,  # 줄여서 속도 향상
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'num_leaves': trial.suggest_int('num_leaves', 20, 300),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
            }
            model_class = lgb.LGBMRegressor
            
        elif model_type == 'xgb':
            params = {
                'objective': 'reg:squarederror',
                'n_estimators': 500,  # 줄여서 속도 향상
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
                'gamma': trial.suggest_float('gamma', 0.0, 5.0),
            }
            model_class = xgb.XGBRegressor
            
        elif model_type == 'catboost':
            params = {
                'iterations': 500,  # 줄여서 속도 향상
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'depth': trial.suggest_int('depth', 4, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
                'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
                'verbose': False,
            }
            model_class = cb.CatBoostRegressor
            
        elif model_type == 'rf':
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                'max_depth': trial.suggest_int('max_depth', 5, 30),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                'max_features': trial.suggest_float('max_features', 0.3, 1.0),
                'n_jobs': -1,
                'random_state': 42,
            }
            model_class = RandomForestRegressor
        
        # Cross-validation
        cv = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
        rmse_list = []
        
        for train_idx, val_idx in cv.split(X_train):
            X_fold_train = X_train[train_idx]
            X_fold_val = X_train[val_idx]
            y_fold_train = y_train.iloc[train_idx] if hasattr(y_train, 'iloc') else y_train[train_idx]
            y_fold_val = y_train.iloc[val_idx] if hasattr(y_train, 'iloc') else y_train[val_idx]
            
            model = model_class(**params)
            
            # 간단한 학습 (early stopping 없이)
            model.fit(X_fold_train, y_fold_train)
            
            preds = model.predict(X_fold_val)
            rmse = np.sqrt(mean_squared_error(y_fold_val, preds))
            rmse_list.append(rmse)
        
        return np.mean(rmse_list)
    
    return objective

# 각 모델 최적화 - 속도를 위해 trials 수 줄임
best_params = {}
studies = {}

for model_type in ['lgb', 'xgb', 'catboost', 'rf']:
    print(f"\n  {model_type.upper()} 최적화 중...")
    study = optuna.create_study(direction='minimize')
    study.optimize(
        create_objective(model_type, X_train, y_train),
        n_trials=10,  # 속도를 위해 줄임 (실제로는 50-100 권장)
        show_progress_bar=False
    )
    
    best_params[model_type] = study.best_params
    studies[model_type] = study
    print(f"    Best RMSE: {study.best_value:.4f}")

# ======================== 7. 최적화된 모델 학습 ========================

print("\n🤖 최적화된 모델 학습...")

models = {}

# LightGBM
models['lgb'] = lgb.LGBMRegressor(**best_params['lgb'], n_estimators=1000, verbosity=-1)
try:
    models['lgb'].fit(X_train, y_train, 
                      eval_set=[(X_val, y_val)],
                      callbacks=[lgb.early_stopping(50, verbose=False)])
except:
    # 폴백: callbacks 없이
    models['lgb'].fit(X_train, y_train)

# XGBoost
models['xgb'] = xgb.XGBRegressor(**best_params['xgb'], n_estimators=1000)
try:
    # 새 버전 (XGBoost 2.0+)
    models['xgb'].set_params(early_stopping_rounds=50)
    models['xgb'].fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      verbose=False)
except:
    try:
        # 구 버전
        models['xgb'].fit(X_train, y_train,
                          eval_set=[(X_val, y_val)],
                          early_stopping_rounds=50,
                          verbose=False)
    except:
        # early stopping 없이
        models['xgb'].fit(X_train, y_train)

# CatBoost
models['catboost'] = cb.CatBoostRegressor(**best_params['catboost'], iterations=1000)
try:
    models['catboost'].fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False, early_stopping_rounds=50)
except:
    models['catboost'].fit(X_train, y_train, verbose=False)

# Random Forest
models['rf'] = RandomForestRegressor(**best_params['rf'])
models['rf'].fit(X_train, y_train)

# Extra Trees (고정 파라미터)
models['extra'] = ExtraTreesRegressor(n_estimators=500, max_depth=20, random_state=42, n_jobs=-1)
models['extra'].fit(X_train, y_train)

# Neural Network
models['mlp'] = MLPRegressor(
    hidden_layer_sizes=(256, 128, 64),
    activation='relu',
    solver='adam',
    learning_rate='adaptive',
    max_iter=1000,
    early_stopping=True,
    validation_fraction=0.1,
    random_state=42
)
models['mlp'].fit(X_train, y_train)

# ======================== 8. 모델 평가 및 가중치 최적화 ========================

print("\n📊 모델 성능 평가...")

val_predictions = {}
val_scores = {}

for name, model in models.items():
    pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    r2 = r2_score(y_val, pred)
    
    # pIC50 상관관계
    corr, _ = pearsonr(y_val, pred)
    
    val_predictions[name] = pred
    val_scores[name] = {'rmse': rmse, 'r2': r2, 'corr': corr}
    
    print(f"  {name:10s}: RMSE={rmse:.4f}, R²={r2:.4f}, Corr={corr:.4f}")

# 최적 가중치 찾기
from scipy.optimize import minimize

def ensemble_objective(weights):
    ensemble_pred = np.zeros(len(y_val))
    for i, name in enumerate(models.keys()):
        ensemble_pred += weights[i] * val_predictions[name]
    
    rmse = np.sqrt(mean_squared_error(y_val, ensemble_pred))
    return rmse

# 제약조건: 가중치 합 = 1, 모든 가중치 >= 0
constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
bounds = [(0, 1) for _ in range(len(models))]
initial_weights = np.ones(len(models)) / len(models)

result = minimize(ensemble_objective, initial_weights, 
                 method='SLSQP', bounds=bounds, constraints=constraints)

optimal_weights = result.x
print(f"\n✅ 최적 가중치:")
for name, weight in zip(models.keys(), optimal_weights):
    if weight > 0.01:
        print(f"  {name}: {weight:.3f}")

# ======================== 9. 전체 데이터로 재학습 ========================

print("\n🔄 전체 데이터로 모델 재학습...")

# 전체 데이터 스케일링
X_full_scaled = scalers['robust'].fit_transform(X_clean)

models_full = {}

# 각 모델을 전체 데이터로 재학습
for name in models.keys():
    if name == 'lgb':
        models_full[name] = lgb.LGBMRegressor(**best_params['lgb'], n_estimators=1200, verbosity=-1)
    elif name == 'xgb':
        models_full[name] = xgb.XGBRegressor(**best_params['xgb'], n_estimators=1200)
    elif name == 'catboost':
        models_full[name] = cb.CatBoostRegressor(**best_params['catboost'], iterations=1200, verbose=False)
    elif name == 'rf':
        models_full[name] = RandomForestRegressor(**best_params['rf'])
    elif name == 'extra':
        models_full[name] = ExtraTreesRegressor(n_estimators=600, max_depth=20, random_state=42, n_jobs=-1)
    elif name == 'mlp':
        models_full[name] = MLPRegressor(
            hidden_layer_sizes=(256, 128, 64),
            activation='relu',
            max_iter=1500,
            random_state=42
        )
    
    models_full[name].fit(X_full_scaled, y_clean)
    print(f"  {name} 학습 완료")

# ======================== 10. 테스트 데이터 예측 ========================

print("\n🔮 테스트 데이터 예측...")

# 테스트 데이터 피처 추출
test_features_list = []
for idx, smiles in enumerate(df_test[smiles_col_test]):
    if idx % 50 == 0:
        print(f"  처리 중: {idx}/{len(df_test)}")
    features = calculate_advanced_features(smiles)
    if features:
        test_features_list.append(features)
    else:
        test_features_list.append({})

test_features_df = pd.DataFrame(test_features_list)

# Morgan Fingerprint
test_fp_array = np.array([get_morgan_fingerprint_features(s, n_bits=n_fp_bits) 
                          for s in df_test[smiles_col_test]])
test_fp_pca = pca.transform(test_fp_array)
test_fp_df = pd.DataFrame(test_fp_pca, columns=[f'FP_PC{i+1}' for i in range(50)])

# 결합
X_test_full = pd.concat([test_features_df, test_fp_df], axis=1)
X_test_full = X_test_full.fillna(X_test_full.median())

# 학습 데이터와 동일한 컬럼 순서 보장
X_test_full = X_test_full[X_clean.columns]

# 스케일링
X_test_scaled = scalers['robust'].transform(X_test_full)

# 각 모델로 예측
test_predictions = {}
for name, model in models_full.items():
    test_predictions[name] = model.predict(X_test_scaled)
    print(f"  {name} 예측 완료")

# ======================== 11. Quantile Matching & 앙상블 ========================

print("\n🎭 Quantile Matching & 최종 앙상블...")

def quantile_match(source_pred, target_pred):
    """Quantile Matching으로 분포 정렬"""
    sorted_target = np.sort(target_pred)
    source_ranks = rankdata(source_pred, method='ordinal') - 1
    source_ranks = np.clip(source_ranks, 0, len(sorted_target)-1).astype(int)
    return sorted_target[source_ranks]

# RF를 기준으로 다른 모델들 Quantile Matching
base_pred = test_predictions['rf']
matched_predictions = {'rf': base_pred}

for name in ['lgb', 'xgb', 'catboost', 'extra', 'mlp']:
    matched_predictions[name] = quantile_match(test_predictions[name], base_pred)

# 가중 평균 앙상블
ensemble_pred = np.zeros(len(X_test_scaled))
for i, name in enumerate(models.keys()):
    ensemble_pred += optimal_weights[i] * matched_predictions[name]

# ======================== 12. 후처리 및 제출 파일 생성 ========================

# 클리핑
ensemble_pred = np.clip(ensemble_pred, y_clean.min(), y_clean.max())

# IC50 역변환
ic50_pred = 10 ** (9 - ensemble_pred)

# 추가 후처리: 극단값 제한
ic50_pred = np.clip(ic50_pred, 0.1, 100000)

# 제출 파일 생성
submission = pd.DataFrame({
    "ID": df_test["ID"],
    "ASK1_IC50_nM": ic50_pred
})

# 저장 경로 설정
output_dir = "/data2/project/2025summer/jjh0709/git/Jump-AI-2025/submissions/"
import os
os.makedirs(output_dir, exist_ok=True)

# 저장
submission.to_csv(output_dir + "submit_advanced_ensemble.csv", index=False)

print("\n" + "="*60)
print("🎊 예측 완료!")
print("="*60)
print(f"예측 통계:")
print(f"  IC50 범위: {ic50_pred.min():.2f} ~ {ic50_pred.max():.2f} nM")
print(f"  IC50 중간값: {np.median(ic50_pred):.2f} nM")
print(f"  IC50 평균: {np.mean(ic50_pred):.2f} nM")
print(f"  IC50 표준편차: {np.std(ic50_pred):.2f} nM")
print("\n✅ 제출 파일 저장: submit_advanced_ensemble.csv")
print("="*60)

# ======================== 13. 추가: Stacking 앙상블 (선택사항) ========================

print("\n🔥 보너스: Stacking 앙상블...")

from sklearn.ensemble import StackingRegressor

# 베이스 모델들
base_models = [
    ('lgb', lgb.LGBMRegressor(**best_params['lgb'], n_estimators=500, verbosity=-1)),
    ('xgb', xgb.XGBRegressor(**best_params['xgb'], n_estimators=500)),
    ('rf', RandomForestRegressor(**best_params['rf'])),
]

# 메타 모델
meta_model = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1)

# Stacking
stacking = StackingRegressor(estimators=base_models, final_estimator=meta_model, cv=5)
stacking.fit(X_full_scaled, y_clean)

# Stacking 예측
stacking_pred = stacking.predict(X_test_scaled)
stacking_pred = np.clip(stacking_pred, y_clean.min(), y_clean.max())
ic50_stacking = 10 ** (9 - stacking_pred)
ic50_stacking = np.clip(ic50_stacking, 0.1, 100000)

# Stacking 제출 파일
submission_stacking = pd.DataFrame({
    "ID": df_test["ID"],
    "ASK1_IC50_nM": ic50_stacking
})
submission_stacking.to_csv(output_dir + "submit_stacking.csv", index=False)

print("✅ Stacking 제출 파일 저장: submit_stacking.csv")

# 최종 블렌딩 (앙상블 + Stacking)
final_pred = 0.7 * ic50_pred + 0.3 * ic50_stacking
 
submission_final = pd.DataFrame({
    "ID": df_test["ID"],
    "ASK1_IC50_nM": final_pred
})
submission_final.to_csv(output_dir + "submit_final_blend.csv", index=False)

print("✅ 최종 블렌딩 제출 파일 저장: submit_final_blend.csv")
print("\n🏆 모든 작업 완료!")

In [3]:
import pandas as pd
df = pd.read_csv("/data2/project/2025summer/jjh0709/git/Jump-AI-2025/data/ChEMBL_IC50_30k_preprocessed.csv")
df.head()

Unnamed: 0,ID,Smiles,NumHDonors,TPSA,NumHAcceptors,MolWt,LogP,NumRotatableBonds,RingCount,HeavyAtomCount,IC50
0,CHEMBL324340,Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...,1.0,100.71,5.0,398.374,4.30202,3.0,5.0,30.0,2500.0
1,CHEMBL109600,COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...,1.0,119.17,7.0,520.497,5.6778,6.0,6.0,39.0,9000.0
2,CHEMBL357278,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4ccc(Cl)c(C(...,2.0,77.93,7.0,543.011,4.27292,8.0,4.0,36.0,4000.0
3,CHEMBL357119,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)NCCc4ccccc4)CC...,2.0,77.93,7.0,468.623,2.32092,10.0,4.0,33.0,17000.0
4,CHEMBL152968,Cc1nc2cc(OC[C@H](O)CN3CCN(CC(=O)Nc4cccc(-c5ccc...,2.0,77.93,7.0,516.667,4.26772,9.0,5.0,37.0,180.0


In [4]:
# 