In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
import numpy as np

def evaluate_morgan_fingerprint(file_path, target_column='pIC50', radius=2, nBits=1024):
    """
    Benchmark Morgan fingerprints with three regressors under 5-fold CV.

    Reads a CSV that has a 'Smiles' column and a numeric target column,
    encodes every parsable molecule as a Morgan bit vector, removes bit
    columns that are zero for every molecule, and cross-validates an RBF SVR,
    a random forest and a KNN regressor on the result. A text report is
    printed and the per-model table is returned.

    Rows whose SMILES cannot be parsed are skipped (with a warning) instead of
    being encoded as an all-zero vector, so a real label is never paired with
    a meaningless feature row.

    Args:
        file_path: Path of the CSV file to read.
        target_column: Name of the column holding the regression target.
        radius: Morgan fingerprint radius.
        nBits: Length of the folded bit vector.

    Returns:
        pandas.DataFrame with one row per model and the columns
        Model, Avg_MSE, Avg_R2, MSE_Std, R2_Std.

    Raises:
        ValueError: If none of the SMILES in the file can be parsed.
    """
    # Imported locally because this cell's import list only provides cross_val_score.
    from sklearn.model_selection import cross_validate

    # Load the dataset
    data = pd.read_csv(file_path)
    smiles = data['Smiles']
    y = data[target_column]

    def morgan_fingerprint(smiles):
        """Return the Morgan bits of one SMILES as a list of 0/1 ints, or None if unparsable."""
        # Non-string cells (e.g. NaN from an empty CSV field) are treated as unparsable.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            return None
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        return [int(bit) for bit in fp.ToBitString()]

    # Encode every molecule and keep only the rows that could be parsed
    encoded = [morgan_fingerprint(s) for s in smiles]
    valid_positions = [pos for pos, bits in enumerate(encoded) if bits is not None]
    if not valid_positions:
        raise ValueError("没有可解析的SMILES，无法进行评估")
    skipped = len(encoded) - len(valid_positions)
    if skipped:
        print(f"警告：已跳过 {skipped} 个无法解析的SMILES")

    X = pd.DataFrame([encoded[pos] for pos in valid_positions])
    y = y.iloc[valid_positions]

    # Drop bit columns that are zero for every molecule
    X = X.loc[:, (X != 0).any(axis=0)]

    # Models under comparison
    models = {
        'SVM': make_pipeline(StandardScaler(), SVR(kernel='rbf')),
        'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
        'KNN': make_pipeline(StandardScaler(), KNeighborsRegressor())
    }

    # Cross-validation setup; the MSE scorer is negated by make_scorer
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    metrics = {
        'MSE': make_scorer(mean_squared_error, greater_is_better=False),
        'R2': make_scorer(r2_score)
    }

    # Result accumulator (key order defines the report's column order)
    results = {
        'Model': [],
        'Avg_MSE': [],
        'Avg_R2': [],
        'MSE_Std': [],
        'R2_Std': []
    }

    # Evaluate each model once, scoring both metrics on the same fitted folds
    for model_name, model in models.items():
        cv_out = cross_validate(
            model, X, y,
            cv=kf,
            scoring=metrics,
            n_jobs=-1  # run folds in parallel
        )
        mse_scores = -cv_out['test_MSE']  # flip the sign back to a positive MSE
        r2_scores = cv_out['test_R2']

        results['Model'].append(model_name)
        results['Avg_MSE'].append(np.mean(mse_scores))
        results['MSE_Std'].append(np.std(mse_scores))
        results['Avg_R2'].append(np.mean(r2_scores))
        results['R2_Std'].append(np.std(r2_scores))

    # Build and print the report
    report = pd.DataFrame(results)
    print("="*50)
    print(f"摩根指纹评估报告 (radius={radius}, nBits={nBits})")
    print(f"有效特征维度: {X.shape[1]}")
    print("="*50)
    print(report.round(4))
    # The summary below averages over the three models (spread = std across models),
    # not over the CV folds.
    print("\n性能总结:")
    print(f"平均MSE: {report['Avg_MSE'].mean():.4f} ± {report['Avg_MSE'].std():.4f}")
    print(f"平均R²: {report['Avg_R2'].mean():.4f} ± {report['Avg_R2'].std():.4f}")

    return report

# Example run: benchmark radius-2, 1024-bit Morgan fingerprints on the Menin dataset.
file_path = 'D:\\Menin\\excelcsv\\menin200more.csv'
morgan_report = evaluate_morgan_fingerprint(file_path, target_column='pIC50', radius=2, nBits=1024)




摩根指纹评估报告 (radius=2, nBits=1024)
有效特征维度: 713
          Model  Avg_MSE  Avg_R2  MSE_Std  R2_Std
0           SVM   0.8895  0.5657   0.1672  0.0849
1  RandomForest   0.5958  0.7100   0.2242  0.1008
2           KNN   0.7499  0.6347   0.2624  0.1337

性能总结:
平均MSE: 0.7451 ± 0.1469
平均R²: 0.6368 ± 0.0722


In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdmolops
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
import numpy as np
import joblib

# Load the dataset and pull out the SMILES strings and the pIC50 targets.
file_path = 'D:\\Menin\\excelcsv\\menin200more.csv'
data = pd.read_csv(file_path)
smiles, pIC50 = data['Smiles'], data['pIC50']
# Fingerprint featurizer.
# NOTE(review): despite the name, this calls rdmolops.RDKFingerprint, not
# rdmolops.LayeredFingerprint — confirm which fingerprint is intended.
def smiles_to_layered_fingerprint(smiles):
    """Encode one SMILES string as a list of 2048 fingerprint bits (0/1 ints).

    A SMILES that RDKit cannot parse yields a list of 2048 zeros.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return [0] * 2048
    bit_string = rdmolops.RDKFingerprint(mol, fpSize=2048).ToBitString()
    return list(map(int, bit_string))

# Build the fingerprint dataset: one 2048-bit row per SMILES (unparsable SMILES
# become all-zero rows, see smiles_to_layered_fingerprint).
fingerprints = [smiles_to_layered_fingerprint(smile) for smile in smiles]
X = pd.DataFrame(fingerprints)
y = pIC50

# Report the feature dimensionality
print(f"指纹特征维度: {X.shape[1]}")

# Standardize the bits, then fit an RBF-kernel support vector regressor
pipeline = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=1.0, epsilon=0.2)
)

# Shuffled 5-fold cross-validation with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Custom scorers; greater_is_better=False makes the MSE scorer return negated values
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# Cross-validate twice (once per metric) and report the fold averages
mse_scores = cross_val_score(pipeline, X, y, cv=kf, scoring=mse_scorer)
mean_mse = -np.mean(mse_scores)  # undo the scorer's sign flip
print(f'5折交叉验证平均MSE: {mean_mse:.4f}')

r2_scores = cross_val_score(pipeline, X, y, cv=kf, scoring=r2_scorer)
mean_r2 = np.mean(r2_scores)
print(f'5折交叉验证R²: {mean_r2:.4f}')

# Fit the final model on the full dataset
pipeline.fit(X, y)

# Persist the fitted pipeline. NOTE(review): the target directory is not
# created here, so it presumably has to exist already — confirm.
joblib.dump(pipeline, 'D:\\Menin\\joblibpipeline\\svm_model_LayeredFingerprint_pipeline_200more.pkl')


指纹特征维度: 2048
5折交叉验证平均MSE: 0.6153
5折交叉验证R²: 0.6965


['D:\\Menin\\joblibpipeline\\svm_model_LayeredFingerprint_pipeline_200more.pkl']

In [4]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, rdMolDescriptors, Descriptors
from rdkit.Avalon.pyAvalonTools import GetAvalonFP
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
import numpy as np

def _descriptor_calculator():
    """Build the RDKit descriptor calculator on first use and reuse it afterwards."""
    cached = getattr(_descriptor_calculator, '_instance', None)
    if cached is None:
        names = [entry[0] for entry in Descriptors.descList]
        cached = MoleculeDescriptors.MolecularDescriptorCalculator(names)
        _descriptor_calculator._instance = cached
    return cached


def generate_features(smiles, fp_type='Morgan', nBits=2048):
    """Single entry point for turning one SMILES string into a feature list.

    Args:
        smiles: SMILES string of the molecule.
        fp_type: One of 'Morgan', 'RDKit', 'TopologicalTorsion', 'MACCS',
            'AtomPair', 'Avalon' or 'RDKitDescriptors'.
        nBits: Bit-vector length for the hashed fingerprints. It is not passed
            to the 'MACCS' or 'RDKitDescriptors' branches.

    Returns:
        A list of 0/1 ints for fingerprint types, a list of descriptor values
        for 'RDKitDescriptors', or None when the SMILES cannot be parsed.

    Raises:
        ValueError: If fp_type is not one of the supported names (only reached
            for a parsable SMILES, because parsing is checked first).
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None

    # Descriptors are returned as-is; they are not a bit vector.
    if fp_type == 'RDKitDescriptors':
        return list(_descriptor_calculator().CalcDescriptors(mol))

    if fp_type == 'Morgan':
        # The declared default previously had no branch and always raised.
        # Radius 2 matches the default used by evaluate_morgan_fingerprint.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=nBits)
    elif fp_type == 'RDKit':
        fp = Chem.RDKFingerprint(mol, fpSize=nBits)
    elif fp_type == 'TopologicalTorsion':
        fp = rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=nBits)
    elif fp_type == 'MACCS':
        fp = AllChem.GetMACCSKeysFingerprint(mol)
    elif fp_type == 'AtomPair':
        fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=nBits)
    elif fp_type == 'Avalon':
        fp = GetAvalonFP(mol, nBits=nBits)
    else:
        raise ValueError(f"不支持的分子表征类型: {fp_type}")

    # Every remaining type is a bit vector; expand it to a list of ints.
    return [int(bit) for bit in fp.ToBitString()]

def evaluate_features(file_path, target_col='pIC50', feature_types=None, nBits=2048):
    """Benchmark several molecular representations with three regressors.

    For every requested representation the function featurizes each parsable
    SMILES via generate_features, cleans the feature matrix, and runs shuffled
    5-fold cross-validation for an RBF SVR, a random forest and a KNN
    regressor. Progress and the final table are printed.

    Args:
        file_path: CSV file with a 'Smiles' column and the target column.
        target_col: Name of the regression target column.
        feature_types: Iterable of representation names understood by
            generate_features, or a single name. None evaluates
            'RDKit', 'TopologicalTorsion', 'MACCS', 'AtomPair', 'Avalon'
            and 'RDKitDescriptors'.
        nBits: Bit-vector length forwarded to generate_features.

    Returns:
        pandas.DataFrame with one row per (FeatureType, Model) pair and the
        columns FeatureType, Model, Avg_MSE, MSE_Std, Avg_R2, R2_Std.
        Combinations whose evaluation raised are reported on stdout and
        omitted from the table.
    """
    # Imported locally because this cell's import list only provides cross_val_score.
    from sklearn.model_selection import cross_validate

    # The declared default (None) used to crash on iteration; a bare string
    # would have been iterated character by character.
    if feature_types is None:
        feature_types = ['RDKit', 'TopologicalTorsion', 'MACCS', 'AtomPair', 'Avalon', 'RDKitDescriptors']
    elif isinstance(feature_types, str):
        feature_types = [feature_types]

    # Load the dataset
    data = pd.read_csv(file_path)
    smiles = data['Smiles']
    y = data[target_col]

    # Models under comparison
    models = {
        'SVM': make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.2)),
        'RF': RandomForestRegressor(n_estimators=100, random_state=42),
        'KNN': make_pipeline(StandardScaler(), KNeighborsRegressor())
    }

    # The same fold definition is used for every representation and model
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    results = []

    for feat_type in feature_types:
        # Featurize, remembering which rows produced features
        features = []
        valid_indices = []

        for i, s in enumerate(smiles):
            # Non-string cells (e.g. NaN) are treated like unparsable SMILES.
            feat = generate_features(s, feat_type, nBits) if isinstance(s, str) else None
            if feat is not None:
                features.append(feat)
                valid_indices.append(i)

        X = pd.DataFrame(features)
        y_valid = y.iloc[valid_indices]

        if feat_type == 'RDKitDescriptors':
            # Treat infinite descriptor values as missing so they are imputed
            # below instead of reaching the estimators.
            X = X.replace([np.inf, -np.inf], np.nan)
            # Keep columns with at least 50% non-missing values
            X = X.dropna(axis=1, thresh=int(0.5*len(X)))
            # Impute what is left with the column mean
            X = X.fillna(X.mean())
        else:
            # Drop bit columns that are zero for every molecule
            X = X.loc[:, (X != 0).any(axis=0)]

        print(f"\n评估分子表征: {feat_type}")
        print(f"有效特征维度: {X.shape[1]}")

        for model_name, model in models.items():
            try:
                # One pass scores both metrics on the same fitted folds
                cv_out = cross_validate(
                    model, X, y_valid,
                    cv=kf,
                    scoring=['neg_mean_squared_error', 'r2'],
                    n_jobs=-1
                )
                mse_scores = -cv_out['test_neg_mean_squared_error']
                r2_scores = cv_out['test_r2']

                results.append({
                    'FeatureType': feat_type,
                    'Model': model_name,
                    'Avg_MSE': np.mean(mse_scores),
                    'MSE_Std': np.std(mse_scores),
                    'Avg_R2': np.mean(r2_scores),
                    'R2_Std': np.std(r2_scores)
                })

            except Exception as e:
                # Best effort: one failing combination must not abort the benchmark.
                print(f"{feat_type} 在 {model_name} 评估失败: {str(e)}")

    # Build and print the report
    report = pd.DataFrame(results)
    print("\n================ 综合评估报告 ================")
    print(report.round(4))
    return report

# Example run: compare six representations on the Menin dataset and save the table.
feature_types = ['RDKit', 'TopologicalTorsion', 'MACCS', 'AtomPair', 'Avalon', 'RDKitDescriptors']

report = evaluate_features(
    'D:\\Menin\\excelcsv\\menin200more.csv',
    target_col='pIC50',
    feature_types=feature_types,
    nBits=2048,
)

# Write the comparison table next to the notebook
report.to_csv('molecular_features_evaluation.csv', index=False)



评估分子表征: RDKit
有效特征维度: 2048





评估分子表征: TopologicalTorsion
有效特征维度: 665

评估分子表征: MACCS
有效特征维度: 131





评估分子表征: AtomPair
有效特征维度: 1833

评估分子表征: Avalon
有效特征维度: 1605





评估分子表征: RDKitDescriptors
有效特征维度: 210

           FeatureType Model  Avg_MSE  MSE_Std  Avg_R2  R2_Std
0                RDKit   SVM   0.6153   0.1596  0.6965  0.0922
1                RDKit    RF   0.5462   0.1528  0.7314  0.0842
2                RDKit   KNN   0.5754   0.1725  0.7155  0.0998
3   TopologicalTorsion   SVM   0.8331   0.1849  0.5926  0.0971
4   TopologicalTorsion    RF   0.6651   0.2550  0.6774  0.1109
5   TopologicalTorsion   KNN   0.7536   0.2225  0.6248  0.1335
6                MACCS   SVM   0.6791   0.1851  0.6665  0.1010
7                MACCS    RF   0.6467   0.1407  0.6822  0.0769
8                MACCS   KNN   0.6829   0.2647  0.6579  0.1548
9             AtomPair   SVM   0.5796   0.1639  0.7150  0.0890
10            AtomPair    RF   0.5331   0.2102  0.7389  0.1093
11            AtomPair   KNN   0.5876   0.1835  0.7103  0.0998
12              Avalon   SVM   0.6370   0.1473  0.6888  0.0780
13              Avalon    RF   0.5669   0.1524  0.7205  0.0801
14              

In [9]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdmolops
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
import numpy as np
import joblib

# Read the dataset, then separate the SMILES column from the pIC50 target column.
file_path = 'D:\\Menin\\excelcsv\\menin200more.csv'
data = pd.read_csv(file_path)

smiles = data.loc[:, 'Smiles']
pIC50 = data.loc[:, 'pIC50']
# Fingerprint featurizer.
# NOTE(review): the function is named "layered" but calls rdmolops.RDKFingerprint —
# confirm which fingerprint is intended.
def smiles_to_layered_fingerprint(smiles):
    """Return 2048 fingerprint bits (0/1 ints) for a SMILES, or 2048 zeros if it cannot be parsed."""
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        bits = rdmolops.RDKFingerprint(mol, fpSize=2048).ToBitString()
        return [int(ch) for ch in bits]
    return [0] * 2048

# Build the feature matrix: one 2048-bit row per SMILES
fingerprints = [smiles_to_layered_fingerprint(smile) for smile in smiles]
X = pd.DataFrame(fingerprints)
y = pIC50

# Report the feature dimensionality
print(f"指纹特征维度: {X.shape[1]}")

# Models under comparison (the SVR and KNN are wrapped with a StandardScaler)
models = {
    'SVM': make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.2)),
    'RF': RandomForestRegressor(n_estimators=100, random_state=42),
    'KNN': make_pipeline(StandardScaler(), KNeighborsRegressor())
}

# Shuffled 5-fold cross-validation with a fixed seed; results collects one dict per model
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

# Evaluate every model; each metric is computed by its own cross-validation run
for model_name, model in models.items():
    # MSE: the built-in scorer is negated, so flip the sign back
    mse_scores = -cross_val_score(
        model, X, y, 
        cv=kf, 
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    
    # R²
    r2_scores = cross_val_score(
        model, X, y,
        cv=kf,
        scoring='r2',
        n_jobs=-1
    )
    
    # Record fold mean and standard deviation for both metrics
    results.append({
        'Model': model_name,
        'Avg_MSE': np.mean(mse_scores),
        'MSE_Std': np.std(mse_scores),
        'Avg_R2': np.mean(r2_scores),
        'R2_Std': np.std(r2_scores)
    })

# Build and print the report
report = pd.DataFrame(results)
print("\n===================== 分层指纹评估报告 =====================")
print(report.round(4))

# Summary: mean and standard deviation taken across the models, not across folds
print("\n性能总结:")
print(f"平均MSE: {report['Avg_MSE'].mean():.4f} ± {report['Avg_MSE'].std():.4f}")
print(f"平均R²: {report['Avg_R2'].mean():.4f} ± {report['Avg_R2'].std():.4f}")

# Pick the model with the lowest average MSE, fit it on the full dataset and save it.
# NOTE(review): the target directory is not created here, so it presumably
# has to exist already — confirm.
best_model_name = report.loc[report['Avg_MSE'].idxmin(), 'Model']
best_model = models[best_model_name]
best_model.fit(X, y)
joblib.dump(best_model, f'D:\\Menin\\joblibpipeline\\best_model_{best_model_name}.pkl')


指纹特征维度: 2048

  Model  Avg_MSE  MSE_Std  Avg_R2  R2_Std
0   SVM   0.6153   0.1596  0.6965  0.0922
1    RF   0.5462   0.1528  0.7314  0.0842
2   KNN   0.5754   0.1725  0.7155  0.0998

性能总结:
平均MSE: 0.5790 ± 0.0347
平均R²: 0.7145 ± 0.0175


['D:\\Menin\\joblibpipeline\\best_model_RF.pkl']

In [None]:
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, KFold, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
import joblib
import lightgbm as lgb
import numpy as np

# ================== Global configuration ==================
# Locations of the saved artifacts read by Predictor: MODEL_PATH is the default
# model file and FEATURE_NAMES_PATH the file holding the feature-name list.
MODEL_PATH = 'D:\\Menin\\joblibpipeline\\optimized_lgb_pipeline.pkl'
FEATURE_NAMES_PATH = 'D:\\Menin\\joblibpipeline\\feature_names.pkl'

# ================== Data loading ==================
def load_data(file_path):
    """Read a CSV file, trying several text encodings in turn.

    The encodings 'utf-8-sig', 'gbk' and 'latin1' are tried in that order; the
    first one that decodes without a UnicodeDecodeError is used.

    Args:
        file_path: Path of the CSV file.

    Returns:
        pandas.DataFrame with the file contents; it is guaranteed to have a
        'Smiles' column.

    Raises:
        ValueError: If the decoded file has no 'Smiles' column, or if none of
            the encodings can decode the file.
    """
    encodings = ['utf-8-sig', 'gbk', 'latin1']
    for encoding in encodings:
        try:
            data = pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            # Wrong guess; fall through to the next candidate encoding.
            continue
        if 'Smiles' not in data.columns:
            # The message previously named a non-existent 'Smiiles' column.
            raise ValueError("CSV文件中必须包含'Smiles'列")
        return data
    raise ValueError("无法自动检测文件编码，请使用UTF-8或GBK编码保存文件")

# ================== Feature engineering ==================
def calculate_atom_pair_fingerprint(smiles):
    """Hashed atom-pair fingerprint of one SMILES as a list of 2048 bits.

    Any failure (unparsable SMILES or an exception raised while building the
    fingerprint) is reported with a printed warning and answered with a list
    of 2048 zeros instead of an exception.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            bit_vect = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=2048)
            return list(bit_vect)
        # Route the "could not parse" case through the same warning path.
        raise ValueError("无效的SMILES")
    except Exception as e:
        print(f"警告：SMILES '{smiles}' 处理失败: {str(e)}")
    return [0] * 2048

# ================== Training data ==================
# Read the training CSV through load_data (defined earlier in this cell) so that
# training uses the same encoding fallback and 'Smiles' column check as the
# prediction path in Predictor.predict_from_csv.
file_path = 'D:\\Menin\\excelcsv\\menin200more.csv'
data = load_data(file_path)

# Pull out the SMILES strings and the pIC50 targets
smiles = data['Smiles']
pIC50 = data['pIC50']

# Atom-pair fingerprint featurizer. This cell defines the function twice; this
# later definition is the one in effect when the cell runs, so it keeps the
# warning and error handling of the earlier one.
def calculate_atom_pair_fingerprint(smiles):
    """Hashed atom-pair fingerprint of one SMILES as a list of 2048 bits.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A list of 2048 ints (0/1). If the SMILES cannot be parsed, or building
        the fingerprint raises, a warning is printed and a list of 2048 zeros
        is returned.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError("无效的SMILES")
        fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=2048)
        return list(fp)
    except Exception as e:
        # Keep the zero-vector contract, but never fail silently.
        print(f"警告：SMILES '{smiles}' 处理失败: {str(e)}")
        return [0] * 2048

# Build the feature matrix: one named column per atom-pair fingerprint bit
atom_pair_fps = [calculate_atom_pair_fingerprint(s) for s in smiles]
feature_names = [f'atom_pair_fp_{i}' for i in range(2048)]
X = pd.DataFrame(atom_pair_fps, columns=feature_names)
y = pIC50

# Drop columns that contain NaN
X = X.dropna(axis=1)

# ================== Feature selection ==================
# NOTE(review): the selector is fitted on the full dataset before the
# cross-validation below, so every fold's test rows have already influenced
# which features are kept; the CV scores are therefore likely optimistic.
selector = lgb.LGBMRegressor(n_estimators=100, random_state=42)
selector.fit(X, y)
model = SelectFromModel(selector, prefit=True)
X_filtered = model.transform(X)

# ================== Baseline model (cross-validated) ==================
baseline_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Scorers; greater_is_better=False makes the MSE scorer return negated values
scoring = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score)
}

# Cross-validate the baseline
print("\n" + "="*40 + " 基线模型评估 " + "="*40)
baseline_results = cross_validate(
    baseline_model,
    X_filtered, y,
    cv=cv,
    scoring=scoring,
    return_train_score=False,
    n_jobs=-1
)

# Report baseline MSE and R² (sign of the MSE scores flipped back to positive)
baseline_mse = -np.mean(baseline_results['test_MSE'])
baseline_r2 = np.mean(baseline_results['test_R2'])
print(f"基线模型交叉验证MSE: {baseline_mse:.4f} (±{np.std(-baseline_results['test_MSE']):.4f})")
print(f"基线模型交叉验证R²: {baseline_r2:.4f} (±{np.std(baseline_results['test_R2']):.4f})")

# ================== Hyper-parameter grid search (cross-validated) ==================
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5],
    'num_leaves': [15, 31],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0, 0.1]
}

grid_search = GridSearchCV(
    estimator=lgb.LGBMRegressor(random_state=42),
    param_grid=param_grid,
    scoring=scoring,
    refit='R2',  # select and refit on the best mean R²
    cv=cv,
    n_jobs=-1,
    verbose=1
)

print("\n" + "="*40 + " 超参数优化 " + "="*40)
grid_search.fit(X_filtered, y)

# ================== Best model ==================
# With refit='R2' the search has already refitted best_estimator_ on the full
# data, so no further fit call is needed here.
best_model = grid_search.best_estimator_
print(f"\n最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证R²: {grid_search.best_score_:.4f}")

# ================== Baseline vs. optimized comparison ==================
# Re-score the tuned model with the same folds and scorers as the baseline.
# cross_validate fits clones, so best_model itself is left untouched.
optimized_results = cross_validate(
    best_model,
    X_filtered, y,
    cv=cv,
    scoring=scoring,
    return_train_score=False,
    n_jobs=-1
)

# Report optimized MSE and R² next to the baseline numbers
optimized_mse = -np.mean(optimized_results['test_MSE'])
optimized_r2 = np.mean(optimized_results['test_R2'])
print("\n" + "="*40 + " 性能对比 " + "="*40)
print(f"基线模型交叉验证MSE: {baseline_mse:.4f} (±{np.std(-baseline_results['test_MSE']):.4f})")
print(f"基线模型交叉验证R²: {baseline_r2:.4f} (±{np.std(baseline_results['test_R2']):.4f})")
print(f"优化模型交叉验证MSE: {optimized_mse:.4f} (±{np.std(-optimized_results['test_MSE']):.4f})")
print(f"优化模型交叉验证R²: {optimized_r2:.4f} (±{np.std(optimized_results['test_R2']):.4f})")

# ================== Persist artifacts ==================
# Create the output directory BEFORE the first dump; it used to be created
# only after the model file had already been written into it.
os.makedirs('D:\\Menin\\joblibpipeline', exist_ok=True)

# The dict keys are the ones Predictor reads back.
pipeline = {
    'feature_selector': model,
    'optimized_model': best_model
}
# Write to the same constants Predictor loads from, so the two cannot drift apart.
joblib.dump(pipeline, MODEL_PATH)

# Save the full list of 2048 column names used to rebuild the feature frame at predict time
joblib.dump(feature_names, FEATURE_NAMES_PATH)

# ================== Prediction helper ==================
class Predictor:
    """Loads the saved feature selector + model and predicts pIC50 for new SMILES.

    The model file is expected to hold a dict with the keys 'feature_selector'
    and 'optimized_model'; the feature-names file holds the list of column
    names used to build the fingerprint frame.
    """

    def __init__(self, model_path=MODEL_PATH, feature_names_path=FEATURE_NAMES_PATH):
        """Load the saved artifacts.

        Args:
            model_path: File with the saved {'feature_selector', 'optimized_model'} dict.
            feature_names_path: File with the saved list of feature column names.
        """
        # NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
        self.pipeline = joblib.load(model_path)
        self.feature_names = joblib.load(feature_names_path)

    def preprocess_new_data(self, new_smiles_list):
        """Featurize SMILES and apply the saved feature selector.

        Args:
            new_smiles_list: Iterable of SMILES strings.

        Returns:
            The output of the selector's transform() for the fingerprint frame.
        """
        # Build fingerprints with the same column names used at training time
        new_fps = [calculate_atom_pair_fingerprint(s) for s in new_smiles_list]
        X_new = pd.DataFrame(new_fps, columns=self.feature_names)

        # Keep only the columns chosen by the feature selector
        return self.pipeline['feature_selector'].transform(X_new)

    def batch_predict(self, new_smiles_list):
        """Predict pIC50 for each SMILES, skipping the ones that fail.

        Args:
            new_smiles_list: Iterable of SMILES strings.

        Returns:
            pandas.DataFrame with the columns 'Smiles' and 'Predicted_pIC50',
            one row per successfully predicted input (input order preserved,
            duplicates kept). Failures are printed and left out.
        """
        valid_smiles = []
        predictions = []

        for s in new_smiles_list:
            try:
                # Reject unparsable input here so it is reported as a failure
                # rather than predicted from an all-zero fingerprint.
                mol = Chem.MolFromSmiles(s)
                if mol is None:
                    raise ValueError("无效的SMILES")

                X_processed = self.preprocess_new_data([s])
                pred = self.pipeline['optimized_model'].predict(X_processed)[0]

                valid_smiles.append(s)
                predictions.append(pred)
            except Exception as e:
                print(f"化合物 {s} 预测失败: {str(e)}")

        return pd.DataFrame({
            'Smiles': valid_smiles,
            'Predicted_pIC50': predictions
        })

    def predict_from_csv(self, input_file, output_file):
        """Predict for every row of a CSV file and write the merged result.

        Args:
            input_file: CSV with a 'Smiles' column (read via load_data).
            output_file: Destination CSV, written as UTF-8 with BOM.

        Returns:
            The input frame with a 'Predicted_pIC50' column added; rows whose
            SMILES could not be predicted carry NaN there.

        Raises:
            Exception: Any error from loading, predicting or writing is
                printed and re-raised.
        """
        try:
            # Load the input
            data = load_data(input_file)
            print(f"成功加载 {len(data)} 个化合物")

            # Predict
            results = self.batch_predict(data['Smiles'].tolist())

            # batch_predict returns one row per input row, so a SMILES that
            # occurs k times appears k times on both sides of the join and a
            # plain merge would emit k*k rows. Join against one row per SMILES.
            unique_results = results.drop_duplicates(subset='Smiles')
            full_results = pd.merge(data, unique_results, on='Smiles', how='left')

            # Save
            full_results.to_csv(output_file, index=False, encoding='utf-8-sig')
            print(f"预测完成，有效预测 {len(results)} 个化合物")
            print(f"结果已保存至: {output_file}")

            return full_results
        except Exception as e:
            print(f"预测流程失败: {str(e)}")
            raise

# ================== Usage example ==================
# Demo: load the saved artifacts, predict a few SMILES, then run a CSV end to end.
if __name__ == "__main__":
    # Create the predictor (loads the files at MODEL_PATH / FEATURE_NAMES_PATH)
    predictor = Predictor()
    
    # Ad-hoc prediction; 'InvalidSMILES' is not a parsable SMILES, so it
    # exercises the failure path of batch_predict
    test_smiles = ['CCO', 'CCN', 'InvalidSMILES']
    print("\n单化合物测试:")
    print(predictor.batch_predict(test_smiles))
    
    # File-based prediction
    input_csv = 'D:\\Menin\\excelcsv\\PH.csv'
    output_csv = 'D:\\Menin\\predictions\\optPH.csv'
    
    try:
        results = predictor.predict_from_csv(input_csv, output_csv)
        print("\n前5条预测结果:")
        print(results.head())
    except Exception as e:
        # predict_from_csv re-raises after logging; report and carry on so the
        # demo cell itself does not fail.
        print(f"文件预测失败: {str(e)}")




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002890 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2466
[LightGBM] [Info] Number of data points in the train set: 215, number of used features: 1233
[LightGBM] [Info] Start training from score 6.705628





基线模型交叉验证MSE: 0.3961 (±0.1489)
基线模型交叉验证R²: 0.8051 (±0.0785)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 570
[LightGBM] [Info] Number of data points in the train set: 215, number of used features: 285
[LightGBM] [Info] Start training from score 6.705628

最佳参数: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'num_leaves': 15, 'reg_alpha': 0.1, 'reg_lambda': 0, 'subsample': 0.6}
最佳交叉验证R²: 0.8200
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 570
[LightGBM] [Info] Number of data 

[22:31:12] SMILES Parse Error: syntax error while parsing: InvalidSMILES
[22:31:12] SMILES Parse Error: Failed parsing SMILES 'InvalidSMILES' for input: 'InvalidSMILES'


预测完成，有效预测 35 个化合物
结果已保存至: D:\Menin\predictions\optPH.csv

前5条预测结果:
                                              Smiles  Predicted_pIC50
0  FC(F)(CC1=CC(C(S1)=NC=N2)=C2NC3CCN(CC3)CC4CN(C...         7.493630
1  FC(F)(CC1=CC(C(S1)=NC=N2)=C2NC3CCN(CC3)CC4CN(C...         7.493630
2  FC(F)(CC1=CC(C(S1)=NC=N2)=C2NC3CCN(CC3)CC4CN(C...         7.686397
3  FC(F)(CC1=CC(C(S1)=NC=N2)=C2NC3CCN(CC3)CC4CN(C...         7.686397
4  O=S(C(C=C1)=CC=C1N(C2)CC2CN(CC3)CCC3NC4=NC=NC5...         8.171856


