In [1]:
# 选择镜像：unimol-qsar:v0.5, 机型选择GPU
# 导入unimol
from unimol import UniMolRepr
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### 多分子输入做大指纹

In [None]:
clf = UniMolRepr(data_type='molecule', remove_hs=False)

# Load dataset
filename = 'truncated_MON.csv'
df = pd.read_csv(filename)
y = df['TARGET'].to_list()

# SMILES column names
smiles_fields = ['SMILES_1', 'SMILES_2', 'SMILES_3', 'SMILES_4']

# Initialize a new dataframe
big_fingerprint_df = pd.DataFrame()

# Initialize DataFrame, filled with NaN
# Assume that each SMILES generated representation has 512 dimensions
num_repr_dimensions = 512
big_fingerprint_df = pd.DataFrame(np.nan, index=np.arange(len(df)), columns=np.arange(len(smiles_fields) * num_repr_dimensions))

# for each SMILES column
for field_idx, field in enumerate(smiles_fields):
    smiles_list = df[field].to_list()
    
    # if SMILES col is not empty
    non_empty_smiles = [smiles for smiles in smiles_list if pd.notna(smiles) and smiles != '']
    if non_empty_smiles:
        repr_dict = clf.get_repr(non_empty_smiles)
        unimol_repr_list = np.array(repr_dict['cls_repr'])
        
        # columns range which should be processed
        col_start = field_idx * num_repr_dimensions
        col_end = (field_idx + 1) * num_repr_dimensions
        
        # Only update rows corresponding to non-empty SMILES
        for i, smiles in enumerate(smiles_list):
            if smiles in non_empty_smiles:
                big_fingerprint_df.iloc[i, col_start:col_end] = unimol_repr_list[non_empty_smiles.index(smiles)]


# Append到大指纹DataFrame
big_fingerprint_df['TARGET'] = y
big_fingerprint_df.to_csv('Uni-fingerprint_'+filename)

### 多分子带权指纹

In [2]:
clf = UniMolRepr(data_type='molecule', remove_hs=False)

# Load dataset
filename = 'truncated_MON.csv'
df = pd.read_csv(filename)
y = df['TARGET'].to_list()

# SMILES and Ratio column names
smiles_fields = ['SMILES_1', 'SMILES_2', 'SMILES_3', 'SMILES_4']
ratio_fields = ['Ratio_0', 'Ratio_1', 'Ratio_2', 'Ratio_3']  # Corresponding ratio fields

# Initialize DataFrame, filled with NaN
# Assume that each SMILES generated representation has 512 dimensions
num_repr_dimensions = 512
big_fingerprint_df = pd.DataFrame(np.nan, index=np.arange(len(df)), columns=np.arange(len(smiles_fields) * num_repr_dimensions))

# for each (SMILES column and its Ratio) column (pair)s
for field_idx, (smiles_field, ratio_field) in enumerate(zip(smiles_fields, ratio_fields)):
    smiles_list = df[smiles_field].to_list()
    ratio_list = df[ratio_field].to_list()  # 获取对应的加权比例列表
    
    # if SMILES col is not empty
    non_empty_smiles = [(smiles, ratio) for smiles, ratio in zip(smiles_list, ratio_list) if pd.notna(smiles) and smiles != '']
    if non_empty_smiles:
        # Separate the SMILES and its ratio
        valid_smiles, valid_ratios = zip(*non_empty_smiles)
        repr_dict = clf.get_repr(list(valid_smiles))
        unimol_repr_list = np.array(repr_dict['cls_repr'])
        
        # columns range which should be processed
        col_start = field_idx * num_repr_dimensions
        col_end = (field_idx + 1) * num_repr_dimensions
        
        # Update non-empty SMILES and apply weighted ratios
        for i, (smiles, ratio) in enumerate(zip(smiles_list, ratio_list)):
            if smiles in valid_smiles:
                weighted_repr = unimol_repr_list[valid_smiles.index(smiles)] * ratio  # 应用加权
                big_fingerprint_df.iloc[i, col_start:col_end] = weighted_repr

# Append到大指纹DataFrame
big_fingerprint_df /= 100.0
big_fingerprint_df['TARGET'] = y
big_fingerprint_df.to_csv('Uni-fingerprint_'+filename)

2024-02-29 22:48:14 | unimol/models/unimol.py | 116 | INFO | Uni-Mol(QSAR) | Loading pretrained weights from /opt/conda/lib/python3.8/site-packages/unimol-0.0.2-py3.8.egg/unimol/weights/mol_pre_all_h_220816.pt
2024-02-29 22:48:17 | unimol/data/conformer.py | 62 | INFO | Uni-Mol(QSAR) | Start generating conformers...
33it [00:00, 137.63it/s]
2024-02-29 22:48:17 | unimol/data/conformer.py | 66 | INFO | Uni-Mol(QSAR) | Failed to generate conformers for 0.00% of molecules.
2024-02-29 22:48:17 | unimol/data/conformer.py | 68 | INFO | Uni-Mol(QSAR) | Failed to generate 3d conformers for 0.00% of molecules.
100%|██████████| 2/2 [00:00<00:00,  4.17it/s]
2024-02-29 22:48:18 | unimol/data/conformer.py | 62 | INFO | Uni-Mol(QSAR) | Start generating conformers...
33it [00:00, 142.55it/s]
2024-02-29 22:48:18 | unimol/data/conformer.py | 66 | INFO | Uni-Mol(QSAR) | Failed to generate conformers for 0.00% of molecules.
2024-02-29 22:48:18 | unimol/data/conformer.py | 68 | INFO | Uni-Mol(QSAR) | Faile

## XGBoost给出预测结果

In [36]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import torch

# 分割数据为特征集(X)和目标变量(y)
big_fingerprint_df = pd.read_csv('Uni-fingerprint_truncated_MON.csv')
X = big_fingerprint_df.drop(['TARGET'], axis=1)
y = big_fingerprint_df['TARGET']

# 分割数据为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 初始化XGBoost回归模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'{device} is available.')
xg_reg = xgb.XGBRegressor(
    n_estimators = 400, objective ='reg:squarederror', colsample_bytree = 0.3, 
    learning_rate = 0.03, max_depth = 5, alpha = 10,
    tree_method='gpu_hist' if device.type == 'cuda' else 'auto'
    )

# 训练模型
xg_reg.fit(X_train, y_train)

# 预测测试集
y_pred = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("RMSE: %f" % (rmse))

# 使用整个数据集进行预测
y_pred_full = xg_reg.predict(X)
df['pred_y'] = y_pred_full
df.to_csv('XGB_truncated_MON.csv')

cuda is available.
RMSE: 2.433807


In [4]:
import pandas as pd
import numpy as np

# 读取数据集
data = pd.read_csv('XGB_test_MON.csv')  # 请将'your_dataset.csv'替换为你的数据集文件路径

# 提取QSPR_pred列和TARGET列
QSPR_pred = data['QSPR_pred']
TARGET = data['TARGET']

# 计算均方根误差（RMSE）
rmse = np.sqrt(((QSPR_pred - TARGET) ** 2).mean())

print("RMSE:", rmse)


RMSE: 1.5633347033037381
