In [5]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import f1_score
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_extraction.text import TfidfVectorizer
import tqdm, sys, os, gc, re, argparse, warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the raw training and test spreadsheets.
train = pd.read_excel('medicine_data/dataset-new/traindata-new.xlsx')
test = pd.read_excel('medicine_data/dataset-new/testdata-new.xlsx')

# The test data does not contain 'DC50 (nM)' and 'Dmax (%)', so remove them from train.
train = train.drop(columns=['DC50 (nM)', 'Dmax (%)'])

# Remove a feature that is clearly unrelated to the outcome.
irrelevant_cols = ['Article DOI']
train = train.drop(columns=irrelevant_cols)
test = test.drop(columns=irrelevant_cols)

# Collect the columns that have fewer than 10 non-null values in the test set;
# they are mostly missing there and are removed from both frames.
test_non_null_counts = test.notnull().sum()
drop_cols = test_non_null_counts[test_non_null_counts < 10].index.tolist()

train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

# Stack train and test into one frame so feature engineering is applied uniformly.
# Test rows have no 'Label' value after the concat, which is how they are told apart later.
data = pd.concat([train, test], ignore_index=True)

# Every column from the third one onward.
# NOTE(review): presumably this skips 'uuid' and 'Label' -- confirm against the
# actual column order of the spreadsheets.
cols = data.columns[2:]

In [3]:
# Parse every SMILES string with RDKit and write it back out as a SMILES string
# (stereochemistry kept via isomericSmiles=True); one string per row.
data['smiles_list'] = data['Smiles'].apply(
    lambda smi: Chem.MolToSmiles(Chem.MolFromSmiles(smi), isomericSmiles=True)
)

# Compute TF-IDF features over the rewritten SMILES strings (train and test together).
# NOTE(review): no tokenizer/analyzer/lowercase options are passed, so the vectorizer
# uses its natural-language text defaults -- confirm this tokenisation is what is
# wanted for SMILES.
tfidf = TfidfVectorizer(max_df=0.9, min_df=1, sublinear_tf=True)
res = tfidf.fit_transform(data['smiles_list'])

# Densify the sparse matrix into a DataFrame with one named column per TF-IDF term.
tfidf_df = pd.DataFrame(
    res.toarray(),
    columns=[f'smiles_tfidf_{idx}' for idx in range(res.shape[1])],
)

# Append the TF-IDF columns to data column-wise (both frames share a 0..n-1 index).
data = pd.concat([data, tfidf_df], axis=1)

# Ordinal (natural-number) encoding
def label_encode(series):
    """Encode the distinct values of ``series`` as integers 0, 1, 2, ...

    Codes are assigned in order of first appearance. Missing values are not
    given a code; they stay missing (NaN) in the result.

    The mapping is built from the non-missing values only. Pairing
    ``series.unique()`` (which includes NaN) with ``range(series.nunique())``
    (which does not count NaN) would produce one code fewer than there are
    values, so the last distinct value would silently lose its code and be
    turned into NaN.

    Parameters
    ----------
    series : pandas.Series
        Column to encode; its values must be hashable.

    Returns
    -------
    pandas.Series
        Series with the same index as ``series`` holding the integer codes
        (float dtype with NaN when the input contains missing values).
    """
    categories = series.dropna().unique()
    mapping = {value: code for code, value in enumerate(categories)}
    return series.map(mapping)

# Integer-encode every object-typed column among the originally selected columns.
object_cols = [col for col in cols if data[col].dtype == 'object']
for col in object_cols:
    data[col] = label_encode(data[col])

# Rows with a label are the training set; rows without one are the test set.
has_label = data['Label'].notnull()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

# Feature selection: everything except the identifier, the target and the
# helper SMILES text column.
non_feature_cols = ['uuid', 'Label', 'smiles_list']
features = [name for name in train.columns if name not in non_feature_cols]

# Build the training and test feature matrices.
x_train = train[features]
x_test = test[features]

# Training labels as integers.
y_train = train['Label'].astype(int)

In [4]:
def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2022):
    """Run 5-fold cross-validation with ``clf`` and return out-of-fold and test predictions.

    For every fold a new model is created with ``clf(iterations=20000, ...)``,
    fitted on the training part with the held-out part as ``eval_set``, and used
    to predict positive-class probabilities for the held-out rows and for
    ``test_x``. Test predictions are averaged over the folds. The per-fold F1
    score (threshold 0.5) is printed along with its mean and standard deviation.

    Note: the held-out fold is also the ``eval_set`` that drives early stopping
    and ``use_best_model``, so the printed F1 scores are measured on data that
    influenced the number of iterations kept.

    Parameters
    ----------
    clf : callable
        Classifier class or factory accepting the CatBoost-style keyword
        arguments used below; instances must provide ``fit`` and ``predict_proba``.
    train_x : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    train_y : array-like
        Training labels, aligned with the rows of ``train_x`` by position.
    test_x : pandas.DataFrame
        Test features.
    clf_name : str
        Prefix used in the printed score summary.
    seed : int, default 2022
        Seed for the fold shuffling and for the model's ``random_seed``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_pred, test_pred)``: out-of-fold probabilities for each training
        row and fold-averaged probabilities for each test row.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)

    # KFold yields positional indices. Indexing a Series with them via
    # ``train_y[idx]`` is label-based and only works with a default RangeIndex,
    # so take a positional array view of the labels instead.
    y_values = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    test_pred = np.zeros(test_x.shape[0])
    cv_scores = []

    # Model hyper-parameters are the same for every fold, so build them once.
    params = {'learning_rate': 0.1, 'depth': 6, 'l2_leaf_reg': 10, 'bootstrap_type':'Bernoulli','random_seed':seed,
              'od_type': 'Iter', 'od_wait': 100, 'allow_writing_files': False, 'task_type':'CPU'}

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, y_values)):
        print('************************************ {} {}************************************'.format(str(i+1), str(seed)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = y_values[train_index], y_values[valid_index]

        model = clf(iterations=20000, **params, eval_metric='AUC')
        model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                  metric_period=100,
                  cat_features=[],
                  use_best_model=True,
                  verbose=1)

        # Probability of the positive class.
        val_pred = model.predict_proba(val_x)[:, 1]
        fold_test_pred = model.predict_proba(test_x)[:, 1]

        oof_pred[valid_index] = val_pred
        # Each fold contributes an equal share to the averaged test prediction.
        test_pred += fold_test_pred / kf.n_splits
        cv_scores.append(f1_score(val_y, np.where(val_pred>0.5, 1, 0)))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred
    
# Cross-validate CatBoost: cat_train holds out-of-fold probabilities,
# cat_test the fold-averaged test probabilities.
cat_train, cat_test = cv_model(CatBoostClassifier, x_train, y_train, x_test, "cat")

# Turn test probabilities into 0/1 labels at a 0.5 threshold and write the
# submission file without the index column.
submission = pd.DataFrame({
    'uuid': test['uuid'],
    'Label': np.where(cat_test > 0.5, 1, 0),
})
submission.to_csv('submit.csv', index=None)

************************************ 1 2022************************************




0:	test: 0.7630471	best: 0.7630471 (0)	total: 92.9ms	remaining: 30m 58s
100:	test: 0.8956229	best: 0.9023569 (16)	total: 517ms	remaining: 1m 41s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9023569024
bestIteration = 16

Shrink model to first 17 iterations.
[0.8686868686868686]
************************************ 2 2022************************************




0:	test: 0.7300725	best: 0.7300725 (0)	total: 5.01ms	remaining: 1m 40s
100:	test: 0.9456522	best: 0.9456522 (100)	total: 394ms	remaining: 1m 17s
200:	test: 0.9619565	best: 0.9628623 (160)	total: 795ms	remaining: 1m 18s
300:	test: 0.9673913	best: 0.9682971 (269)	total: 1.21s	remaining: 1m 19s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9682971014
bestIteration = 269

Shrink model to first 270 iterations.
[0.8686868686868686, 0.9361702127659574]
************************************ 3 2022************************************




0:	test: 0.7331175	best: 0.7331175 (0)	total: 4.82ms	remaining: 1m 36s
100:	test: 0.9296947	best: 0.9333950 (77)	total: 426ms	remaining: 1m 23s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9333950046
bestIteration = 77

Shrink model to first 78 iterations.
[0.8686868686868686, 0.9361702127659574, 0.8888888888888888]
************************************ 4 2022************************************




0:	test: 0.7751020	best: 0.7751020 (0)	total: 5.8ms	remaining: 1m 55s
100:	test: 0.8938776	best: 0.8946939 (87)	total: 417ms	remaining: 1m 22s
200:	test: 0.8979592	best: 0.8979592 (144)	total: 847ms	remaining: 1m 23s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8979591837
bestIteration = 144

Shrink model to first 145 iterations.
[0.8686868686868686, 0.9361702127659574, 0.8888888888888888, 0.8571428571428571]
************************************ 5 2022************************************




0:	test: 0.7835145	best: 0.7835145 (0)	total: 5.33ms	remaining: 1m 46s
100:	test: 0.9538043	best: 0.9592391 (75)	total: 405ms	remaining: 1m 19s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9592391304
bestIteration = 75

Shrink model to first 76 iterations.
[0.8686868686868686, 0.9361702127659574, 0.8888888888888888, 0.8571428571428571, 0.9278350515463919]
cat_score_list: [0.8686868686868686, 0.9361702127659574, 0.8888888888888888, 0.8571428571428571, 0.9278350515463919]
cat_score_mean: 0.8957447758061928
cat_score_std: 0.03141096432159168
