In [106]:
import pandas as pd
import numpy as np
import openpyxl
import os
import deepchem as dc
from sklearn.preprocessing import MinMaxScaler
# from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_squared_log_error, mean_absolute_error, r2_score
import sklearn.metrics


In [107]:
def get_brainblood_csv(workbookpath, csvfilepath):
    excel_df = pd.read_excel(workbookpath, index_col=[0, 1], engine='openpyxl')
    # column_list = excel_df.columns.to_list()
    # print(column_list)
    blood_df = excel_df.loc[:, excel_df.columns.str.startswith('blood mean')]
    brain_df = excel_df.loc[:, excel_df.columns.str.startswith('brain mean')]
    print(blood_df.columns.to_list())
    print(brain_df.columns.to_list())
    df = pd.concat([blood_df, brain_df], axis=1)
    df.to_csv(csvfilepath, encoding='utf-8')

In [108]:
def calculate_blood_brain_ratio(raw_csvfilepath, ratio_csvfilepath):
    raw_df = pd.read_csv(raw_csvfilepath, index_col=[0, 1])
    blood_df = raw_df.loc[:, raw_df.columns.str.startswith('blood mean')]
    brain_df = raw_df.loc[:, raw_df.columns.str.startswith('brain mean')]
    # 以{(化合物文献号，SMILE) -> {浓度数据}}的格式存储数据
    compound_ratio = dict()
    for index, blood_row_data in blood_df.iterrows():
        # 血液行数据
        blood_row_data = blood_row_data.dropna()
        # 脑部行数据
        brain_row_data = brain_df.loc[index[0]].dropna(axis=1, how='all')
        # 任意一个器官内数据为空，跳过
        if brain_row_data.empty or blood_row_data.empty:
            continue
        else:
            # 以{(时间) -> (脑血浓度比)}的格式存储数据
            ratio2time = dict()
            # 转换series为dataframe
            blood_row_data = blood_row_data.to_frame()
            blood_row_data = pd.DataFrame(blood_row_data.values.T, columns=blood_row_data.index)

            for column in blood_row_data.columns.to_list():
                # 获取血液浓度
                blood_num = float(blood_row_data[column].values[0])
                # 拆分列头以获取时间点，组合成脑部浓度数据时间点
                tgt_col = 'brain ' + column.split(" ")[1]
                # 判断该脑部数据时间点是否存在
                if tgt_col in brain_row_data.columns.to_list():
                    # 获取脑部浓度
                    brain_num = float(brain_row_data[tgt_col].values[0])
                    brainbloodratio = brain_num / blood_num
                    # 按照脑部浓度、血液浓度和脑血浓度比3种数据以列表格式保存到字典中
                    ratio2time[column.split(" ")[1].replace('mean', '')] = [brain_num, blood_num, brainbloodratio]
        # kv[1][2]指定为以脑血浓度比进行降序排序
        sorted_data = sorted(ratio2time.items(), key=lambda kv: (kv[1][2], kv[0]), reverse=True)
        # print(sorted_data)
        # 获取最大脑血浓度比的数据
        compound_ratio[index] = sorted_data[0]
    # 将字典转换成Dataframe所需的列表格式
    max_ratio_list = []
    for key, value in compound_ratio.items():
        index = key[0]
        smiles = key[1]
        time = value[0]
        brain_num = value[1][0]
        blood_num = value[1][1]
        ratio = value[1][2]
        max_ratio_list.append([index, smiles, brain_num, blood_num, ratio, time])
    df = pd.DataFrame(data=max_ratio_list, columns=['Compound index', 'SMILES', 'Brain', 'Blood', 'Brain/Blood', 'Reach time'])

    #     # 降序排序并获取第一个最大值
    #     compound_ratio[index] = sorted(ratio2time.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)[0]
    # # 将字典转换成Dataframe所需的列表格式
    # max_ratio_list = []
    # for key, value in compound_ratio.items():
    #     index = key[0]
    #     smiles = key[1]
    #     time = value[0]
    #     ratio = value[1]
    #     max_ratio_list.append([index, smiles, ratio, time])
    # df = pd.DataFrame(data=max_ratio_list, columns=['Compound index', 'SMILES', 'Max(Brain/Blood)', 'Reach time'])
    # print(df)
    df.to_csv(ratio_csvfilepath, index=False)

In [109]:
def calculate_desc(srcfile, dstfile):
    df = pd.read_csv(srcfile)
    featurizer = dc.feat.MordredDescriptors(ignore_3D=True)
    SMILES = df['SMILES']
    X = []
    for smiles in SMILES:
        X.append(featurizer.featurize(smiles)[0])
    blood = df['Blood']
    brain = df['Brain']
    ratio = df['Brain/Blood']
    df = pd.DataFrame(data=X)
    df.insert(0, 'SMILES', SMILES)
    df.insert(1, 'Blood', blood)
    df.insert(2, 'Brain', brain)
    df.insert(3, 'Ratio', ratio)
    df.to_csv(dstfile, index=False)

def get_X_Y(csvfile):
    df = pd.read_csv(csvfile)
    X = df.drop(['SMILES', 'Blood', 'Brain', 'Ratio'], axis=1)
    X = MinMaxScaler().fit_transform(X)
    # print(len(X))
    blood_y = df['Blood'].ravel()
    brain_y = df['Brain'].ravel()
    ratio_y = df['Ratio'].ravel()
    SMILES = df['SMILES']
    return pd.DataFrame(X), blood_y, brain_y, ratio_y, SMILES


In [110]:
filetime = "20221221"
# 原始的集成数据集
workbookpath = f"./result/{filetime}/数据表汇总.xlsx"
# 从原始数据集中挑选出脑部与血液浓度的数据集
raw_csvfilepath = f"./result/{filetime}/BrainBlood.csv"
# 计算得到最大脑血比的数据集
ratio_csvfilepath = f"./result/{filetime}/MaxBrainBloodRatio.csv"
# 计算出药物的Mordred描述符以及最大脑血比的数据集
desc_csvfilepath = f"./result/{filetime}/RatioDescriptors.csv"
generate_new_data = [False, False, False]
regressor_type = 'LGBM'

print("Running...")
if not os.path.exists(raw_csvfilepath) or generate_new_data[0]:
    print("Getting blood brain file...")
    get_brainblood_csv(workbookpath, raw_csvfilepath)

if not os.path.exists(ratio_csvfilepath) or generate_new_data[1]:
    print("Calculating blood brain ratio...")
    calculate_blood_brain_ratio(raw_csvfilepath, ratio_csvfilepath)

if not os.path.exists(desc_csvfilepath) or generate_new_data[2]:
    print("Calculating descriptors...")
    calculate_desc(ratio_csvfilepath, desc_csvfilepath)

X, blood_y, brain_y, ratio_y, SMILES = get_X_Y(desc_csvfilepath)
feature_select = True
if feature_select:
    # 特征筛选
    # blood_fea = [162, 222, 254, 255, 261, 300, 320, 325, 338, 369, 396, 441, 446, 474, 481, 489, 502, 514, 529, 530, 541, 549, 565, 568, 570, 582, 594, 598, 602, 631, 632, 638, 645, 646, 648, 802, 807, 832, 986, 1145, 1226, 1232, 1266, 1287, 1289, 1297, 1316, 1356, 1539, 1544]
    blood_fea = [5, 10, 60, 76, 111, 129, 132, 142, 150, 151, 162, 164, 168, 183, 184, 186, 196, 203, 220, 222, 223, 230, 231, 243, 246, 247, 254, 255, 261, 264, 270, 289, 300, 310, 311, 320, 323, 325, 333, 334, 337, 338, 350, 359, 368, 369, 396, 397, 404, 405, 414, 422, 441, 446, 449, 452, 453, 458, 465, 466, 474, 481, 489, 497, 498, 502, 506, 513, 514, 522, 529, 530, 534, 540, 541, 542, 546, 549, 561, 565, 566, 567, 568, 570, 572, 581, 582, 589, 594, 597, 598, 599, 600, 602, 604, 610, 618, 626, 629, 630, 631, 632, 637, 638, 642, 644, 645, 646, 648, 665, 781, 788, 791, 802, 803, 807, 814, 816, 825, 832, 838, 846, 847, 848, 978, 981, 986, 987, 999, 1055, 1057, 1065, 1072, 1086, 1134, 1136, 1139, 1143, 1144, 1145, 1156, 1157, 1161, 1163, 1165, 1209, 1211, 1217, 1219, 1221, 1226, 1228, 1232, 1235, 1236, 1238, 1245, 1249, 1250, 1252, 1253, 1257, 1259, 1262, 1263, 1266, 1278, 1281, 1282, 1286, 1287, 1289, 1297, 1301, 1306, 1309, 1311, 1314, 1319, 1325, 1328, 1330, 1333, 1337, 1356, 1357, 1358, 1360, 1362, 1377, 1382, 1523, 1524, 1539, 1540, 1541, 1543, 1544, 1545, 1547]
    # brain_fea = [3, 40, 150, 164, 243, 246, 254, 255, 261, 310, 342, 368, 369, 449, 450, 458, 497, 506, 529, 542, 549, 578, 602, 604, 610, 618, 637, 642, 644, 646, 770, 781, 801, 814, 846, 986, 999, 1065, 1078, 1136, 1143, 1157, 1278, 1316, 1329, 1330, 1336, 1543, 1545, 1547]
    brain_fea = [3, 10, 36, 38, 39, 40, 57, 60, 69, 75, 76, 78, 83, 123, 136, 138, 141, 142, 143, 146, 150, 162, 163, 164, 168, 178, 186, 194, 195, 212, 213, 214, 222, 230, 232, 243, 246, 254, 255, 261, 264, 270, 289, 310, 319, 325, 333, 337, 338, 341, 342, 350, 368, 369, 396, 414, 423, 441, 446, 449, 450, 452, 457, 458, 465, 466, 474, 481, 497, 498, 506, 514, 522, 529, 530, 534, 541, 542, 546, 549, 561, 566, 567, 568, 570, 572, 574, 578, 581, 589, 597, 598, 602, 604, 610, 618, 626, 630, 631, 632, 637, 638, 642, 644, 645, 646, 648, 665, 770, 780, 781, 791, 801, 807, 812, 814, 815, 816, 825, 831, 835, 845, 846, 981, 986, 999, 1055, 1060, 1065, 1077, 1078, 1084, 1086, 1136, 1139, 1143, 1144, 1151, 1157, 1162, 1163, 1165, 1209, 1211, 1217, 1219, 1221, 1226, 1228, 1232, 1233, 1234, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1253, 1256, 1259, 1266, 1278, 1281, 1285, 1286, 1287, 1289, 1297, 1302, 1306, 1314, 1316, 1319, 1325, 1329, 1330, 1336, 1339, 1352, 1356, 1357, 1360, 1362, 1376, 1523, 1524, 1532, 1533, 1538, 1539, 1540, 1541, 1543, 1544, 1545, 1546, 1547, 1548]
    blood_X = X.iloc[:, blood_fea]
    brain_X = X.iloc[:, brain_fea]
else:
    blood_X = X
    brain_X = X

print("Start training model...")


Running...
Start training model...


In [111]:
def train_model(X, y, model, cv_times=5, callback=None):
    cv = KFold(n_splits=cv_times, shuffle=True)

    r2_scores = np.empty(cv_times)
    rmse_scores = np.empty(cv_times)
    
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model.fit(X_train, y_train, eval_set=[
            (X_test, y_test)], callbacks=callback) # early_stopping_rounds=100, verbose=False,
        preds = model.predict(X_test)

        r2 = r2_score(y_test, preds)
        r2_scores[idx] = r2

        rmse = np.sqrt(mean_squared_error(y_test, preds))
        rmse_scores[idx] = rmse
    # print(cv_scores)
    # print("R2 Scores: %0.4f (+/- %0.2f)" %
    #     (r2_scores.mean(), r2_scores.std()))
    # print("RMSE Scores: %0.4f (+/- %0.2f)" %
    #     (rmse_scores.mean(), rmse_scores.std()))
    # score = cross_val_score(clf, X, y, cv=cv_times, scoring='r2')
    return r2_scores, rmse_scores


In [112]:
"""params = {
	'n_estimators': 1850,
	'learning_rate': 0.016,
	'max_depth': 11,
	'lambda': 0.9550839455019401,
	'alpha': 7.1136077576753936,
	'min_child_weight': 16,
	'gamma': 7,
	'colsample_bytree': 0.1,
	'colsample_bylevel': 1.0,
	'colsample_bynode': 1.0,

	R2 Scores: 0.5456 (+/- 0.20)
	RMSE Scores: 108.6056 (+/- 25.18)
}"""

"""
Blood data:
R2 Scores: 0.3749 (+/- 0.29)
RMSE Scores: 4.1023 (+/- 1.46)
Brain data:
R2 Scores: 0.5248 (+/- 0.27)
RMSE Scores: 56.1579 (+/- 29.28)
"""
if regressor_type != 'LGBM':
	blood_params = {
		'n_estimators': 1850,
		'learning_rate': 0.016,
		'max_depth': 30,
		'lambda': 0.21396204986950074,
		'alpha': 9.722159852062028,
		'min_child_weight': 9,
		'gamma': 0,
		'colsample_bytree': 0.8,
		'colsample_bylevel': 0.9,
		'colsample_bynode': 0.3,
	}

	brain_params = {
		'n_estimators': 1400,
		'learning_rate': 0.03,
		'max_depth': 2,
		'lambda': 0.07718586294091904,
		'alpha': 2.5070868940026005,
		'min_child_weight': 6,
		'gamma': 1,
		'colsample_bytree': 0.1,
		'colsample_bylevel': 0.9,
		'colsample_bynode': 0.7,
	}
else:	# LightGBM
	blood_params = {
		'boosting_type': 'dart',
		'max_depth': 27,
		'learning_rate': 0.012,
		'n_estimators': 1950,
		'objective': 'regression',
		'min_child_samples': 7,
		'reg_lambda': 8.742591593419672,
		'reg_alpha': 0.03613741302435425,
	}

	brain_params = {
		'boosting_type': 'gbdt',
		'max_depth': 16,
		'learning_rate': 0.025,
		'n_estimators': 2550,
		'objective': 'regression',
		'min_child_samples': 28,
		'reg_lambda': 0.08538775926146094,
		'reg_alpha': 0.07632249600835359,
	}

if regressor_type != 'LGBM':
	blood_model = XGBRegressor(**blood_params)
	blood_r2_scores, blood_rmse_scores = train_model(blood_X, blood_y, blood_model)

	brain_model = XGBRegressor(**brain_params)
	brain_r2_scores, brain_rmse_scores = train_model(brain_X, brain_y, brain_model)
else:
	callbacks = [lgb.log_evaluation(period=0)]
	blood_model = lgb.sklearn.LGBMRegressor(**blood_params)
	blood_r2_scores, blood_rmse_scores = train_model(blood_X, blood_y, blood_model, callback=callbacks)

	brain_model = lgb.sklearn.LGBMRegressor(**brain_params)
	brain_r2_scores, brain_rmse_scores = train_model(brain_X, brain_y, brain_model, callback=callbacks)

print("Blood data:")
print("R2 Scores: %0.4f (+/- %0.2f)" %
    (blood_r2_scores.mean(), blood_r2_scores.std()))
print("RMSE Scores: %0.4f (+/- %0.2f)" %
    (blood_rmse_scores.mean(), blood_rmse_scores.std()))

print("Brain data:")
print("R2 Scores: %0.4f (+/- %0.2f)" %
    (brain_r2_scores.mean(), brain_r2_scores.std()))
print("RMSE Scores: %0.4f (+/- %0.2f)" %
    (brain_rmse_scores.mean(), brain_rmse_scores.std()))
# cv_times = 5
# cv = KFold(n_splits=cv_times, shuffle=True)

# r2_scores = np.empty(cv_times)
# rmse_scores = np.empty(cv_times)
# for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = y[train_idx], y[test_idx]

#     model = XGBRegressor(**params)
#     model.fit(X_train, y_train, eval_set=[
#         (X_test, y_test)], early_stopping_rounds=100, verbose=False)
#     preds = model.predict(X_test)

#     r2 = r2_score(y_test, preds)
#     r2_scores[idx] = r2

#     rmse = np.sqrt(mean_squared_error(y_test, preds))
#     rmse_scores[idx] = rmse
# # print(cv_scores)
# print("R2 Scores: %0.4f (+/- %0.2f)" %
#       (r2_scores.mean(), r2_scores.std()))
# print("RMSE Scores: %0.4f (+/- %0.2f)" %
#       (rmse_scores.mean(), rmse_scores.std()))


# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# # smaller, better
# MSE = mean_squared_error(y_test, y_pred)
# # MSLE = mean_squared_log_error(y_test, y_pred)
# MAE = mean_absolute_error(y_test, y_pred)
# Median = median_absolute_error(y_test, y_pred)
# # closer to 1 means better
# r2 = r2_score(y_test, y_pred)
# print("MSE: ", MSE)
# print("RMSE: ", np.sqrt(MSE))
# # print("MSLE: ", MSLE)
# print("MAE: ", MAE)
# print("Median: ", Median)
# print("r2: ", r2)

# ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted', 'top_k_accuracy', 'v_measure_score']


Blood data:
R2 Scores: 0.4404 (+/- 0.15)
RMSE Scores: 4.6320 (+/- 1.34)
Brain data:
R2 Scores: -21.4640 (+/- 43.60)
RMSE Scores: 72.3808 (+/- 22.98)
