In [1]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)
#接近完美

In [2]:
# desc_data=pd.read_excel("Molecular_Descriptor.xlsx")
#admet_data=pd.read_excel("化合物ADMET.xlsx")
# full_df=pd.merge(desc_data,admet_data,on='SMILES')
# full_df.to_csv("MD_to_ADMET.csv",index=False)

In [3]:
data_df = pd.read_csv("MD_to_ADMET.csv")
data_df.shape

(1974, 735)

In [4]:
target_label = ['Caco-2', 'CYP3A4', 'hERG', 'HOB', 'MN','SMILES']

In [5]:
X = data_df.drop(labels=target_label,axis=1)
y = data_df['HOB']

In [6]:
feature_names = X.columns

### 1.分类器准备 XGboost RF随机森林 lightGBM

### 1.1 数据划分

In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

#为了减少误差
# std1 = MinMaxScaler()
# X=std1.fit_transform(X)
# std2 = MinMaxScaler()
# y=std2.fit_transform(y.values.reshape(-1,1))

#训练集 和 验证集 9:1   ;仅仅是分类器
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=666) #这里改名字
#pd.DataFrame(x_train).head()

### 1.2 评价指标

In [8]:
#1.2 评价指标
from sklearn.metrics import confusion_matrix,f1_score,recall_score,accuracy_score,precision_score

def print_score(model,x_train,y_train,x_test,y_test):
    print(model.score(x_train,y_train))
    print(model.score(x_test,y_test))

#误报率的计算
def false_alarm_rate(cm):
    tn, fp, fn, tp = cm.ravel()
    far=fp/(fp+tn)#误报率
    return far

#返回一串字典 输入真实值和预测值
def calc_metrics(true, pred):
    #传进来都是df,查准率precison不关注
    cm=confusion_matrix(true, pred)
    far=false_alarm_rate(cm)*100
    accuracy=accuracy_score(true, pred)*100
    recall=recall_score(true, pred)*100
    f1= f1_score(true, pred)*100
    precise = precision_score(true, pred)*100
    return {'far':far,'precision':precise,'acc':accuracy,'recall':recall,'f1':f1}



In [9]:
# 1.3 写一个function 返回sklearn中 特征重要性排名和分数 的 dataframe
def get_classifer_ranked_feature(clf,feature_names):
    name_im = pd.DataFrame({'importance':clf.feature_importances_,'var':feature_names})
    name_im = name_im.sort_values(by='importance',ascending=False)
    return name_im

In [29]:
%%time
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV


rfc = RandomForestClassifier()
xgbc =XGBClassifier(n_estimators=300,max_depth=6,use_label_encoder=False)
#n_estimators=100,max_depth=6, n_jobs=-1,learning_rate=0.15

#针对light-gbm
trn_data = lgb.Dataset(x_train, label = y_train)
val_data = lgb.Dataset(x_test, label = y_test)
lgbm = lgb.LGBMClassifier()

rfc.fit(x_train,y_train)
xgbc.fit(x_train,y_train)
lgbm.fit(x_train,y_train)

Wall time: 6.91 s


LGBMClassifier()

In [34]:
import joblib
joblib.dump(lgbm, 'hob_lgbm.model')

['hob_lgbm.model']

In [30]:
# print_score(xgbc,x_train,y_train,x_test,y_test)
# print_score(rfc,x_train,y_train,x_test,y_test)
# print_score(lgbm,x_train,y_train,x_test,y_test)

In [31]:
trn_pred = lgbm.predict(x_train)
tes_pred =lgbm.predict(x_test)
a=calc_metrics(y_train,trn_pred)
b=calc_metrics(y_test,tes_pred)
print('lgbm')
print(a)
print(b)

lgbm
{'far': 0.0, 'precision': 100.0, 'acc': 100.0, 'recall': 100.0, 'f1': 100.0}
{'far': 7.096774193548387, 'precision': 75.55555555555556, 'acc': 89.8989898989899, 'recall': 79.06976744186046, 'f1': 77.27272727272727}


In [32]:
trn_pred = xgbc.predict(x_train)
tes_pred =xgbc.predict(x_test)
a=calc_metrics(y_train,trn_pred)
b=calc_metrics(y_test,tes_pred)
print('xgbc')
print(a)
print(b)

xgbc
{'far': 0.0, 'precision': 100.0, 'acc': 100.0, 'recall': 100.0, 'f1': 100.0}
{'far': 7.741935483870968, 'precision': 73.33333333333333, 'acc': 88.88888888888889, 'recall': 76.74418604651163, 'f1': 74.99999999999999}


In [33]:
trn_pred = rfc.predict(x_train)
tes_pred =rfc.predict(x_test)
a=calc_metrics(y_train,trn_pred)
b=calc_metrics(y_test,tes_pred)
print('rfc')
print(a)
print(b)

rfc
{'far': 0.0, 'precision': 100.0, 'acc': 100.0, 'recall': 100.0, 'f1': 100.0}
{'far': 8.38709677419355, 'precision': 69.76744186046511, 'acc': 86.86868686868688, 'recall': 69.76744186046511, 'f1': 69.76744186046511}


# 3.选重要的特征

In [16]:
xgbc_fea = get_classifer_ranked_feature(xgbc,feature_names)
rfc_fea = get_classifer_ranked_feature(rfc,feature_names)

In [17]:
xgbc_fea.head(20)

Unnamed: 0,importance,var
134,0.045075,nHCsatu
39,0.042816,BCUTc-1l
111,0.031323,nHBint3
529,0.027837,maxsOH
471,0.023946,maxHBint6
66,0.023711,SCH-6
585,0.021712,hmin
292,0.019919,SdO
230,0.018329,SHBint3
349,0.017277,minHBint3


In [18]:
rfc_fea.head(20)

Unnamed: 0,importance,var
39,0.049053,BCUTc-1l
529,0.024372,maxsOH
476,0.0175,maxHsOH
238,0.017067,SHsOH
119,0.016356,nHsOH
673,0.016349,MLFER_A
291,0.015166,SsOH
357,0.013258,minHsOH
410,0.012845,minsOH
393,0.008878,minaasC
