In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!conda install -y -c rdkit rdkit;

In [None]:
!pip install git+https://github.com/samoturk/mol2vec

In [None]:
import itertools
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.Chem.Fragments as f
import rdkit.Chem.rdMolDescriptors as d
from rdkit.Chem import Lipinski

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV

from gensim.models import word2vec
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from sklearn.preprocessing import MinMaxScaler

In [None]:
def load_data(path):
    """Read a CSV into a DataFrame whose row index is the file's ``INDEX`` column.

    Parameters
    ----------
    path : str or file-like
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, indexed by ``INDEX``.
    """
    frame = pd.read_csv(filepath_or_buffer=path, index_col=["INDEX"])
    return frame


def get_smile(dataframe):
    """Swap the ``SMILES`` text column for a column of parsed RDKit molecules.

    The input frame is left untouched.  In the returned copy, ``rd_form`` holds
    ``Chem.MolFromSmiles(smiles)`` for every row and ``SMILES`` has been removed.
    """
    parsed = dataframe.copy()
    parsed["rd_form"] = parsed["SMILES"].apply(lambda smiles: Chem.MolFromSmiles(smiles))
    return parsed.drop(columns=["SMILES"])


# Feature builder used for the model comparison: descriptors + both fingerprints + scaled mol2vec.
def get_feature(dataframe):
    """Append descriptor, fingerprint and min-max-scaled mol2vec columns to a molecule table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``rd_form`` column of RDKit molecules (the column that
        ``get_smile`` creates).  The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input followed by, in this order:
        ``num_atom``, ``mol_dr``, ``COO``, ``OH``, ``ArN``, ``halogen``,
        ``aliphatic_ring_count``, ``aromatic_ring_count``;
        ``fcpc0..fcpc123`` (124-bit Morgan fingerprint, radius 2, ``useFeatures=True``);
        ``ecpc0..ecpc123`` (124-bit Morgan fingerprint, radius 2);
        ``m2v0..m2v299`` (mol2vec embedding, min-max scaled per column).

    Notes
    -----
    * The embedding is computed once.  The previous version computed it twice and
      overwrote the first (unscaled) result with the scaled one.
    * Feature blocks are assembled as whole frames and concatenated, rather than
      creating integer ``0`` placeholder columns and overwriting them through
      ``.loc``.  That pattern writes floats into int columns and fragments the frame.
    * The ``MinMaxScaler`` is fitted on every row passed in, so any train/validation
      split made afterwards shares its scaling statistics.
    """
    data = dataframe.copy()

    mol_model = word2vec.Word2Vec.load('../input/assignment4datas/model_300dim.pkl')

    mols = data["rd_form"]
    row_index = data.index
    n_rows = len(data)

    # Scalar descriptors, one column each.
    data["num_atom"] = mols.apply(lambda x: x.GetNumAtoms())
    data["mol_dr"] = mols.apply(lambda x: d.CalcExactMolWt(x))
    data["COO"] = mols.apply(lambda x: f.fr_Al_COO(x))
    data["OH"] = mols.apply(lambda x: f.fr_Al_OH(x))
    data["ArN"] = mols.apply(lambda x: f.fr_ArN(x))
    data["halogen"] = mols.apply(lambda x: f.fr_halogen(x))
    data["aliphatic_ring_count"] = mols.apply(lambda x: Lipinski.NumAliphaticRings(x))
    data["aromatic_ring_count"] = mols.apply(lambda x: Lipinski.NumAromaticRings(x))

    def _fingerprint_frame(prefix, **fingerprint_kwargs):
        # One row per molecule, one column per fingerprint bit.
        names = [prefix + str(bit) for bit in range(124)]
        bits = np.array([
            np.array(AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=124, **fingerprint_kwargs))
            for x in mols
        ]).reshape(n_rows, 124)
        return pd.DataFrame(bits, columns=names, index=row_index)

    # fingerprint
    fcfp_frame = _fingerprint_frame("fcpc", useFeatures=True)
    ecfp_frame = _fingerprint_frame("ecpc")

    # mol2vec: embed each molecule's substructure "sentence", then min-max scale.
    sentences = mols.apply(lambda x: mol2alt_sentence(x, radius=2))
    m2v_vectors = np.array(
        [DfVec(x).vec for x in sentences2vec(sentences, mol_model, unseen='UNK')]
    ).reshape(n_rows, 300)
    m2v_scaled = MinMaxScaler().fit_transform(m2v_vectors)
    m2v_frame = pd.DataFrame(
        m2v_scaled,
        columns=["m2v" + str(dim) for dim in range(300)],
        index=row_index,
    )

    feature_frames = [fcfp_frame, ecfp_frame, m2v_frame]
    generated = [name for frame in feature_frames for name in frame.columns]
    # Drop same-named columns first so a second call replaces them instead of duplicating.
    data = data.drop(columns=generated, errors="ignore")
    return pd.concat([data] + feature_frames, axis=1)


def cal_auc(prob, labels):
    """Compute ROC AUC with the rank-sum (Mann-Whitney U) formula.

    Parameters
    ----------
    prob : array-like of float
        Predicted score for the positive class, one per sample.
    labels : array-like
        Ground-truth labels.  Entries equal to ``1`` are positives; every other
        value counts as a negative.

    Returns
    -------
    float
        ``(sum of positive ranks - P * (P + 1) / 2) / (P * N)``, where ``P`` and
        ``N`` are the numbers of positive and negative samples.

    Raises
    ------
    ValueError
        If ``prob`` and ``labels`` have different lengths.
    ZeroDivisionError
        If there are no positives or no negatives (the AUC is undefined).

    Notes
    -----
    Tied scores receive the average of the ranks they span.  The previous
    implementation ranked ties by input order, so the result changed when rows
    were reordered.
    """
    scores = np.asarray(prob, dtype=float).ravel()
    truth = np.asarray(labels).ravel()
    if scores.size != truth.size:
        raise ValueError("prob and labels must have the same length")

    is_positive = truth == 1
    pos_num = int(is_positive.sum())
    neg_num = int(truth.size - pos_num)
    if pos_num == 0 or neg_num == 0:
        raise ZeroDivisionError(
            "AUC is undefined without at least one positive and one negative label"
        )

    # Average 1-based rank per distinct score: a tie group ending at rank `last`
    # and holding `count` items spans ranks last-count+1 .. last.
    _, inverse, counts = np.unique(scores, return_inverse=True, return_counts=True)
    last_rank = np.cumsum(counts)
    mean_rank = last_rank - (counts - 1) / 2.0
    ranks = mean_rank[inverse.ravel()]

    positive_rank_sum = float(ranks[is_positive].sum())
    return (positive_rank_sum - (pos_num * (pos_num + 1)) / 2) / (pos_num * neg_num)

# impute, normalize, discretize and train-test split the data
def split(train):
    """Create an 80/20 hold-out split and preprocess both parts.

    ``rd_form`` is dropped, ``ACTIVE`` becomes the target and the remaining columns
    are the features.  The split uses ``test_size=0.2`` and ``random_state=1``.
    Every transformer is fitted on the training part only and then applied to the
    validation part:

    1. mean imputation of all feature columns;
    2. min-max scaling of the feature columns at positions 1 and 2;
    3. 10-bin ordinal discretization of those same two columns.

    Returns
    -------
    tuple
        ``(x_train, x_val, y_train, y_val)``; the two feature frames keep the
        original column names and row indices.
    """
    labelled = train.drop(["rd_form"], axis=1)
    target = labelled["ACTIVE"]
    features = labelled.drop(["ACTIVE"], axis=1)
    x_train, x_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=1)

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    scaler = MinMaxScaler()
    binner = KBinsDiscretizer(n_bins=10, encode="ordinal")
    binned_positions = [1, 2]

    def _scale_then_bin(values, fit):
        # Scale first, then discretize, writing back into the same two columns each time.
        for step in (scaler, binner):
            run = step.fit_transform if fit else step.transform
            values[:, binned_positions] = run(values[:, binned_positions])
        return values

    def _as_frame(values, like):
        return pd.DataFrame(values, columns=features.columns, index=like.index)

    train_values = _scale_then_bin(imputer.fit_transform(x_train), fit=True)
    val_values = _scale_then_bin(imputer.transform(x_val), fit=False)
    return _as_frame(train_values, x_train), _as_frame(val_values, x_val), y_train, y_val

def get_train_labels(train):
    """Preprocess the whole table (no hold-out split) and return features and target.

    ``rd_form`` is dropped and ``ACTIVE`` is the target.  All remaining columns are
    mean-imputed; the feature columns at positions 1 and 2 are then min-max scaled
    and discretized into 10 ordinal bins.  Every transformer is fitted on all rows.

    Returns
    -------
    tuple
        ``(x_train, y_train)``; ``x_train`` keeps the original column names and index.
    """
    labelled = train.drop(["rd_form"], axis=1)
    target = labelled["ACTIVE"]
    features = labelled.drop(["ACTIVE"], axis=1)

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    column_steps = (MinMaxScaler(), KBinsDiscretizer(n_bins=10, encode="ordinal"))

    values = imputer.fit_transform(features)
    for step in column_steps:
        # Scale first, then bin, each time writing back into the same two columns.
        values[:, [1, 2]] = step.fit_transform(values[:, [1, 2]])

    prepared = pd.DataFrame(values, columns=features.columns, index=features.index)
    return prepared, target
    

In [None]:
def get_all_feature(dataframe):
    """Append descriptor, FCFP, ECFP and (unscaled) mol2vec columns to a molecule table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``rd_form`` column of RDKit molecules.  Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input followed by the eight scalar descriptor columns,
        ``fcpc0..fcpc123`` (124-bit Morgan fingerprint, radius 2, ``useFeatures=True``),
        ``ecpc0..ecpc123`` (124-bit Morgan fingerprint, radius 2) and
        ``m2v0..m2v299`` (raw mol2vec embedding).

    Notes
    -----
    Feature blocks are built as whole frames and concatenated.  The previous
    version created integer ``0`` placeholder columns and overwrote them through
    ``.loc``, which writes float embeddings into int columns and fragments the
    frame with hundreds of single-column inserts.
    """
    data = dataframe.copy()

    mol_model = word2vec.Word2Vec.load('/kaggle/input/assignment4datas/model_300dim.pkl')

    mols = data["rd_form"]
    row_index = data.index
    n_rows = len(data)

    # Scalar descriptors, one column each.
    data["num_atom"] = mols.apply(lambda x: x.GetNumAtoms())
    data["mol_dr"] = mols.apply(lambda x: d.CalcExactMolWt(x))
    data["COO"] = mols.apply(lambda x: f.fr_Al_COO(x))
    data["OH"] = mols.apply(lambda x: f.fr_Al_OH(x))
    data["ArN"] = mols.apply(lambda x: f.fr_ArN(x))
    data["halogen"] = mols.apply(lambda x: f.fr_halogen(x))
    data["aliphatic_ring_count"] = mols.apply(lambda x: Lipinski.NumAliphaticRings(x))
    data["aromatic_ring_count"] = mols.apply(lambda x: Lipinski.NumAromaticRings(x))

    def _fingerprint_frame(prefix, **fingerprint_kwargs):
        # One row per molecule, one column per fingerprint bit.
        names = [prefix + str(bit) for bit in range(124)]
        bits = np.array([
            np.array(AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=124, **fingerprint_kwargs))
            for x in mols
        ]).reshape(n_rows, 124)
        return pd.DataFrame(bits, columns=names, index=row_index)

    # fingerprint
    fcfp_frame = _fingerprint_frame("fcpc", useFeatures=True)
    ecfp_frame = _fingerprint_frame("ecpc")

    # mol2vec
    sentences = mols.apply(lambda x: mol2alt_sentence(x, radius=2))
    m2v_vectors = np.array(
        [DfVec(x).vec for x in sentences2vec(sentences, mol_model, unseen='UNK')]
    ).reshape(n_rows, 300)
    m2v_frame = pd.DataFrame(
        m2v_vectors,
        columns=["m2v" + str(dim) for dim in range(300)],
        index=row_index,
    )

    feature_frames = [fcfp_frame, ecfp_frame, m2v_frame]
    generated = [name for frame in feature_frames for name in frame.columns]
    # Drop same-named columns first so a second call replaces them instead of duplicating.
    data = data.drop(columns=generated, errors="ignore")
    return pd.concat([data] + feature_frames, axis=1)

def get_ecfp_feature(dataframe):
    """Append the scalar descriptors and the ECFP-style fingerprint to a molecule table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``rd_form`` column of RDKit molecules.  Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input followed by the eight scalar descriptor columns and
        ``ecpc0..ecpc123`` (124-bit Morgan fingerprint, radius 2).

    Notes
    -----
    The word2vec model is no longer loaded here: this function never used it, so
    the load only cost time and made the function fail when the file was absent.
    The fingerprint block is built as one frame and concatenated instead of
    inserting 124 placeholder columns one at a time.
    """
    data = dataframe.copy()
    mols = data["rd_form"]

    # Scalar descriptors, one column each.
    data["num_atom"] = mols.apply(lambda x: x.GetNumAtoms())
    data["mol_dr"] = mols.apply(lambda x: d.CalcExactMolWt(x))
    data["COO"] = mols.apply(lambda x: f.fr_Al_COO(x))
    data["OH"] = mols.apply(lambda x: f.fr_Al_OH(x))
    data["ArN"] = mols.apply(lambda x: f.fr_ArN(x))
    data["halogen"] = mols.apply(lambda x: f.fr_halogen(x))
    data["aliphatic_ring_count"] = mols.apply(lambda x: Lipinski.NumAliphaticRings(x))
    data["aromatic_ring_count"] = mols.apply(lambda x: Lipinski.NumAromaticRings(x))

    # One row per molecule, one column per fingerprint bit.
    ecfp_list = ["ecpc" + str(bit) for bit in range(124)]
    ecfp_bits = np.array([
        np.array(AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=124)) for x in mols
    ]).reshape(len(data), 124)
    ecfp_frame = pd.DataFrame(ecfp_bits, columns=ecfp_list, index=data.index)

    # Drop same-named columns first so a second call replaces them instead of duplicating.
    data = data.drop(columns=ecfp_list, errors="ignore")
    return pd.concat([data, ecfp_frame], axis=1)

def get_fcfp_feature(dataframe):
    """Append the scalar descriptors and the FCFP-style fingerprint to a molecule table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``rd_form`` column of RDKit molecules.  Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input followed by the eight scalar descriptor columns and
        ``fcpc0..fcpc123`` (124-bit Morgan fingerprint, radius 2, ``useFeatures=True``).

    Notes
    -----
    The word2vec model is no longer loaded here: this function never used it, so
    the load only cost time and made the function fail when the file was absent.
    The fingerprint block is built as one frame and concatenated instead of
    inserting 124 placeholder columns one at a time.
    """
    data = dataframe.copy()
    mols = data["rd_form"]

    # Scalar descriptors, one column each.
    data["num_atom"] = mols.apply(lambda x: x.GetNumAtoms())
    data["mol_dr"] = mols.apply(lambda x: d.CalcExactMolWt(x))
    data["COO"] = mols.apply(lambda x: f.fr_Al_COO(x))
    data["OH"] = mols.apply(lambda x: f.fr_Al_OH(x))
    data["ArN"] = mols.apply(lambda x: f.fr_ArN(x))
    data["halogen"] = mols.apply(lambda x: f.fr_halogen(x))
    data["aliphatic_ring_count"] = mols.apply(lambda x: Lipinski.NumAliphaticRings(x))
    data["aromatic_ring_count"] = mols.apply(lambda x: Lipinski.NumAromaticRings(x))

    # fingerprint: one row per molecule, one column per bit.
    fcfp_list = ["fcpc" + str(bit) for bit in range(124)]
    fcfp_bits = np.array([
        np.array(AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=124, useFeatures=True))
        for x in mols
    ]).reshape(len(data), 124)
    fcfp_frame = pd.DataFrame(fcfp_bits, columns=fcfp_list, index=data.index)

    # Drop same-named columns first so a second call replaces them instead of duplicating.
    data = data.drop(columns=fcfp_list, errors="ignore")
    return pd.concat([data, fcfp_frame], axis=1)



In [None]:
def binary_bayes(x_train, x_val,  y_train, y_val):
    """Fit a default ``BernoulliNB`` and score it on the validation split.

    Returns
    -------
    tuple
        ``(model, auc, prediction)``: the fitted classifier, ``cal_auc`` of the
        second ``predict_proba`` column against ``y_val``, and the full
        ``predict_proba`` output for ``x_val``.
    """
    classifier = BernoulliNB()
    classifier.fit(x_train, y_train)
    val_proba = classifier.predict_proba(x_val)
    positive_scores = val_proba[:, 1]
    val_auc = cal_auc(positive_scores, np.array(y_val))
    return classifier, val_auc, val_proba


def guassian_bayes(x_train, x_val,  y_train, y_val):
    """Fit a default ``GaussianNB`` and score it on the validation split.

    Returns
    -------
    tuple
        ``(model, auc, prediction)``: the fitted classifier, ``cal_auc`` of the
        second ``predict_proba`` column against ``y_val``, and the full
        ``predict_proba`` output for ``x_val``.
    """
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    val_proba = classifier.predict_proba(x_val)
    val_labels = np.array(y_val)
    return classifier, cal_auc(val_proba[:, 1], val_labels), val_proba


def multi_bayes(x_train, x_val,  y_train, y_val):
    """Fit a default ``MultinomialNB`` and score it on the validation split.

    Returns
    -------
    tuple
        ``(model, auc, prediction)``: the fitted classifier, ``cal_auc`` of the
        second ``predict_proba`` column against ``y_val``, and the full
        ``predict_proba`` output for ``x_val``.
    """
    estimator = MultinomialNB()
    estimator.fit(x_train, y_train)
    probabilities = estimator.predict_proba(x_val)
    score = cal_auc(probabilities[:, 1], np.array(y_val))
    return estimator, score, probabilities


def decision_tree(x_train, x_val,  y_train, y_val):
    """Fit a default ``DecisionTreeClassifier`` and score it on the validation split.

    Returns
    -------
    tuple
        ``(model, auc, prediction)``: the fitted tree, ``cal_auc`` of the second
        ``predict_proba`` column against ``y_val``, and the full ``predict_proba``
        output for ``x_val``.
    """
    tree = DecisionTreeClassifier()
    tree.fit(x_train, y_train)
    val_proba = tree.predict_proba(x_val)
    val_auc = cal_auc(val_proba[:, 1], np.array(y_val))
    return tree, val_auc, val_proba


def mlp(x_train, x_val,  y_train, y_val):
    """Fit a default ``MLPClassifier`` and score it on the validation split.

    Returns
    -------
    tuple
        ``(model, auc, prediction)``: the fitted network, ``cal_auc`` of the second
        ``predict_proba`` column against ``y_val``, and the full ``predict_proba``
        output for ``x_val``.
    """
    network = MLPClassifier()
    network.fit(x_train, y_train)
    val_proba = network.predict_proba(x_val)
    positive_scores, val_labels = val_proba[:, 1], np.array(y_val)
    return network, cal_auc(positive_scores, val_labels), val_proba


def random_forest(x_train, x_val,  y_train, y_val):
    """Fit the notebook's random forest and score it on the validation split.

    The forest uses ``max_depth=30``, ``n_estimators=400`` and
    ``class_weight='balanced_subsample'``.  Unlike the sibling helpers, the AUC
    comes from ``sklearn.metrics.roc_auc_score`` rather than ``cal_auc``.

    Returns
    -------
    tuple
        ``(model, auc, prediction)``: the fitted forest, the ROC AUC of the second
        ``predict_proba`` column against ``y_val``, and the full ``predict_proba``
        output for ``x_val``.
    """
    from sklearn.metrics import roc_auc_score

    forest = RandomForestClassifier(
        max_depth=30,
        n_estimators=400,
        class_weight='balanced_subsample',
    )
    forest.fit(x_train, y_train)
    val_proba = forest.predict_proba(x_val)

    true_labels = np.array(y_val)
    positive_scores = np.array(val_proba[:, 1])
    val_auc = roc_auc_score(true_labels, positive_scores)
    return forest, val_auc, val_proba


def light_boost(x_train, x_val,  y_train, y_val):
    """Fit a default ``lgb.LGBMClassifier`` and score it on the validation split.

    Returns
    -------
    tuple
        ``(model, auc, prediction)``: the fitted booster, ``cal_auc`` of the second
        ``predict_proba`` column against ``y_val``, and the full ``predict_proba``
        output for ``x_val``.
    """
    booster = lgb.LGBMClassifier()
    booster.fit(x_train, y_train)
    val_proba = booster.predict_proba(x_val)
    val_auc = cal_auc(val_proba[:, 1], np.array(y_val))
    return booster, val_auc, val_proba


def extreme_boost(x_train, x_val,  y_train, y_val):
    """Fit a default ``xgb.XGBClassifier`` and score it on the validation split.

    Returns
    -------
    tuple
        ``(model, auc, prediction)``: the fitted booster, ``cal_auc`` of the second
        ``predict_proba`` column against ``y_val``, and the full ``predict_proba``
        output for ``x_val``.
    """
    booster = xgb.XGBClassifier()
    booster.fit(x_train, y_train)
    val_proba = booster.predict_proba(x_val)
    val_labels = np.array(y_val)
    val_auc = cal_auc(val_proba[:, 1], val_labels)
    return booster, val_auc, val_proba

In [None]:
# Paths of the training and test CSV files inside the attached Kaggle dataset.
train_path = "/kaggle/input/assignment4datas/training_smiles.csv"
test_path = "/kaggle/input/assignment4datas/test_smiles.csv"
pre_train = load_data(train_path)
pre_test = load_data(test_path)
# data = get_smile(pre_train).sample(n=10, random_state=2)
# Parse every SMILES string into an RDKit molecule (stored in the `rd_form` column).
data = get_smile(pre_train)

# Append descriptor, fingerprint and scaled mol2vec feature columns.
train = get_feature(data)

# train = get_feature(get_smile(pre_train))
# test = get_feature(get_smile(pre_test))
### Split
# 80/20 hold-out split with imputation, scaling and binning (see `split`).
x_train, x_val, y_train, y_val = split(train)
# Fit LightGBM, XGBoost and the random forest, keeping each model's hold-out AUC.
lb, lb_auc, lb_prediction = light_boost(x_train,x_val,y_train, y_val)
# NOTE(review): `en_prediction` looks like a typo for `eb_prediction`; left as is because it is a notebook-level name.
eb, eb_auc, en_prediction = extreme_boost(x_train,x_val,y_train, y_val)
rf, rf_auc, rf_prediction = random_forest(x_train,x_val,y_train, y_val)
print(lb_auc)
print(eb_auc)
print(rf_auc)

In [None]:
# NOTE(review): this cell repeats the SMILES parsing, feature extraction, split and
# baseline model fits of the previous cell before training the tuned LightGBM model.
data = get_smile(pre_train)
train = get_feature(data)
# train = get_feature(get_smile(pre_train))
# test = get_feature(get_smile(pre_test))
### Split
x_train, x_val, y_train, y_val = split(train)
# Hand-picked LightGBM hyperparameters: one scalar value per parameter.
lb_params = {
    'learning_rate': 0.2,
    'num_leaves': 60,
    'n_estimators': 250,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'subsample': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0,
}
# Baselines with default settings, for comparison with the tuned model below.
lb, lb_auc, lb_prediction = light_boost(x_train,x_val,y_train, y_val)
rf, rf_auc, rf_prediction = random_forest(x_train,x_val,y_train, y_val)
# NOTE(review): `en_prediction` looks like a typo for `eb_prediction`; left as is because it is a notebook-level name.
eb, eb_auc, en_prediction = extreme_boost(x_train,x_val,y_train, y_val)

# LightGBM with the hyperparameters above, scored on the same validation split.
update_model = lgb.LGBMClassifier(**lb_params)
update_model.fit(x_train, y_train)
update_prediction = update_model.predict_proba(x_val)
update_auc = cal_auc(update_prediction[:, 1], np.array(y_val))
print(lb_auc)
print(rf_auc)
print(eb_auc)
print(update_auc)

In [None]:
# Five contiguous (unshuffled) folds.  `random_state` is omitted on purpose: it has no
# effect when shuffle=False, and current scikit-learn raises ValueError for that
# combination.  Use shuffle=True together with random_state=42 if shuffled folds are wanted.
kf = KFold(n_splits=5, shuffle=False)
# lb_params = {
#     'learning_rate': [0.005],
#     'num_leaves': [30, 40, 50, 60],
#     'n_estimators': [75, 150, 225],
#     'boosting_type': ['gbdt'],
#     'objective': ['binary'],
#     'subsample': [0.7, 0.75, 0.8],
#     'reg_alpha': [0, 0.1, 0.2],
#     'reg_lambda': [0, 0.1, 0.2],
# }

# xgb_params = {
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [300, 400],
#     'gamma': [0],
#     'max_depth': [24, 36],
#     'subsample': [0.8],
#     'min_child_weight': [1],
# }

In [None]:
# Load three feature tables from the attached dataset, presumably one per feature
# family (FCFP fingerprint, mol2vec embedding, ECFP fingerprint) — verify against the files.
# NOTE(review): the first file name contains a space before ".csv"; confirm it matches the dataset.
fcfp_data = load_data("/kaggle/input/assignment4datas/train_fcfp .csv")
m2v_data = load_data("/kaggle/input/assignment4datas/train_m2v.csv")
ecfp_data = load_data("/kaggle/input/assignment4datas/train_ecfp.csv")

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
def kFoldAUC(model, X_train, y_train):
    """Print and return the 5-fold cross-validated ROC AUC of ``model``.

    Parameters
    ----------
    model : estimator or None
        Estimator to evaluate; ``cross_val_score`` works on clones, so an already
        fitted estimator is fine.  ``None`` falls back to the notebook's random
        forest (``class_weight='balanced_subsample'``, ``max_depth=30``,
        ``n_estimators=400``).  The previous version ignored this argument and
        always evaluated that forest.
    X_train, y_train
        Features and target, forwarded to ``cross_val_score``.

    Returns
    -------
    array-like
        The per-fold ROC AUC scores (also printed, as before).
    """
    # Contiguous folds.  random_state is omitted: it has no effect without
    # shuffling, and current scikit-learn raises ValueError for that combination.
    kf = KFold(n_splits=5, shuffle=False)
    if model is None:
        model = RandomForestClassifier(class_weight='balanced_subsample', max_depth=30,
                                       n_estimators=400)
    scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='roc_auc')
    print(scores)
    return scores

In [None]:
# single fingerprint table (fcfp or ecfp)
def rf_auc_1(dataframe):
    """Evaluate the random forest on one feature table, by hold-out and by 5-fold CV.

    The table is copied, split with ``split`` and scored with ``random_forest``.
    The whole table is then preprocessed with ``get_train_labels`` and handed to
    ``kFoldAUC`` together with the fitted forest.

    Returns
    -------
    tuple
        ``(rf, rf_auc, rf_prediction)`` from the hold-out run.
    """
    working = dataframe.copy()

    holdout_parts = split(working)
    forest, holdout_auc, holdout_proba = random_forest(*holdout_parts)

    print("kFold predict ...")
    full_x, full_y = get_train_labels(working)
    kFoldAUC(forest, full_x, full_y)

    return forest, holdout_auc, holdout_proba

# mol2vec table
def rf_auc_m2v(dataframe):
    """Hold-out evaluation of the random forest on a table holding ``m2v0..m2v299``.

    After ``split``, the 300 mol2vec columns are min-max scaled.  The scaler is
    fitted on the training part and reused unchanged for the validation part.

    Returns
    -------
    tuple
        ``(rf, rf_auc, rf_prediction)`` as returned by ``random_forest``.
    """
    x_train, x_val, y_train, y_val = split(dataframe.copy())

    m2v_cols = ["m2v"+str(i) for i in range(300)]
    scaler = MinMaxScaler()

    # normalize the m2v columns
    x_train.loc[:, m2v_cols] = scaler.fit_transform(x_train.loc[:, m2v_cols])
    x_val.loc[:, m2v_cols] = scaler.transform(x_val.loc[:, m2v_cols])

    forest, holdout_auc, holdout_proba = random_forest(x_train,x_val,y_train, y_val)
    return forest, holdout_auc, holdout_proba

# fcfp + ecfp
def rf_auc_2(dataframe1, dataframe2):
    """Hold-out evaluation of the random forest on FCFP plus ECFP features.

    The ``ecpc0..ecpc123`` columns of ``dataframe2`` are copied by position
    (row order, not index alignment) into a copy of ``dataframe1``.  The combined
    table then goes through ``split`` and ``random_forest``.

    Returns
    -------
    tuple
        ``(rf, rf_auc, rf_prediction)`` as returned by ``random_forest``.
    """
    ecfp_cols = ["ecpc"+str(i) for i in range(124)]

    combined = dataframe1.copy()
    combined[ecfp_cols] = np.array(dataframe2.loc[:, ecfp_cols])

    forest, holdout_auc, holdout_proba = random_forest(*split(combined))
    return forest, holdout_auc, holdout_proba

# fcfp + ecfp + m2v
def rf_auc_3(dataframe1, dataframe2, dataframe3):
    """Hold-out evaluation of the random forest on FCFP, ECFP and mol2vec features.

    ``fcpc0..fcpc123`` from ``dataframe1`` and ``ecpc0..ecpc123`` from ``dataframe2``
    are copied by position (row order, not index alignment) into a copy of
    ``dataframe3``.  After ``split``, the ``m2v0..m2v299`` columns are min-max scaled
    with a scaler fitted on the training part only.

    Returns
    -------
    tuple
        ``(rf, rf_auc, rf_prediction)`` as returned by ``random_forest``.
    """
    combined = dataframe3.copy()
    for prefix, source in (("fcpc", dataframe1), ("ecpc", dataframe2)):
        names = [prefix+str(i) for i in range(124)]
        combined[names] = np.array(source.loc[:, names])

    scaler = MinMaxScaler()
    x_train, x_val, y_train, y_val = split(combined)

    # normalize the m2v columns
    m2v_cols = ["m2v"+str(i) for i in range(300)]
    x_train.loc[:, m2v_cols] = scaler.fit_transform(x_train.loc[:, m2v_cols])
    x_val.loc[:, m2v_cols] = scaler.transform(x_val.loc[:, m2v_cols])

    forest, holdout_auc, holdout_proba = random_forest(x_train, x_val, y_train, y_val)
    return forest, holdout_auc, holdout_proba


def rf_auc_final(dataframe1, dataframe2, dataframe3):
    """Run 5-fold CV of the random forest on the combined FCFP + ECFP + mol2vec table.

    Parameters
    ----------
    dataframe1 : pandas.DataFrame
        Provides ``fcpc0..fcpc123``.
    dataframe2 : pandas.DataFrame
        Provides ``ecpc0..ecpc123``.
    dataframe3 : pandas.DataFrame
        Base table; the fingerprint columns are copied into a copy of it by
        position (row order, not index alignment).  None of the inputs is modified.

    Returns
    -------
    Whatever ``kFoldAUC`` returns.

    Notes
    -----
    The estimator is created here with the same hyperparameters as
    ``random_forest``.  The previous version passed a notebook-global ``rf``,
    so the function failed with ``NameError`` unless an earlier cell had
    already defined that variable.
    """
    data3 = dataframe3.copy()
    fcfp_list = ["fcpc"+str(i) for i in range(124)]
    ecfp_list = ["ecpc"+str(i) for i in range(124)]
    data3[fcfp_list] = np.array(dataframe1.loc[:, fcfp_list])
    data3[ecfp_list] = np.array(dataframe2.loc[:, ecfp_list])

    print("get x_train and y_train ...")
    x_train, y_train = get_train_labels_final(data3)

    print("kFold predict ...")
    forest = RandomForestClassifier(class_weight='balanced_subsample', max_depth=30,
                                    n_estimators=400)
    return kFoldAUC(forest, x_train, y_train)

def get_train_labels_final(train):
    """Preprocess the combined feature table (no hold-out split) for cross-validation.

    Steps, each fitted on all rows:

    1. drop ``rd_form``; ``ACTIVE`` becomes the target;
    2. mean-impute every feature column;
    3. min-max scale, then discretize into 10 ordinal bins, the feature columns
       at positions 1 and 2 (as ``get_train_labels`` does);
    4. min-max scale ``m2v0..m2v299`` (as ``rf_auc_3`` does after its split).

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``rd_form``, ``ACTIVE`` and ``m2v0..m2v299``.

    Returns
    -------
    tuple
        ``(x_train, y_train)``; ``x_train`` keeps the original column names and index.

    Raises
    ------
    KeyError
        If an ``m2v`` column is missing.

    Notes
    -----
    The previous version indexed the imputed NumPy array with a list mixing the
    ints ``1, 2`` and the ``m2v`` column names, and assigned a 2-column result to
    302 columns, so it always raised.  The names are now resolved to positions.
    NOTE(review): the m2v columns are scaled but not binned, for consistency with
    ``rf_auc_3`` — confirm this is the intended preprocessing.
    """
    train = train.drop(["rd_form"], axis=1)
    y_train = train["ACTIVE"]
    x_train = train.drop(["ACTIVE"], axis=1)
    x_train_columns = x_train.columns
    train_index = x_train.index

    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    scaler = MinMaxScaler()
    m2v_scaler = MinMaxScaler()
    kbd = KBinsDiscretizer(n_bins=10, encode="ordinal")

    # The imputer returns a bare array, so resolve the m2v names to positions first.
    m2v_positions = [x_train_columns.get_loc("m2v"+str(i)) for i in range(300)]

    x_train = imp_mean.fit_transform(x_train)
    x_train[:, [1, 2]] = scaler.fit_transform(x_train[:, [1, 2]])
    x_train[:, [1, 2]] = kbd.fit_transform(x_train[:, [1, 2]])
    x_train[:, m2v_positions] = m2v_scaler.fit_transform(x_train[:, m2v_positions])
    x_train = pd.DataFrame(x_train, columns=x_train_columns, index=train_index)

    return x_train, y_train

In [None]:
# Cross-validate the random forest on the combined fcfp + ecfp + m2v tables.
rf_auc_final(fcfp_data,ecfp_data, m2v_data)

In [None]:
# Random forest on the fcfp table alone (presumably the Morgan fingerprint built
# with useFeatures=True — verify against how the CSV was produced).
rf, rf_auc, rf_prediction = rf_auc_1(fcfp_data)
print(rf_auc)

In [None]:
# Random forest on the ecfp table alone (presumably the Morgan fingerprint built
# without useFeatures — verify against how the CSV was produced).
rf, rf_auc, rf_prediction = rf_auc_1(ecfp_data)
print(rf_auc)

In [None]:
# Random forest on the mol2vec table alone; prints the hold-out AUC.
rf, rf_auc, rf_prediction = rf_auc_m2v(m2v_data)
print(rf_auc)

In [None]:
# Random forest on the fcfp and ecfp tables combined; prints the hold-out AUC.
rf, rf_auc, rf_prediction = rf_auc_2(fcfp_data, ecfp_data)
print(rf_auc)

In [None]:
# Random forest on the fcfp, ecfp and mol2vec tables combined; prints the hold-out AUC.
rf, rf_auc, rf_prediction = rf_auc_3(fcfp_data, ecfp_data, m2v_data)
print(rf_auc)

In [None]:
# Five contiguous (unshuffled) folds.  random_state is omitted: it has no effect when
# shuffle=False, and current scikit-learn raises ValueError for that combination.
kf = KFold(n_splits=5, shuffle=False)

# GridSearchCV needs a sequence of candidates for every parameter.  `lb_params` may hold
# bare scalars (a single configuration), so wrap those in one-element lists.
param_grid = {
    name: value if isinstance(value, (list, tuple, np.ndarray)) else [value]
    for name, value in lb_params.items()
}
grid_search = GridSearchCV(lb, param_grid=param_grid, cv=kf, scoring='roc_auc')
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

# Refit LightGBM with the winning configuration and score it on the hold-out split.
update_model = lgb.LGBMClassifier(**best_params)
update_model.fit(x_train, y_train)
update_prediction = update_model.predict_proba(x_val)
update_auc = cal_auc(update_prediction[:, 1], np.array(y_val))