In [89]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold, RFE, RFECV, f_regression, mutual_info_regression, SelectPercentile, SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression as LR
from time import time
import global_config as cfg
import pandas as pd
import numpy as np

In [90]:
def get_X_Y(csvfile):
    """Load a descriptor table and split it into scaled features and targets.

    The table must contain the columns ``SMILES``, ``Blood``, ``Brain`` and
    ``Ratio``; every other column is treated as a numeric feature.

    Parameters
    ----------
    csvfile : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    X : array
        Result of ``MinMaxScaler().fit_transform`` on the feature columns
        (cast to float64 first).
    blood_y, brain_y, ratio_y : numpy.ndarray
        1-D arrays holding the ``Blood``, ``Brain`` and ``Ratio`` columns.
    SMILES : pandas.Series
        The ``SMILES`` column, unchanged.

    Raises
    ------
    KeyError
        If one of the four required columns is absent. This includes the case
        where it was removed by the missing-value filter below.
    """
    df = pd.read_csv(csvfile)
    print("Initial: ", df.shape)

    # "#NAME?" cells (presumably spreadsheet error markers - TODO confirm) and
    # infinities are treated as missing values.
    df = df.replace(["#NAME?", np.inf, -np.inf], np.nan)
    # Drop every COLUMN that has at least one missing value; rows are kept.
    df = df.dropna(axis=1)
    print("DropNA: ", df.shape)

    X = df.drop(['SMILES', 'Blood', 'Brain', 'Ratio'], axis=1).astype('float64')
    X = MinMaxScaler().fit_transform(X)

    # Series.ravel() is deprecated since pandas 2.2; to_numpy() yields the same
    # 1-D ndarray for these numeric columns.
    blood_y = df['Blood'].to_numpy()
    brain_y = df['Brain'].to_numpy()
    ratio_y = df['Ratio'].to_numpy()
    SMILES = df['SMILES']
    return X, blood_y, brain_y, ratio_y, SMILES

In [91]:
def get_VT(thres=0.02):
    """Return an unfitted ``VarianceThreshold`` selector.

    ``thres`` is the variance threshold handed to the selector. For a 0/1
    feature the variance is p * (1 - p), so the default of 0.02 roughly
    corresponds to discarding features that take the same value in more
    than 98% of the samples.
    """
    return VarianceThreshold(threshold=thres)

def get_RFE(n):
    """Return an unfitted recursive-feature-elimination selector.

    The ranking estimator is a ``RandomForestRegressor`` with default
    settings; ``n`` is forwarded as ``n_features_to_select``.
    """
    from sklearn.ensemble import RandomForestRegressor

    return RFE(estimator=RandomForestRegressor(), n_features_to_select=n)

def get_UFE(percentile=80):
    """Return an unfitted univariate selector scored by mutual information.

    ``percentile`` is forwarded to ``SelectPercentile``; the scoring function
    is ``mutual_info_regression``.
    """
    return SelectPercentile(mutual_info_regression, percentile=percentile)


def tree_based_selection(X, y):
    """Reduce ``X`` using the importances of a fitted extra-trees regressor.

    An ``ExtraTreesRegressor`` with default settings is fitted on ``(X, y)``
    and wrapped in ``SelectFromModel(..., prefit=True)`` to transform ``X``.

    Returns
    -------
    tuple
        ``(X_new, importances)``: the transformed feature matrix and the
        fitted regressor's ``feature_importances_``.
    """
    forest = ExtraTreesRegressor().fit(X, y)
    reduced = SelectFromModel(forest, prefit=True).transform(X)
    return reduced, forest.feature_importances_



In [92]:
def feature_extraction(X, y, threshold=0.02, n_features_to_select=50, random_state=None):
    """Run the four-stage feature-selection pipeline and return the reduced X.

    Stages, in order (the shape of ``X`` is printed after each one):

    1. variance threshold (``get_VT``),
    2. tree-importance selection (``tree_based_selection``),
    3. univariate mutual-information selection (``get_UFE``),
    4. recursive feature elimination (``get_RFE``).

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Regression target.
    threshold : float, default 0.02
        Variance threshold for stage 1.
    n_features_to_select : int, default 50
        Number of features kept by stage 4.
    random_state : int or None, default None
        Stages 2-4 build their estimators without an explicit seed, and
        scikit-learn then draws from the global NumPy random state. When an
        integer is given, that global state is seeded for the duration of the
        call so repeated calls select the same features, and is restored
        afterwards. ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    array
        ``X`` restricted to the columns that survived all four stages.
    """
    saved_state = None
    if random_state is not None:
        # Remember the caller's RNG state so seeding here does not leak out.
        saved_state = np.random.get_state()
        np.random.seed(random_state)

    try:
        X = get_VT(threshold).fit_transform(X, y)
        print(X.shape)

        # The feature importances returned alongside X are not needed here.
        X, _ = tree_based_selection(X, y)
        print(X.shape)

        X = get_UFE().fit_transform(X, y)
        print(X.shape)

        X = get_RFE(n_features_to_select).fit_transform(X, y)
        print(X.shape)

        return X
    finally:
        if saved_state is not None:
            np.random.set_state(saved_state)

In [93]:
# selector = get_VT(0.02)

# X = selector.fit_transform(X, y)

# print(X.shape)

In [94]:
# X, fea = tree_based_selection(X, y)
# print(X.shape)

In [95]:
# selector = get_UFE()

# X = selector.fit_transform(X, y)
# print(X.shape)
# print(X)

In [96]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components = 0.95)
# X = pca.fit_transform(X)
# print(X.shape)
# print(X)

In [97]:
# selector = get_RFE(30)

# X = selector.fit_transform(X, y)
# print(X.shape)
# print(X)

In [98]:
def get_feature_column_index(X, origin_X):
    """Locate each column of ``X`` in ``origin_X`` by comparing column values.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Selected features; non-DataFrame input is wrapped in a DataFrame.
    origin_X : pandas.DataFrame or array-like
        The matrix ``X`` was selected from; wrapped the same way.

    Returns
    -------
    list
        For every column of ``X``, in order, the label of the FIRST column of
        ``origin_X`` that ``Series.equals`` it. Columns of ``X`` with no match
        are skipped, so the list can be shorter than ``X`` is wide, and
        duplicated columns in ``origin_X`` all resolve to the first one.
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    if not isinstance(origin_X, pd.DataFrame):
        origin_X = pd.DataFrame(origin_X)

    column_header = []
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # equivalent (label, Series) iterator and exists in older versions too.
    for _, col in X.items():
        for origin_idx, origin_col in origin_X.items():
            if col.equals(origin_col):
                column_header.append(origin_idx)
                break
    return column_header

In [99]:
# Load the descriptor CSV whose path is configured in global_config and split
# it into the scaled feature matrix, the three target vectors and the SMILES
# column. These names are module-level state read by the cells below.
desc_csvfilepath = cfg.padel_csvfilepath
X, blood_y, brain_y, ratio_y, SMILES = get_X_Y(desc_csvfilepath)
print(X.shape)

  


Initial:  (411, 9196)
DropNA:  (411, 3576)
(411, 3572)


In [100]:
# Blood target: select 50 features, then map them back to column positions
# in the full feature matrix X.
print("Blood feature extraction:")
y = blood_y
extracted_X = feature_extraction(X, y, n_features_to_select=50)
column_header = get_feature_column_index(extracted_X, X)
print(column_header, len(column_header), sep="\n")

Blood feature extraction:
(411, 2060)
(411, 186)
(411, 148)
(411, 50)
[51, 101, 108, 133, 140, 166, 193, 209, 217, 233, 234, 238, 250, 261, 262, 278, 282, 283, 289, 293, 298, 306, 353, 635, 638, 709, 820, 822, 849, 853, 857, 860, 862, 877, 878, 903, 922, 965, 968, 969, 971, 1043, 1061, 1063, 1065, 1090, 1113, 1250, 1865, 2078]
50


In [101]:
# Brain target: select 50 features, then map them back to column positions
# in the full feature matrix X.
print("Brain feature extraction:")
y = brain_y
extracted_X = feature_extraction(X, y, n_features_to_select=50)
column_header = get_feature_column_index(extracted_X, X)
print(column_header, len(column_header), sep="\n")

Brain feature extraction:
(411, 2060)
(411, 164)
(411, 131)
(411, 50)
[2, 25, 26, 27, 28, 64, 69, 93, 109, 166, 167, 193, 209, 217, 233, 238, 258, 273, 274, 280, 282, 289, 294, 306, 540, 638, 704, 768, 837, 842, 849, 851, 865, 875, 909, 948, 949, 967, 1051, 1054, 1055, 1056, 1065, 1066, 1113, 1318, 1393, 1678, 1754, 1820]
50


In [102]:
# Ratio target: select 50 features, then map them back to column positions
# in the full feature matrix X.
print("Ratio feature extraction:")
y = ratio_y
extracted_X = feature_extraction(X, y, n_features_to_select=50)
column_header = get_feature_column_index(extracted_X, X)
print(column_header, len(column_header), sep="\n")

Ratio feature extraction:
(411, 2060)
(411, 156)
(411, 124)
(411, 50)
[26, 82, 133, 140, 152, 175, 201, 204, 218, 246, 261, 262, 269, 280, 286, 351, 352, 484, 507, 591, 592, 618, 629, 632, 641, 728, 750, 763, 769, 820, 822, 832, 838, 842, 845, 851, 855, 863, 865, 893, 933, 966, 969, 1062, 1063, 1065, 1067, 1077, 1089, 3290]
50
