In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold, RFE, RFECV, f_regression, mutual_info_regression, SelectPercentile, SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression as LR
from time import time
import global_config as cfg
import pandas as pd

In [2]:
def get_X_Y(csvfile):
    """Load the descriptor CSV and split it into scaled features and targets.

    Columns that contain any missing value are dropped first, then duplicate
    rows are removed. The frame's shape is printed after each step.

    Parameters
    ----------
    csvfile : str or path-like
        Passed straight to ``pandas.read_csv``. The file must provide the
        columns ``SMILES``, ``Blood``, ``Brain`` and ``Ratio``; every other
        surviving column must be castable to ``float64``.

    Returns
    -------
    X : numpy.ndarray
        All non-label columns after ``MinMaxScaler().fit_transform``.
    blood_y, brain_y, ratio_y : numpy.ndarray
        1-D arrays holding the ``Blood``, ``Brain`` and ``Ratio`` columns.
    SMILES : pandas.Series
        The ``SMILES`` column. It keeps the index labels left over after
        ``drop_duplicates`` (the index is not reset).
    """
    df = pd.read_csv(csvfile)
    print("Initial: ", df.shape)

    df = df.dropna(axis=1)
    print("DropNA: ", df.shape)
    df = df.drop_duplicates()
    print("Drop duplicates: ", df.shape)

    label_columns = ['SMILES', 'Blood', 'Brain', 'Ratio']
    X = df.drop(label_columns, axis=1).astype('float64')
    # NOTE(review): the scaler is fit on every row, so any later train/test
    # split will have seen test-set minima/maxima -- confirm this is intended.
    X = MinMaxScaler().fit_transform(X)

    # Series.ravel() is deprecated in recent pandas releases; to_numpy()
    # returns the same 1-D ndarray without the FutureWarning.
    blood_y = df['Blood'].to_numpy()
    brain_y = df['Brain'].to_numpy()
    ratio_y = df['Ratio'].to_numpy()
    SMILES = df['SMILES']
    return X, blood_y, brain_y, ratio_y, SMILES

In [3]:
def get_VT(thres=0.02):
    """Return an unfitted ``VarianceThreshold`` selector.

    ``thres`` is handed to ``VarianceThreshold`` as its variance cutoff.
    The default of 0.02 is meant to delete features that are either one or
    zero in more than 98% of samples (for a 0/1 feature the variance is
    p * (1 - p), and 0.98 * 0.02 is roughly 0.02).
    """
    return VarianceThreshold(thres)

def get_RFE(n, random_state=None):
    """Build an unfitted recursive-feature-elimination selector.

    The base estimator is a ``RandomForestRegressor``. (Earlier experiments
    with SVC / logistic-regression estimators and with RFECV were removed
    from this function; see version control if they are needed again.)

    Parameters
    ----------
    n
        Number of features to keep; passed to ``RFE`` as
        ``n_features_to_select``.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to the random-forest base estimator. The default
        ``None`` keeps the original unseeded behaviour; pass an integer to
        make the selected feature subset reproducible between runs.

    Returns
    -------
    RFE
        The configured, unfitted selector.
    """
    # Imported locally because the notebook's import cell does not bring
    # RandomForestRegressor into scope.
    from sklearn.ensemble import RandomForestRegressor as RFR

    estimator = RFR(random_state=random_state)
    return RFE(estimator=estimator, n_features_to_select=n)

def get_UFE(percentile=80):
    """Return an unfitted univariate selector scored by mutual information.

    ``percentile`` is forwarded to ``SelectPercentile``; the scoring
    function is ``mutual_info_regression``.
    """
    return SelectPercentile(
        score_func=mutual_info_regression,
        percentile=percentile,
    )


def tree_based_selection(X, y, random_state=None):
    """Select features using the importances of an extra-trees regressor.

    An ``ExtraTreesRegressor`` is fit on ``(X, y)`` and wrapped in
    ``SelectFromModel(prefit=True)`` with that selector's default threshold.

    Parameters
    ----------
    X
        Feature matrix the regressor is fit on and that is then reduced.
    y
        Regression target.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``ExtraTreesRegressor``. The default ``None``
        keeps the original unseeded behaviour; pass an integer so that the
        set of retained features is reproducible between runs.

    Returns
    -------
    X_new
        ``X`` restricted to the columns kept by ``SelectFromModel``.
    feature_importances
        ``feature_importances_`` of the fitted regressor, one value per
        input column of ``X``.
    """
    forest = ExtraTreesRegressor(random_state=random_state)
    forest.fit(X, y)
    # prefit=True: reuse the already-fitted forest instead of refitting it.
    selector = SelectFromModel(forest, prefit=True)
    X_new = selector.transform(X)
    return X_new, forest.feature_importances_



In [4]:
# Load the descriptor table from the path configured in global_config
# (presumably PaDEL descriptors, going by the attribute name -- confirm).
desc_csvfilepath = cfg.padel_csvfilepath
X, blood_y, brain_y, ratio_y, SMILES = get_X_Y(desc_csvfilepath)
# Keep a handle on the full scaled matrix: the last cell compares the selected
# columns against it to recover their original column positions.
origin_X = X
# The 'Ratio' column is the target used by every selection step below.
y = ratio_y
print(X.shape)

  


Initial:  (414, 9706)
DropNA:  (414, 3309)
Drop duplicates:  (358, 3309)
(358, 3305)


In [5]:
# Step 1: drop low-variance columns (variance cutoff 0.02).
selector = get_VT(0.02)

# NOTE: X is rebound in place, so re-running this cell filters the already
# filtered matrix; restart from the loading cell to reproduce the shapes.
X = selector.fit_transform(X, y)

print(X.shape)

(358, 1385)


In [6]:
# Step 2: keep the columns chosen by the extra-trees importance selector.
# `fea` holds the importances of the columns that were passed in.
X, fea = tree_based_selection(X, y)
print(X.shape)

(358, 108)


In [7]:
# Step 3: univariate selection -- keep the default top 80th percentile of
# columns ranked by mutual information with y.
selector = get_UFE()

X = selector.fit_transform(X, y)
print(X.shape)
print(X)

(358, 86)
[[0.34375    0.17272727 0.13500482 ... 1.         1.         1.        ]
 [0.375      0.29166667 0.25650916 ... 1.         1.         1.        ]
 [0.375      0.17575758 0.19864995 ... 1.         1.         1.        ]
 ...
 [0.1875     0.27878788 0.07618129 ... 0.         0.         0.        ]
 [0.375      0.38787879 0.34715526 ... 1.         1.         0.        ]
 [0.5625     0.53207071 0.58052073 ... 1.         1.         0.        ]]


In [8]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components = 0.95)
# X = pca.fit_transform(X)
# print(X.shape)
# print(X)

In [9]:
# Step 4: recursive feature elimination down to 50 columns.
selector = get_RFE(50)

X = selector.fit_transform(X, y)
print(X.shape)
print(X)

(358, 50)
[[0.17272727 0.13500482 0.25402322 ... 0.76107454 0.14885664 0.56809405]
 [0.29166667 0.25650916 0.32824506 ... 0.81706112 0.27616535 0.50799694]
 [0.17575758 0.19864995 0.28921875 ... 0.77503097 0.15127529 0.28341585]
 ...
 [0.27878788 0.07618129 0.27978368 ... 0.57646998 0.24131486 0.        ]
 [0.38787879 0.34715526 0.4791944  ... 0.7110843  0.36587511 0.32274033]
 [0.53207071 0.58052073 0.62232751 ... 0.84812377 0.43359719 0.33051587]]


In [10]:
# Recover which columns of the original scaled matrix survived the selection
# pipeline by matching column values, since the selectors above return bare
# arrays without column labels.
X = pd.DataFrame(X)
origin_X = pd.DataFrame(origin_X)

column_header = []
# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent
# (label, Series) iterator and also exists in older releases.
for idx, col in X.items():
    for origin_idx, origin_col in origin_X.items():
        # Only the first matching original column is recorded, so if two
        # original columns hold identical values the earlier index wins.
        if col.equals(origin_col):
            column_header.append(origin_idx)
            break
print(column_header)
print(len(column_header))

[39, 43, 73, 74, 144, 222, 413, 453, 457, 497, 547, 570, 602, 626, 648, 704, 755, 781, 820, 825, 847, 907, 981, 1057, 1061, 1065, 1139, 1140, 1143, 1209, 1213, 1221, 1226, 1259, 1383, 1432, 1534, 1543, 1601, 1666, 1682, 1690, 1708, 1723, 1747, 1755, 1802, 1808, 1891, 1907]
50
