In [21]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold, RFE, RFECV, f_regression, mutual_info_regression, SelectPercentile, SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression as LR
from time import time
import global_config as cfg
import pandas as pd

In [22]:
def get_X_Y(csvfile):
    """Load a descriptor CSV and split it into scaled features and targets.

    Columns containing any missing value are dropped first, then duplicated
    rows are removed. Every remaining column except ``SMILES``, ``Blood``,
    ``Brain`` and ``Ratio`` is cast to float64 and rescaled with
    ``MinMaxScaler``. The frame shape is printed after each cleaning step.

    Parameters
    ----------
    csvfile : str or path-like
        Path (or anything ``pandas.read_csv`` accepts) of a CSV holding the
        ``SMILES``, ``Blood``, ``Brain`` and ``Ratio`` columns plus the
        feature columns.

    Returns
    -------
    X
        Scaled feature matrix as returned by ``MinMaxScaler.fit_transform``.
    blood_y, brain_y, ratio_y : numpy.ndarray
        1-D arrays of the ``Blood``, ``Brain`` and ``Ratio`` columns.
    SMILES : pandas.Series
        The ``SMILES`` column of the cleaned frame.

    Raises
    ------
    KeyError
        If one of the four named columns is absent after cleaning (for
        example because it contained a missing value and was dropped).
    """
    df = pd.read_csv(csvfile)
    print("Initial: ", df.shape)

    df = df.dropna(axis=1)
    print("DropNA: ", df.shape)
    df = df.drop_duplicates()
    print("Drop duplicates: ", df.shape)

    target_cols = ['Blood', 'Brain', 'Ratio']
    features = df.drop(columns=['SMILES'] + target_cols).astype('float64')
    # NOTE(review): the scaler is fit on every row; if a held-out split is
    # evaluated downstream, confirm this does not leak test statistics.
    X = MinMaxScaler().fit_transform(features)

    # Series.ravel() is deprecated since pandas 2.2 (a Series is already 1-D);
    # to_numpy() yields the same 1-D ndarray without the warning.
    blood_y, brain_y, ratio_y = (df[col].to_numpy() for col in target_cols)
    return X, blood_y, brain_y, ratio_y, df['SMILES']

In [23]:
def get_VT(thres=0.02):
    """Return an unfitted ``VarianceThreshold`` selector using ``thres``.

    For a 0/1 feature the variance is p * (1 - p), so the default of 0.02
    roughly corresponds to discarding features that take the same value
    (one or zero) in more than 98% of the samples.
    """
    return VarianceThreshold(threshold=thres)

def get_RFE(n, random_state=None):
    """Return an unfitted recursive-feature-elimination selector.

    Parameters
    ----------
    n : int
        Number of features to keep; forwarded to ``RFE`` as
        ``n_features_to_select``.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to the random-forest base estimator. The default
        keeps the previous unseeded behaviour; pass an integer to make the
        selected feature subset reproducible between runs.

    Returns
    -------
    RFE
        Selector wrapping a ``RandomForestRegressor``.
    """
    # Imported locally because the notebook's import cell does not bring in
    # RandomForestRegressor.
    from sklearn.ensemble import RandomForestRegressor

    # Alternatives previously tried here and left out: SVC(kernel="rbf"),
    # a liblinear LogisticRegression, and RFECV with StratifiedKFold(2).
    estimator = RandomForestRegressor(random_state=random_state)
    return RFE(estimator=estimator, n_features_to_select=n)

def get_UFE(percentile=80):
    """Return an unfitted univariate selector scored by mutual information.

    ``percentile`` is forwarded unchanged to ``SelectPercentile``; the
    scoring function is ``mutual_info_regression``.
    """
    return SelectPercentile(mutual_info_regression, percentile=percentile)


def tree_based_selection(X, y, random_state=None):
    """Select features with ``SelectFromModel`` on a fitted extra-trees model.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``ExtraTreesRegressor.fit`` and then to
        ``SelectFromModel.transform``.
    y : array-like
        Regression target passed to ``ExtraTreesRegressor.fit``.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``ExtraTreesRegressor``. The default keeps the
        previous unseeded behaviour; pass an integer so the retained feature
        set is the same on every run.

    Returns
    -------
    X_new
        ``X`` reduced to the features kept by ``SelectFromModel``.
    feature_importances
        ``feature_importances_`` of the fitted forest, covering every input
        feature (not only the retained ones).
    """
    forest = ExtraTreesRegressor(random_state=random_state)
    forest.fit(X, y)
    # prefit=True: reuse the forest fitted above instead of refitting a clone.
    selector = SelectFromModel(forest, prefit=True)
    return selector.transform(X), forest.feature_importances_



In [24]:
# Load the descriptor CSV whose path is configured as cfg.padel_csvfilepath
# and unpack the scaled features, the three targets and the SMILES column.
desc_csvfilepath = cfg.padel_csvfilepath
X, blood_y, brain_y, ratio_y, SMILES = get_X_Y(desc_csvfilepath)
# Keep a reference to the full scaled matrix: the selection cells below
# rebind X to progressively smaller matrices.
origin_X = X
# The Blood column is the target used by every selection step below.
y = blood_y
print(X.shape)

  


Initial:  (414, 8093)
DropNA:  (414, 1725)
Drop duplicates:  (358, 1725)
(358, 1721)


In [25]:
# Step 1: variance filter with threshold 0.02 (see get_VT).
selector = get_VT(0.02)

# X is rebound to the reduced matrix, so the cells must be run in order
# starting from the data-loading cell.
X = selector.fit_transform(X, y)

print(X.shape)

(358, 753)


In [26]:
# Step 2: keep the features chosen by SelectFromModel on an extra-trees
# regressor; `fea` receives the importances of all features fed in.
# NOTE(review): no seed is passed here, so the number of retained features
# presumably varies between runs — confirm before relying on a fixed count.
X, fea = tree_based_selection(X, y)
print(X.shape)

(358, 56)


In [27]:
# Step 3: univariate filter scored by mutual information with y, using the
# get_UFE default percentile of 80.
selector = get_UFE()

X = selector.fit_transform(X, y)
print(X.shape)
print(X)

(358, 44)
[[0.35483871 0.32352941 0.87205263 ... 0.         1.         1.        ]
 [0.38709677 0.35294118 0.6858884  ... 1.         1.         1.        ]
 [0.38709677 0.35294118 0.7049152  ... 1.         1.         1.        ]
 ...
 [0.         0.         0.3435903  ... 0.         0.         0.        ]
 [0.38709677 0.35294118 0.40730173 ... 1.         1.         1.        ]
 [0.58064516 0.52941176 0.59453575 ... 1.         1.         1.        ]]


In [28]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components = 0.95)
# X = pca.fit_transform(X)
# print(X.shape)
# print(X)

In [29]:
# Step 4: recursive feature elimination down to 30 features, using the
# random-forest base estimator built in get_RFE.
selector = get_RFE(30)

X = selector.fit_transform(X, y)
print(X.shape)
print(X)

(358, 30)
[[0.35483871 0.32352941 0.87205263 ... 1.         0.         1.        ]
 [0.38709677 0.35294118 0.6858884  ... 1.         1.         1.        ]
 [0.38709677 0.35294118 0.7049152  ... 1.         1.         1.        ]
 ...
 [0.         0.         0.3435903  ... 0.         0.         0.        ]
 [0.38709677 0.35294118 0.40730173 ... 1.         1.         1.        ]
 [0.58064516 0.52941176 0.59453575 ... 1.         1.         1.        ]]


In [30]:
X = pd.DataFrame(X)
origin_X = pd.DataFrame(origin_X)

# Map every selected column back to its position in the original scaled
# matrix by looking for the first original column with identical values.
# DataFrame.iteritems() was removed in pandas 2.0; items() is the drop-in
# replacement and yields the same (label, Series) pairs.
# NOTE(review): if two original columns hold identical values, the first one
# wins, so the recovered index may not be the column that was really kept.
column_header = []
for _, col in X.items():
    for origin_idx, origin_col in origin_X.items():
        if col.equals(origin_col):
            column_header.append(origin_idx)
            break
print(column_header)
print(len(column_header))

[1, 2, 17, 82, 98, 100, 106, 122, 124, 129, 156, 163, 203, 220, 226, 299, 303, 312, 313, 314, 317, 320, 350, 352, 362, 461, 903, 1003, 1188, 1242]
30
