In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold, RFE, RFECV, f_regression, mutual_info_regression, SelectPercentile, SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression as LR
from time import time
import pandas as pd

In [11]:
def get_X_Y(csvfile):
    """Load a descriptor CSV and split it into scaled features and targets.

    Processing steps, each followed by a shape print-out:

    1. read the CSV with ``pandas.read_csv``;
    2. drop every *column* that contains at least one NaN;
    3. drop duplicated rows.

    All columns other than 'SMILES', 'Blood', 'Brain' and 'Ratio' are cast to
    float64 and rescaled with a freshly fitted ``MinMaxScaler``.

    Parameters
    ----------
    csvfile : str or file-like
        Anything accepted by ``pandas.read_csv``. The table must contain the
        columns 'SMILES', 'Blood', 'Brain' and 'Ratio' after the NaN-column
        drop, otherwise pandas raises ``KeyError``.

    Returns
    -------
    X : numpy.ndarray
        Scaled feature matrix (one row per remaining sample).
    blood_y, brain_y, ratio_y : numpy.ndarray
        1-D arrays holding the 'Blood', 'Brain' and 'Ratio' columns.
    SMILES : pandas.Series
        The 'SMILES' column, index-aligned with the de-duplicated frame.
    """
    df = pd.read_csv(csvfile)
    print("Initial: ", df.shape)

    df = df.dropna(axis=1)
    print("DropNA: ", df.shape)
    df = df.drop_duplicates()
    print("Drop duplicates: ", df.shape)

    X = df.drop(['SMILES', 'Blood', 'Brain', 'Ratio'], axis=1).astype('float64')
    # NOTE(review): the scaler is fitted on every row; if a train/test split
    # happens later, this leaks test-set statistics into the features.
    X = MinMaxScaler().fit_transform(X)

    # Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
    # 1-D ndarray without the warning.
    blood_y = df['Blood'].to_numpy()
    brain_y = df['Brain'].to_numpy()
    ratio_y = df['Ratio'].to_numpy()
    SMILES = df['SMILES']
    return X, blood_y, brain_y, ratio_y, SMILES

In [12]:
def get_VT(thres=0.02):
    """Return an unfitted ``VarianceThreshold`` selector.

    ``thres`` is passed to ``VarianceThreshold`` as its first positional
    argument. The author's stated intent for the default of 0.02 is to delete
    features that are either one or zero in more than 98% of samples.
    """
    return VarianceThreshold(thres)

def get_RFE(n, random_state=None, step=1):
    """Return an unfitted recursive-feature-elimination selector.

    The base estimator is a ``RandomForestRegressor`` (not an SVM or logistic
    regression, which earlier drafts of this function experimented with).

    Parameters
    ----------
    n : int
        Forwarded to ``RFE`` as ``n_features_to_select``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``RandomForestRegressor``. The default ``None`` keeps the
        previous unseeded behaviour; pass an integer to make the selected
        feature set reproducible across runs.
    step : int or float, optional
        Forwarded to ``RFE``. The default of 1 matches ``RFE``'s own default,
        so existing callers are unaffected.

    Returns
    -------
    sklearn.feature_selection.RFE
        The configured, unfitted selector.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from sklearn.ensemble import RandomForestRegressor as RFR

    estimator = RFR(random_state=random_state)
    selector = RFE(estimator=estimator, n_features_to_select=n, step=step)
    return selector

def get_UFE(percentile=80, random_state=None):
    """Return an unfitted univariate ``SelectPercentile`` selector.

    Features are scored with ``mutual_info_regression`` (mutual information
    with the target), not with an F-test.

    Parameters
    ----------
    percentile : int, optional
        Forwarded to ``SelectPercentile`` as ``percentile``.
    random_state : int, RandomState instance or None, optional
        When given, it is bound to ``mutual_info_regression`` so repeated runs
        produce the same scores. The default ``None`` passes the scoring
        function through untouched, exactly as before.

    Returns
    -------
    sklearn.feature_selection.SelectPercentile
        The configured, unfitted selector.
    """
    score_func = mutual_info_regression
    if random_state is not None:
        # Local import keeps this function self-contained within the notebook.
        from functools import partial

        score_func = partial(mutual_info_regression, random_state=random_state)
    selector = SelectPercentile(score_func=score_func, percentile=percentile)
    return selector


def tree_based_selection(X, y):
    """Select features by extra-trees importance.

    An extra-trees ensemble is fitted on ``(X, y)`` and wrapped in
    ``SelectFromModel(prefit=True)`` to filter the columns of ``X``.

    The estimator is chosen from the target type: a continuous ``y`` is fitted
    with ``ExtraTreesRegressor`` (a classifier rejects continuous labels),
    while any other target keeps using ``ExtraTreesClassifier`` as before.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``fit`` and ``transform``.
    y : array-like
        Target values passed to ``fit``.

    Returns
    -------
    X_new : array-like
        ``X`` reduced to the columns kept by ``SelectFromModel``.
    feature_importances : numpy.ndarray
        ``feature_importances_`` of the fitted ensemble (one value per input
        column of ``X``).
    """
    # Local imports: these names are not part of the notebook's import cell.
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.utils.multiclass import type_of_target

    # "continuous" and "continuous-multioutput" both denote regression targets.
    if type_of_target(y).startswith("continuous"):
        clf = ExtraTreesRegressor()
    else:
        clf = ExtraTreesClassifier()
    clf = clf.fit(X, y)

    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X)
    return X_new, clf.feature_importances_


In [13]:
# Locate the descriptor table for the chosen run date.
filetime = "20221221"
desc_csvfilepath = f"./result/{filetime}/RatioDescriptors.csv"

# Load the scaled features together with the three targets and the SMILES column.
(X, blood_y, brain_y, ratio_y, SMILES) = get_X_Y(desc_csvfilepath)

# Keep a handle on the unfiltered matrix (later cells rebind X), and use the
# 'Blood' target for feature selection.
y, origin_X = blood_y, X
print(X.shape)

Initial:  (420, 1617)
DropNA:  (420, 1588)
Drop duplicates:  (364, 1588)
(364, 1584)


In [14]:
# Step 1: variance filter.
# This cell rebinds X, so re-running it filters the already-filtered matrix.
selector = get_VT(thres=0.02)
selector.fit(X, y)
X = selector.transform(X)
print(X.shape)

(364, 627)


In [15]:
# Optional alternative (disabled): tree-based feature selection.
# X, fea = tree_based_selection(X, y)
# print(X.shape)

In [16]:
# Step 2: univariate percentile filter with get_UFE's default percentile.
# This cell rebinds X, so re-running it filters the already-filtered matrix.
selector = get_UFE()
selector.fit(X, y)
X = selector.transform(X)
print(X.shape)

(364, 501)


In [17]:
# Step 3: recursive feature elimination down to 200 columns.
# This cell rebinds X, so re-running it operates on the already-reduced matrix.
selector = get_RFE(n=200)
X = selector.fit_transform(X, y)
print(X.shape)

(364, 200)


In [18]:
# Recover, for every selected column of X, its position in the unfiltered
# matrix by looking for an element-wise identical column in origin_X.
X = pd.DataFrame(X)
origin_X = pd.DataFrame(origin_X)

column_header = []
# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent
# (label, Series) iterator and is available in older versions too.
for idx, col in X.items():
    for origin_idx, origin_col in origin_X.items():
        if col.equals(origin_col):
            # First match wins: if origin_X contains identical columns, they
            # all resolve to the lowest index. A column with no match is
            # silently skipped.
            column_header.append(origin_idx)
            break
print(column_header)
print(len(column_header))

[5, 10, 60, 76, 111, 129, 132, 142, 150, 151, 162, 164, 168, 183, 184, 186, 196, 203, 220, 222, 223, 230, 231, 243, 246, 247, 254, 255, 261, 264, 270, 289, 300, 310, 311, 320, 323, 325, 333, 334, 337, 338, 350, 359, 368, 369, 396, 397, 404, 405, 414, 422, 441, 446, 449, 452, 453, 458, 465, 466, 474, 481, 489, 497, 498, 502, 506, 513, 514, 522, 529, 530, 534, 540, 541, 542, 546, 549, 561, 565, 566, 567, 568, 570, 572, 581, 582, 589, 594, 597, 598, 599, 600, 602, 604, 610, 618, 626, 629, 630, 631, 632, 637, 638, 642, 644, 645, 646, 648, 665, 781, 788, 791, 802, 803, 807, 814, 816, 825, 832, 838, 846, 847, 848, 978, 981, 986, 987, 999, 1055, 1057, 1065, 1072, 1086, 1134, 1136, 1139, 1143, 1144, 1145, 1156, 1157, 1161, 1163, 1165, 1209, 1211, 1217, 1219, 1221, 1226, 1228, 1232, 1235, 1236, 1238, 1245, 1249, 1250, 1252, 1253, 1257, 1259, 1262, 1263, 1266, 1278, 1281, 1282, 1286, 1287, 1289, 1297, 1301, 1306, 1309, 1311, 1314, 1319, 1325, 1328, 1330, 1333, 1337, 1356, 1357, 1358, 1360, 1362,