In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold, RFE, RFECV, f_regression, mutual_info_regression, SelectPercentile, SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression as LR
from time import time
import global_config as cfg
import pandas as pd

In [11]:
def get_X_Y(csvfile):
    """Load a descriptor CSV and split it into scaled features and targets.

    The table is cleaned in two steps, each followed by a shape print-out:
    every column containing at least one missing value is dropped, then
    exact duplicate rows are dropped.

    Parameters
    ----------
    csvfile : str, path object or file-like
        Anything accepted by ``pandas.read_csv``. The table must contain the
        columns ``SMILES``, ``Blood``, ``Brain`` and ``Ratio``; every other
        column is treated as a numeric feature.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix (all non-target columns cast to float64) after
        ``MinMaxScaler().fit_transform``.
    blood_y, brain_y, ratio_y : numpy.ndarray
        1-D arrays holding the ``Blood``, ``Brain`` and ``Ratio`` columns.
    SMILES : pandas.Series
        The ``SMILES`` column, keeping the index of the cleaned frame.

    Raises
    ------
    KeyError
        If one of the four named columns is absent. Note that a target
        column containing a missing value is removed by the NaN-column drop
        and therefore also triggers this error.
    """
    df = pd.read_csv(csvfile)
    print("Initial: ", df.shape)

    # axis=1 removes whole *columns* that contain any NaN, not rows.
    df = df.dropna(axis=1)
    print("DropNA: ", df.shape)
    df = df.drop_duplicates()
    print("Drop duplicates: ", df.shape)

    X = df.drop(['SMILES', 'Blood', 'Brain', 'Ratio'], axis=1).astype('float64')
    # The scaler returns a plain ndarray, so column labels are lost here.
    X = MinMaxScaler().fit_transform(X)

    # Series.ravel() is deprecated in recent pandas releases; to_numpy()
    # yields the same 1-D ndarray without the deprecation warning.
    blood_y = df['Blood'].to_numpy()
    brain_y = df['Brain'].to_numpy()
    ratio_y = df['Ratio'].to_numpy()
    SMILES = df['SMILES']
    return X, blood_y, brain_y, ratio_y, SMILES

In [12]:
def get_VT(thres=0.02):
    """Return an unfitted ``VarianceThreshold`` selector.

    ``thres`` is handed to ``VarianceThreshold`` unchanged. The default of
    0.02 reflects the author's intent of discarding binary features that are
    one or zero in more than roughly 98% of samples: a 0/1 feature with
    frequency p has variance p * (1 - p), and 0.98 * 0.02 is about 0.02.
    """
    return VarianceThreshold(thres)

def get_RFE(n, random_state=None):
    """Return an unfitted recursive-feature-elimination selector.

    The ranking estimator is a ``RandomForestRegressor``. Earlier experiments
    with an SVC, a logistic regression and cross-validated RFECV were
    abandoned in favour of this regressor.

    Parameters
    ----------
    n : int
        Forwarded to ``RFE`` as ``n_features_to_select``.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to the random forest. A forest is a randomized
        estimator, so without a seed the selected feature set can differ
        between runs. ``None`` keeps the previous (unseeded) behaviour.

    Returns
    -------
    sklearn.feature_selection.RFE
        Selector ready for ``fit`` / ``fit_transform``.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from sklearn.ensemble import RandomForestRegressor as RFR

    estimator = RFR(random_state=random_state)
    return RFE(estimator=estimator, n_features_to_select=n)

def get_UFE(percentile=80):
    """Return an unfitted univariate selector scored by mutual information.

    The selector is a ``SelectPercentile`` using ``mutual_info_regression``
    as its scoring function; ``percentile`` is forwarded unchanged.
    """
    return SelectPercentile(
        percentile=percentile,
        score_func=mutual_info_regression,
    )


def tree_based_selection(X, y, random_state=None):
    """Reduce ``X`` to the features an extra-trees model considers important.

    An ``ExtraTreesRegressor`` is fitted on ``(X, y)`` and wrapped in a
    prefit ``SelectFromModel`` (with that selector's default threshold) to
    filter the columns of ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Regression target.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to the extra-trees model. The model is randomized, so
        without a seed the retained columns can differ between runs.
        ``None`` keeps the previous (unseeded) behaviour.

    Returns
    -------
    X_new : array
        ``X`` restricted to the retained columns.
    importances : numpy.ndarray
        ``feature_importances_`` of the fitted model, one value per column
        of the *input* ``X`` (not of ``X_new``).
    """
    clf = ExtraTreesRegressor(random_state=random_state)
    clf.fit(X, y)

    # prefit=True: reuse the already-fitted model instead of refitting it.
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X)
    return X_new, clf.feature_importances_



In [13]:
# Load the descriptor table named in global_config. The 'Blood' column is the
# regression target for every selection step below, and origin_X keeps a
# handle on the full scaled matrix so selected columns can be traced back.
desc_csvfilepath = cfg.ECCF_csvfilepath
X, blood_y, brain_y, ratio_y, SMILES = get_X_Y(desc_csvfilepath)
origin_X, y = X, blood_y
print(X.shape)

Initial:  (420, 3832)
DropNA:  (420, 3803)
Drop duplicates:  (364, 3803)
(364, 3799)


In [14]:
# Step 1: variance filter built by get_VT with a 0.02 threshold.
selector = get_VT(thres=0.02)
X = selector.fit_transform(X, y)
print(X.shape)

(364, 1610)


In [15]:
# Step 2: keep only the columns retained by the extra-trees based selector.
# `fea` holds the model's feature importances for the columns that were
# passed in (i.e. before this reduction), not for the reduced matrix.
X, fea = tree_based_selection(X, y)
print(X.shape)

(364, 127)


In [16]:
# Step 3: univariate filter from get_UFE (mutual-information scoring, default
# percentile), then show the resulting shape followed by the matrix itself.
selector = get_UFE()
X = selector.fit_transform(X, y)
print(X.shape, X, sep="\n")

(364, 101)
[[0.         0.34375    0.1235651  ... 0.         0.         0.        ]
 [0.         0.375      0.25167211 ... 0.         0.         0.        ]
 [0.         0.375      0.19822678 ... 0.         1.         0.        ]
 ...
 [0.         0.1875     0.18422772 ... 0.         0.         0.        ]
 [0.33333333 0.375      0.38712086 ... 0.         0.         0.        ]
 [0.         0.5625     0.53191269 ... 0.         0.         0.        ]]


In [17]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components = 0.95)
# X = pca.fit_transform(X)
# print(X.shape)
# print(X)

In [18]:
# Step 4: recursive feature elimination down to 50 features, then show the
# resulting shape followed by the matrix itself.
selector = get_RFE(n=50)
X = selector.fit_transform(X, y)
print(X.shape, X, sep="\n")

(364, 50)
[[0.         0.18083146 0.15281024 ... 0.26196189 0.76923077 0.        ]
 [0.         0.3071882  0.2609079  ... 0.24522396 0.8        0.        ]
 [0.         0.23100553 0.21529457 ... 0.18019614 1.         0.        ]
 ...
 [0.         0.19762441 0.13322477 ... 0.45165843 0.66666667 0.        ]
 [0.33333333 0.49232029 0.42457957 ... 0.21651376 1.         0.        ]
 [0.         0.62256809 0.63487368 ... 0.35847417 1.         0.        ]]


In [19]:
# Recover, for every surviving feature, its column position in the original
# scaled matrix. The selectors return bare arrays without labels, so columns
# are matched by exact value equality against origin_X.
X = pd.DataFrame(X)
origin_X = pd.DataFrame(origin_X)
column_header = []
# DataFrame.iteritems() was removed in pandas 2.0; items() is the supported
# spelling and iterates the same (label, Series) pairs.
for idx, col in X.items():
    for origin_idx, origin_col in origin_X.items():
        if col.equals(origin_col):
            # First match wins: if several original columns are identical,
            # the lowest index is reported. A column with no match is skipped.
            column_header.append(origin_idx)
            break
print(column_header)
print(len(column_header))

[3, 67, 78, 186, 243, 264, 270, 300, 334, 341, 359, 397, 449, 450, 457, 458, 502, 506, 542, 546, 565, 570, 572, 594, 602, 604, 610, 618, 630, 642, 648, 651, 678, 729, 755, 781, 792, 815, 986, 1060, 1136, 1139, 1143, 1163, 1226, 1246, 1278, 1539, 1552, 2544]
50
