In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
import itertools
from tqdm import tqdm
from sklearn.feature_selection import RFE

In [2]:
df = pd.read_csv('logP.csv')
y = df.logP
X = df.iloc[:, 3:]
feats = X.columns

In [3]:
def feature_selection(X, y, features: list, n_features = 2):

    r2_values = []

    for n_feat in tqdm(range(1, n_features+1)):

        combs = itertools.combinations(features, n_feat)

        for feature_subset in combs:

            model = linear_model.LinearRegression()

            X_train = X[[*feature_subset]]

            model.fit(X_train, y)
            r2_value = model.score(X_train, y)

            r2_values.append((n_feat, r2_value, ' '.join(feature_subset)))

    r2_df = pd.DataFrame(r2_values, columns=['n', 'r2', 'cols'])

    return r2_df.groupby('n').max()


In [4]:
model = linear_model.LinearRegression()

n_feats = 50
rfe = RFE(estimator=model, n_features_to_select= n_feats)

selected  = rfe.fit_transform(X, y)
selected_feature_indices = rfe.get_support()
selected_feature_names = X.columns[selected_feature_indices]

X_new = pd.DataFrame(selected, columns = selected_feature_names)
feats2 = X_new.columns
X_new

Unnamed: 0,QED_score,qed,MolWt,ExactMolWt,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,BCUT2D_LOGPLOW,...,NumHAcceptors,NumHeteroatoms,fr_Ar_NH,fr_N_O,fr_dihydropyridine,fr_imidazole,fr_ketone,fr_ketone_Topliss,fr_oxime,fr_tetrazole
0,0.909,0.909253,281.355,281.141579,-0.496454,0.496454,0.224838,1.095238,1.809524,-2.237256,...,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.839,0.839542,262.183,262.045293,-0.496624,0.496624,0.455288,1.444444,2.111111,-2.533527,...,4.0,7.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,0.828,0.828333,266.388,266.178299,-0.377661,0.377661,0.036092,0.650000,0.950000,-2.080683,...,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.726,0.726080,318.372,318.125594,-0.462534,0.462534,0.176481,0.875000,1.416667,-2.249719,...,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.816,0.816043,214.312,214.146999,-0.281751,0.281751,0.066847,1.125000,1.812500,-2.107302,...,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,0.863,0.863227,381.501,381.151098,-0.326039,0.326039,0.241916,1.148148,1.814815,-2.296841,...,4.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
240,0.737,0.737878,321.376,321.136493,-0.489991,0.489991,0.261809,0.875000,1.541667,-2.198811,...,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
241,0.824,0.824192,274.266,274.080536,-0.383653,0.383653,0.176462,0.850000,1.250000,-2.250916,...,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
242,0.750,0.750035,321.312,321.088080,-0.578172,0.578172,0.220747,1.166667,1.875000,-2.120952,...,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
res = feature_selection(X_new, y, feats2, 4)
res

100%|██████████| 4/4 [04:14<00:00, 63.51s/it]


Unnamed: 0_level_0,r2,cols
n,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302003,qed
2,0.456337,qed fr_tetrazole
3,0.494268,qed fr_oxime fr_tetrazole
4,0.514637,qed fr_ketone_Topliss fr_oxime fr_tetrazole
