# Basic imports

In [1]:
import pandas as pd
import os
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
# Resolve the project root by removing the notebook's own folder name from the
# working directory. os.sep is used instead of a literal backslash so the strip
# also takes effect on POSIX paths; on Windows the result is unchanged.
current_path = os.getcwd()
root_path = current_path.replace(os.sep + 'forward_feature_selection', '')

In [3]:
# Tab-separated input table; os.path.join picks the platform's separator
# instead of a hard-coded backslash.
mixed_df=pd.read_csv(os.path.join(root_path,"molecules.csv"),sep="\t")

# Functions

In [4]:
def save_df_to_disk(df,name:str,separator="\t"):
    """Write ``df`` to ``name`` as delimited text.

    The header row is written, the index column is not. ``separator`` is the
    field delimiter (tab by default).
    """
    df.to_csv(
        name,
        sep=separator,
        header=True,
        index=False,
    )

In [5]:
def get_data_and_true_prediction(df,not_wanted_features:list):
    """Split ``df`` into a feature frame and a target column.

    The columns listed in ``not_wanted_features`` are dropped first. Of the
    remaining columns, the last one is returned as ``y`` and all the others
    as ``x``. ``df`` itself is not modified.

    Returns:
        (x, y): DataFrame of features, Series holding the last column.
    """
    kept = df.drop(columns=not_wanted_features)
    target_name = kept.columns[-1]
    features = kept.drop(columns=[target_name])
    target = kept[target_name]
    return features,target

In [6]:
def get_df_with_name_and_prediction(df,true_prediction,big_df):
    """Return ``df`` with an ``m_name`` column in front and the target joined on.

    Args:
        df: feature frame to decorate; it is left untouched.
        true_prediction: named Series (or frame) joined on the index as the
            trailing column(s).
        big_df: frame providing the ``m_name`` values, taken positionally
            (``.values``), so it must have as many rows as ``df``.

    Returns:
        A new DataFrame: ``m_name``, then the columns of ``df``, then
        ``true_prediction``.
    """
    # Work on a copy: DataFrame.insert mutates in place, so inserting into the
    # caller's frame would alter it and make a second call fail because
    # "m_name" already exists.
    new_df = df.copy()
    new_df.insert(0,"m_name",big_df["m_name"].values)
    return new_df.join(true_prediction)

In [7]:
def get_roc_auc_score(x,y,model):
    """Return the mean ROC AUC of ``model`` on ``x``/``y``.

    ``cross_validate`` is run with ``cv=10`` and ``scoring='roc_auc'``; the
    result is the mean of its ``'test_score'`` array.
    """
    fold_scores = cross_validate(model, x, y, cv=10, scoring='roc_auc')['test_score']
    return fold_scores.mean()

In [8]:
def get_best_score_index(score_lst:list):
    """Return the position of the highest score; the earliest one wins ties."""
    top_score = max(score_lst)
    # Collect every position holding the top score, then keep the first.
    top_positions = [pos for pos in range(len(score_lst)) if score_lst[pos] == top_score]
    return top_positions[0]

In [9]:
def show_results(df,roc,f_var,s_var):
    """Print a result summary.

    Prints the two parameter values, the ROC AUC value ``roc`` and a numbered
    list (starting at 1) of the column names of ``df``.
    """
    banner = "\n------------Results-------------\n"
    headline = "Best model when first param is {} and second param is {}".format(f_var,s_var)
    print(banner)
    print(headline)
    print("Roc auc = {}".format(roc))
    print("\nFeatures\n")
    for rank,column in enumerate(df.columns, start=1):
        print("{}. {}".format(rank,column))
    print("")

In [10]:
def left_partition_is_better(first_score,middle_score,last_score):
    """Decide whether the search should continue in the left half.

    Returns True when
      * the scores do not increase from first to middle to last, or
      * the middle score is the highest and the last score does not exceed
        the first, or
      * the middle score is the lowest and the first score is strictly
        greater than the last.
    Returns False in every other case (including when a comparison cannot be
    satisfied, e.g. with NaN).
    """
    first_ge_middle = first_score >= middle_score
    first_le_middle = first_score <= middle_score
    middle_ge_last = middle_score >= last_score
    middle_le_last = middle_score <= last_score
    first_ge_last = first_score >= last_score
    last_ge_first = last_score >= first_score

    # Non-increasing left to right.
    if first_ge_middle and middle_ge_last and first_ge_last:
        return True
    # Middle is the peak and the right end is no better than the left end.
    if first_le_middle and middle_ge_last and first_ge_last:
        return True
    # Middle is the valley: the left wins only when strictly above the right.
    if first_ge_middle and middle_le_last:
        if last_ge_first:
            return False
        if first_ge_last:
            return True
    return False

In [11]:
def get_model(first_prm,second_prm):
    """Build a KNeighborsClassifier with the Minkowski metric.

    ``first_prm`` is passed as ``n_neighbors`` and ``second_prm`` as ``p``.
    """
    knn_settings = {
        'n_neighbors': first_prm,
        'p': second_prm,
        'metric': 'minkowski',
    }
    return KNeighborsClassifier(**knn_settings)

# Forward selection

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier

In [13]:
def forward_selection(x,y,model):
    """Greedy forward feature selection scored with ``get_roc_auc_score``.

    Starting from the single best column, the column whose addition gives the
    highest score is appended round after round. The search stops when the
    best candidate of a round scores strictly lower than the current
    selection (a tie still adds the column) or when no columns are left.

    Cost: the first round evaluates every column and each later round every
    remaining column, so the worst case is n*(n+1)/2 calls to
    ``get_roc_auc_score`` for n columns.

    Args:
        x: DataFrame of candidate features (at least one column).
        y: target passed through to ``get_roc_auc_score``.
        model: estimator passed through to ``get_roc_auc_score``.

    Returns:
        (best_x, best_score): DataFrame of the selected columns in selection
        order, and the score of that selection. ``best_x`` is a DataFrame even
        when a single column is selected, so callers can rely on ``.columns``.
    """

    def best_addition(selected,candidates):
        # Score every candidate column (together with ``selected`` when given)
        # and return the name and score of the first top-scoring one.
        scores = []
        for name in candidates.columns:
            trial = candidates[[name]]
            if selected is not None:
                # Labels are reset to 0..k-1 for the trial frame, as before.
                trial = pd.concat([selected,trial],axis=1, ignore_index=True)
            scores.append(get_roc_auc_score(trial,y,model))
        top_score = max(scores)
        return candidates.columns[scores.index(top_score)],top_score

    first_feature,best_score = best_addition(None,x)
    # Double brackets keep a one-column DataFrame rather than a Series.
    best_x = x[[first_feature]]
    remaining = x.drop(first_feature,axis=1)

    while len(remaining.columns) > 0:
        feature,score = best_addition(best_x,remaining)
        if score < best_score:
            break  # adding any further column only lowers the score
        best_x = pd.concat([best_x,remaining[feature]],axis=1)
        remaining = remaining.drop(feature,axis=1)
        best_score = score

    return best_x,best_score

In [14]:
def inner_forward_partition(x,y,variable_prm_lst:list,static_prm): # (training data, prediction, variable list, static var)
    """Search ``variable_prm_lst`` for the value giving the best forward-selection score.

    Each candidate ``v`` is evaluated as
    ``forward_selection(x, y, get_model(static_prm, v))``. The candidate list is
    narrowed by repeated halving: the scores at the first, middle and last
    positions are handed to ``left_partition_is_better`` and the chosen half
    (which always contains the middle element) is kept. When two candidates
    remain both are evaluated and the higher-scoring one is returned; the first
    wins ties. Not every candidate is evaluated, so this is a heuristic rather
    than an exhaustive search.

    A single-element list is evaluated directly. An empty list raises
    ``IndexError``.

    Progress is printed along the way and the winner is reported through
    ``show_results``.

    Returns:
        (best_df, best_score, best_variable_prm)
    """

    def run_selection(variable_prm):
        # Forward selection for the (static_prm, variable_prm) pair.
        return forward_selection(x,y,get_model(static_prm,variable_prm))

    def describe_and_split(prms):
        # Print the probe points and return (middle, left half, right half).
        # (len-1)//2 is len//2-1 for even lengths and len//2 for odd ones, so
        # both halves include the middle element.
        middle_index = (len(prms)-1)//2
        print("first is {}".format(prms[0]))
        print("middle is {}".format(prms[middle_index]))
        print("last is {}".format(prms[-1]))
        return prms[middle_index],prms[:middle_index+1],prms[middle_index:]

    candidates = variable_prm_lst

    if len(candidates) == 1:
        # Nothing to partition: halving a one-element list never shrinks it,
        # so evaluate the only candidate and return it.
        only = candidates[0]
        print("\nouter param is {}".format(static_prm))
        print("\ninner variable param list -> {}".format(candidates))
        only_df,only_score = run_selection(only)
        show_results(only_df,only_score,static_prm,only)
        return only_df,only_score,only

    if len(candidates) == 2:
        print("\nfirst param is {}".format(static_prm))
        # Placeholders: with two candidates both are scored in the final step.
        first_score = last_score = 0
    else:
        print("\nouter param is {}".format(static_prm))
        print("\ninner variable param list -> {}".format(candidates))
        middle,left,right = describe_and_split(candidates)
        _,first_score = run_selection(candidates[0])
        _,middle_score = run_selection(middle)
        _,last_score = run_selection(candidates[-1])
        if left_partition_is_better(first_score,middle_score,last_score):
            print("taken left: {}".format(left))
            candidates,last_score = left,middle_score
        else:
            print("taken right: {}".format(right))
            candidates,first_score = right,middle_score
        print("\nout prms from inner first iteration => variable_prm_lst: {} | left score: {} | right score: {}".format(candidates,first_score,last_score))

    # Keep halving; the end-point scores are carried over, so only the new
    # middle has to be evaluated in each round.
    while True:
        print("\ninner variable param list -> {}".format(candidates))
        if len(candidates) == 2:
            break
        middle,left,right = describe_and_split(candidates)
        _,middle_score = run_selection(middle)
        print("left score: {} | middle score:{} | right score: {}".format(first_score,middle_score,last_score))
        if left_partition_is_better(first_score,middle_score,last_score):
            print("taken left: {}".format(left))
            candidates,last_score = left,middle_score
        else:
            print("taken right: {}".format(right))
            candidates,first_score = right,middle_score

    # Two candidates left: evaluate both to obtain their feature frames.
    print("\n---------inner param last iteration---------\n")
    first = candidates[0]
    print("first is {}".format(first))
    last = candidates[-1]
    print("last is {}".format(last))

    first_df,first_score = run_selection(first)
    last_df,last_score = run_selection(last)

    print("left score: {} | right score: {}".format(first_score,last_score))

    if (last_score > first_score):
        print("last score is greater than first score")
        show_results(last_df,last_score,static_prm,last)
        return last_df,last_score,last

    print("first score is greater than last score")
    show_results(first_df,first_score,static_prm,first)
    return first_df,first_score,first

In [15]:
def forward_partition(x,y,K:list,P:list): # (training data, prediction, outer variable list, inner variable list)
    """Search ``K`` for the outer value whose best inner configuration scores highest.

    Every outer candidate ``k`` is evaluated with
    ``inner_forward_partition(x, y, P, k)``. ``K`` is narrowed by repeated
    halving: the scores at the first, middle and last positions are handed to
    ``left_partition_is_better`` and the chosen half (which always contains the
    middle element) is kept. When two candidates remain both are evaluated and
    the higher-scoring one wins; the first wins ties. Not every value of ``K``
    is evaluated, so this is a heuristic rather than an exhaustive search.

    A single-element ``K`` is evaluated directly. An empty ``K`` raises
    ``IndexError``.

    Progress is printed along the way and the final choice is reported through
    ``show_results``.

    Returns:
        [best_df, best_score, best_first_var, best_second_var]
    """

    def run_inner(outer_prm):
        # Separator line, then the inner search for this outer value.
        print("\n**************************")
        return inner_forward_partition(x,y,P,outer_prm)

    def describe_and_split(prms):
        # Print the probe points and return (middle, left half, right half).
        # (len-1)//2 is len//2-1 for even lengths and len//2 for odd ones, so
        # both halves include the middle element.
        middle_index = (len(prms)-1)//2
        print("first is {}".format(prms[0]))
        print("middle is {}".format(prms[middle_index]))
        print("last is {}".format(prms[-1]))
        return prms[middle_index],prms[:middle_index+1],prms[middle_index:]

    candidates = K

    if len(candidates) == 1:
        # Nothing to partition: halving a one-element list never shrinks it,
        # so evaluate the only outer value and return its result.
        only = candidates[0]
        only_df,only_score,only_best_p = run_inner(only)
        show_results(only_df,only_score,only,only_best_p)
        return [only_df,only_score,only,only_best_p]

    if len(candidates) == 2:
        # Placeholders: with two candidates both are scored in the final step.
        first_score = last_score = 0
    else:
        print("outer param list -> {}".format(candidates))
        middle,left,right = describe_and_split(candidates)
        _,first_score,_ = run_inner(candidates[0])
        _,middle_score,_ = run_inner(middle)
        _,last_score,_ = run_inner(candidates[-1])
        print("left score: {} | middle score:{} | right score: {}".format(first_score,middle_score,last_score))
        if left_partition_is_better(first_score,middle_score,last_score):
            print("taken left: {}".format(left))
            candidates,last_score = left,middle_score
        else:
            print("taken right: {}".format(right))
            candidates,first_score = right,middle_score
        print("\nout prms from outer first iteration => lst: {} | left score: {} | right score: {}".format(candidates,first_score,last_score))

    # Keep halving; the end-point scores are carried over, so only the new
    # middle has to be evaluated in each round.
    while len(candidates) != 2:
        print("first param list -> {}".format(candidates))
        middle,left,right = describe_and_split(candidates)
        _,middle_score,_ = run_inner(middle)
        print("left score: {} | middle score:{} | right score: {}".format(first_score,middle_score,last_score))
        if left_partition_is_better(first_score,middle_score,last_score):
            print("taken left: {}".format(left))
            candidates,last_score = left,middle_score
        else:
            print("taken right: {}".format(right))
            candidates,first_score = right,middle_score

    # Two candidates left: evaluate both to obtain their feature frames and
    # best inner parameters.
    print("\n---------outer param last iteration---------\n")
    first = candidates[0]
    print("first is {}".format(first))
    last = candidates[-1]
    print("last is {}".format(last))
    first_df,first_score,first_best_p = run_inner(first)
    last_df,last_score,last_best_p = run_inner(last)
    print("")

    if (last_score > first_score):
        print("last score is greater than first score")
        best = [last_df,last_score,last,last_best_p]
    else:
        print("first score is greater than last score")
        best = [first_df,first_score,first,first_best_p]

    show_results(best[0],best[1],best[2],best[3])
    return best

In [29]:
def guarantee_results(x,y,K:list,P:list):
    """Evaluate every (first, second) parameter pair and report the best one.

    For each value in ``K`` and each value in ``P`` a model is built with
    ``get_model`` and scored with ``forward_selection``. The best pair per
    value of ``K`` is kept, then the best of those is chosen;
    ``get_best_score_index`` picks the winner at both levels. Every
    evaluation and the final choice are printed.

    Returns:
        [best_df, first_param, second_param, best_score]
    """
    separator = "----------------------------\n"
    winners_per_first = []

    for first_prm in K:
        print(separator)
        print("First param is {}\n".format(first_prm))
        trials = []
        for second_prm in P:
            print(separator)
            print("Second param is {}".format(second_prm))
            selected,score = forward_selection(x,y,get_model(first_prm,second_prm))
            print("ROC AUC = {}".format(score))
            print("\nFeatures\n")
            for rank,feature in enumerate(selected.columns, start=1):
                print("{}. {}".format(rank,feature))
            print("")
            trials.append([selected,first_prm,second_prm,score])

        # Best second parameter for this first parameter.
        trial_scores = [score for [_df,_first,_second,score] in trials]
        winners_per_first.append(list(trials[get_best_score_index(trial_scores)]))

    # Best configuration over all first parameters.
    winner_scores = [score for [_df,_first,_second,score] in winners_per_first]
    best_df,best_first,best_second,best_score = winners_per_first[get_best_score_index(winner_scores)]

    print("\n------------ Final results ----------------")
    print("Best model when first param = {} ,second param = {} ,ROC AUC = {}".format(best_first,best_second,best_score))

    print("\nFeatures\n")
    for rank,feature in enumerate(best_df.columns, start=1):
        print("{}. {}".format(rank,feature))

    return [best_df,best_first,best_second,best_score]

In [24]:
def for_TUKEY_TEST(x,y,f,s):
    """Return the per-fold ROC AUC scores of the model built from ``f`` and ``s``.

    The model comes from ``get_model(f, s)`` and is evaluated with
    ``cross_validate`` (``cv=10``, ``scoring='roc_auc'``). The ``'test_score'``
    values are printed and returned as a plain list.
    """
    fold_scores = cross_validate(get_model(f,s), x, y, cv=10, scoring='roc_auc')['test_score']
    print(fold_scores)
    return list(fold_scores)

# Data

In [18]:
# "m_name" is dropped before the split; of the remaining columns the last one
# becomes y and all the others become x.
unnecesary_features=["m_name"]
x,y=get_data_and_true_prediction(mixed_df,unnecesary_features)

In [19]:
# Upper bounds (inclusive) of the two parameter grids.
max_k = 30
max_p = 20

# Candidate values 1..max_k and 1..max_p.
K = list(range(1, max_k + 1))
P = list(range(1, max_p + 1))

`p` is the exponent of the Minkowski metric (the second parameter passed to `get_model`):

- p = 1: Manhattan distance
- p = 2: Euclidean distance
- p → ∞: Chebyshev distance

In [20]:
# Time the partition-based search over K and P. best_data receives the list
# returned by forward_partition: [best_df, best_score, best_first_var, best_second_var].
start = time.time()
best_data = forward_partition(x,y,K,P)
end = time.time()

# Elapsed time formatted as HH:MM:SS; as the last expression it is the cell's output.
time.strftime('%H:%M:%S', time.gmtime(end-start))

outer param list -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
first is 1
middle is 15
last is 30

**************************

outer param is 1

inner variable param list -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
first is 1
middle is 10
last is 20
taken right: [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

out prms from inner first iteration => variable_prm_lst: [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] | left score: 0.7711413530465949 | right score: 0.7711413530465949

inner variable param list -> [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
first is 10
middle is 15
last is 20
left score: 0.7711413530465949 | middle score:0.7711413530465949 | right score: 0.7711413530465949
taken left: [10, 11, 12, 13, 14, 15]

inner variable param list -> [10, 11, 12, 13, 14, 15]
first is 10
middle is 12
last is 15
left score: 0.7711413530465949 | middle score:0.7711413530465949 | right score: 0.77

left score: 0.8856597790000496 | middle score:0.8852609792833194 | right score: 0.885623544422972
taken left: [1, 2, 3]

inner variable param list -> [1, 2, 3]
first is 1
middle is 2
last is 3
left score: 0.8856597790000496 | middle score:0.8847719544373419 | right score: 0.8852609792833194
taken left: [1, 2]

inner variable param list -> [1, 2]

---------inner param last iteration---------

first is 1
last is 2
left score: 0.8856597790000496 | right score: 0.8847719544373419
first score is greater than last score

------------Results-------------

Best model when first param is 18 and second param is 1
Roc auc = 0.8856597790000496

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. m_logp
6. n_atoms_stereo_centers
7. n_aliphatic_heterocycles
8. n_saturated_rings
9. n_briged_head_atoms
10. fraction_CSP3
11. n_saturated_heterocycles
12. n_HBA
13. n_saturated_carbocycles

left score: 0.8839001696108552 | middle score:0.8856597790000496 | right score: 0.884841249194787
t

'00:29:27'

In [25]:
# Per-fold ROC AUC scores for the configuration found by forward_partition:
# best_data[0] is the selected-feature frame, best_data[2] the first parameter
# and best_data[-1] the second parameter.
fp_rslts = for_TUKEY_TEST(best_data[0],y,best_data[2],best_data[-1])

[0.87698413 0.9000496  0.87301587 0.88541667 0.88785282 0.89642137
 0.84781478 0.88007284 0.93834547 0.87539022]


In [30]:
# Time the exhaustive search over every (K, P) pair. g_data receives the list
# returned by guarantee_results: [best_df, first_param, second_param, best_score].
start = time.time()
g_data = guarantee_results(x,y,K,P)
end = time.time()

# Elapsed time formatted as HH:MM:SS; as the last expression it is the cell's output.
time.strftime('%H:%M:%S', time.gmtime(end-start))

----------------------------

First param is 1

----------------------------

Second param is 1
ROC AUC = 0.7602166538658475

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_aliphatic_carbocycles
4. n_O
5. n_saturated_rings
6. n_amide_bonds
7. n_briged_head_atoms

----------------------------

Second param is 2
ROC AUC = 0.7577596966205837

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_aliphatic_carbocycles
4. n_O
5. n_saturated_rings
6. n_amide_bonds
7. n_briged_head_atoms

----------------------------

Second param is 3
ROC AUC = 0.7711413530465949

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_aliphatic_carbocycles
4. n_O
5. n_saturated_rings
6. n_amide_bonds
7. n_briged_head_atoms
8. n_atoms_without_Hydrogen
9. n_aliphatic_rings
10. n_saturated_carbocycles

----------------------------

Second param is 4
ROC AUC = 0.7711413530465949

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_aliphatic_carbocycles
4. n_O
5. n_saturated_ri

ROC AUC = 0.8289753253885669

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_atoms_without_Hydrogen
4. fraction_CSP3
5. n_hetero_cycles
6. n_saturated_heterocycles
7. n_aromatic_rings
8. n_HOH
9. n_aromatic_carbocycles
10. n_aromatic_heterocycles

----------------------------

Second param is 10
ROC AUC = 0.8372956643418729

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_atoms_without_Hydrogen
4. fraction_CSP3
5. n_hetero_cycles
6. n_saturated_heterocycles
7. n_strict_rotable_bonds
8. n_O
9. n_amide_bonds
10. n_aromatic_carbocycles
11. n_briged_head_atoms

----------------------------

Second param is 11
ROC AUC = 0.8359623144913535

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_atoms_without_Hydrogen
4. fraction_CSP3
5. n_hetero_cycles
6. n_saturated_heterocycles
7. n_strict_rotable_bonds
8. n_O
9. n_amide_bonds
10. n_rings
11. n_aromatic_carbocycles

----------------------------

Second param is 12
ROC AUC = 0.8358639469055053

Features

1. 

ROC AUC = 0.8706261845919429

Features

1. n_O
2. n_aliphatic_heterocycles
3. n_strict_rotable_bonds
4. n_atoms_stereo_centers
5. n_HOH
6. n_hetero_cycles
7. n_HBA
8. n_briged_head_atoms
9. n_saturated_heterocycles
10. fraction_CSP3

----------------------------

Second param is 18
ROC AUC = 0.8692909507498803

Features

1. n_O
2. n_aliphatic_heterocycles
3. n_strict_rotable_bonds
4. n_atoms_stereo_centers
5. n_HOH
6. n_hetero_cycles
7. n_HBA
8. n_briged_head_atoms
9. fraction_CSP3
10. n_saturated_heterocycles

----------------------------

Second param is 19
ROC AUC = 0.8728758769593512

Features

1. n_O
2. n_aliphatic_heterocycles
3. n_strict_rotable_bonds
4. n_atoms_stereo_centers
5. n_HOH
6. n_hetero_cycles
7. n_HBA
8. n_briged_head_atoms
9. fraction_CSP3
10. n_saturated_heterocycles
11. n_amide_bonds
12. n_HBD

----------------------------

Second param is 20
ROC AUC = 0.8720582246296023

Features

1. n_O
2. n_aliphatic_heterocycles
3. n_strict_rotable_bonds
4. n_atoms_stereo_cent

ROC AUC = 0.8929529941529161

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_aromatic_carbocycles
4. fraction_CSP3
5. n_saturated_carbocycles
6. n_HOH
7. n_O
8. n_briged_head_atoms
9. n_non_strict_rotable_bonds
10. n_amide_bonds
11. n_aliphatic_rings
12. n_HBA
13. n_hetero_cycles

----------------------------

Second param is 5
ROC AUC = 0.8929529941529161

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_aromatic_carbocycles
4. fraction_CSP3
5. n_saturated_carbocycles
6. n_HOH
7. n_O
8. n_briged_head_atoms
9. n_non_strict_rotable_bonds
10. n_hetero_cycles
11. n_HBA
12. n_aliphatic_rings
13. n_amide_bonds

----------------------------

Second param is 6
ROC AUC = 0.8929529941529161

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_aromatic_carbocycles
4. fraction_CSP3
5. n_saturated_carbocycles
6. n_HOH
7. n_O
8. n_briged_head_atoms
9. n_non_strict_rotable_bonds
10. n_amide_bonds
11. n_aliphatic_rings
12. n_HBA
13. n_hetero_cycles

----------------

ROC AUC = 0.8784497433642204

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. m_logp
4. n_HBD
5. n_O
6. n_amide_bonds
7. n_aromatic_carbocycles
8. n_hetero_cycles

----------------------------

Second param is 12
ROC AUC = 0.8791191152362783

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. m_logp
4. n_HBD
5. n_O
6. n_amide_bonds
7. n_aromatic_carbocycles
8. n_hetero_cycles

----------------------------

Second param is 13
ROC AUC = 0.8792304513940505

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. m_logp
4. n_HBD
5. n_aromatic_carbocycles
6. n_O
7. n_hetero_cycles
8. n_amide_bonds

----------------------------

Second param is 14
ROC AUC = 0.8791296449424376

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. m_logp
4. n_HBD
5. n_O
6. n_amide_bonds
7. n_aromatic_carbocycles
8. n_hetero_cycles

----------------------------

Second param is 15
ROC AUC = 0.8787413104116084

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. m_logp
4. n_HBD

ROC AUC = 0.8821499414672216

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. m_logp
5. n_HBD
6. n_amide_bonds
7. n_saturated_rings
8. n_briged_head_atoms
9. n_atoms_stereo_centers

----------------------------

Second param is 2
ROC AUC = 0.8754704559156302

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6. n_HBD
7. n_aromatic_carbocycles
8. n_HOH

----------------------------

Second param is 3
ROC AUC = 0.8816012999025487

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. m_logp
5. n_atoms_stereo_centers
6. n_HBD
7. n_hetero_cycles
8. n_briged_head_atoms
9. n_saturated_heterocycles
10. n_aliphatic_rings

----------------------------

Second param is 4
ROC AUC = 0.8830331850915878

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. m_logp
5. n_atoms_stereo_centers
6. n_HBD
7. n_aliphatic_rings
8. n_saturated_heterocycles
9. n_briged_head_atoms
10. n_amide_bonds

--------

ROC AUC = 0.8874210659159607

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. m_logp
5. n_HBD
6. n_atoms_stereo_centers
7. n_briged_head_atoms
8. n_hetero_cycles
9. n_aromatic_carbocycles
10. n_amide_bonds
11. n_saturated_heterocycles

----------------------------

Second param is 13
ROC AUC = 0.8874543583692581

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. m_logp
5. n_HBD
6. n_atoms_stereo_centers
7. n_briged_head_atoms
8. n_hetero_cycles
9. n_aromatic_carbocycles
10. n_amide_bonds
11. n_saturated_heterocycles

----------------------------

Second param is 14
ROC AUC = 0.8869165433039328

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. m_logp
5. n_HBD
6. n_atoms_stereo_centers
7. n_briged_head_atoms
8. n_hetero_cycles
9. n_aromatic_carbocycles
10. n_amide_bonds
11. n_saturated_heterocycles

----------------------------

Second param is 15
ROC AUC = 0.8864226665345292

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_

ROC AUC = 0.8979427070429281

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. n_HBD
5. n_non_strict_rotable_bonds
6. n_atoms_stereo_centers
7. n_hetero_cycles
8. n_amide_bonds
9. n_briged_head_atoms
10. n_aromatic_carbocycles
11. fraction_CSP3

----------------------------

Second param is 20
ROC AUC = 0.8984061560585038

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. n_HBD
5. n_non_strict_rotable_bonds
6. n_atoms_stereo_centers
7. n_hetero_cycles
8. n_amide_bonds
9. n_briged_head_atoms
10. n_aromatic_carbocycles
11. fraction_CSP3

----------------------------

First param is 11

----------------------------

Second param is 1
ROC AUC = 0.8836964275391045

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. n_aromatic_carbocycles
5. m_logp
6. n_HBD
7. n_saturated_heterocycles
8. n_atoms_stereo_centers
9. n_aliphatic_rings
10. n_briged_head_atoms

----------------------------

Second param is 2
ROC AUC = 0.879023031667575

Features

1. 

ROC AUC = 0.8884331177840543

Features

1. n_HBD
2. n_hetero_atoms
3. n_aliphatic_heterocycles
4. n_aromatic_carbocycles
5. n_strict_rotable_bonds
6. n_amide_bonds
7. n_briged_head_atoms
8. n_atoms_stereo_centers
9. n_hetero_cycles
10. n_O
11. n_aromatic_rings

----------------------------

Second param is 13
ROC AUC = 0.8884331177840543

Features

1. n_HBD
2. n_hetero_atoms
3. n_aliphatic_heterocycles
4. n_aromatic_carbocycles
5. n_strict_rotable_bonds
6. n_amide_bonds
7. n_briged_head_atoms
8. n_atoms_stereo_centers
9. n_hetero_cycles
10. n_O
11. n_aromatic_rings

----------------------------

Second param is 14
ROC AUC = 0.8884331177840543

Features

1. n_HBD
2. n_hetero_atoms
3. n_aliphatic_heterocycles
4. n_aromatic_carbocycles
5. n_strict_rotable_bonds
6. n_amide_bonds
7. n_briged_head_atoms
8. n_atoms_stereo_centers
9. n_hetero_cycles
10. n_O
11. n_aromatic_rings

----------------------------

Second param is 15
ROC AUC = 0.8884331177840543

Features

1. n_HBD
2. n_hetero_atoms


ROC AUC = 0.8926195018416662

Features

1. n_HBD
2. n_hetero_atoms
3. n_aliphatic_heterocycles
4. n_non_strict_rotable_bonds
5. n_amide_bonds
6. n_atoms_stereo_centers
7. n_briged_head_atoms
8. n_aromatic_carbocycles
9. n_hetero_cycles
10. n_O
11. fraction_CSP3

----------------------------

Second param is 20
ROC AUC = 0.8924978940587682

Features

1. n_HBD
2. n_hetero_atoms
3. n_aliphatic_heterocycles
4. n_non_strict_rotable_bonds
5. n_amide_bonds
6. n_atoms_stereo_centers
7. n_briged_head_atoms
8. n_aromatic_carbocycles
9. n_hetero_cycles
10. n_O
11. n_saturated_carbocycles

----------------------------

First param is 14

----------------------------

Second param is 1
ROC AUC = 0.8859599014336919

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6. n_HBA
7. n_aromatic_carbocycles
8. m_logp
9. fraction_CSP3
10. n_saturated_heterocycles
11. n_atoms_stereo_centers
12. n_aliphatic_rings
13. n_saturated_rings

---------------------

ROC AUC = 0.8838014019994054

Features

1. n_HBD
2. n_aliphatic_rings
3. m_logp
4. n_O
5. n_aliphatic_heterocycles
6. n_briged_head_atoms
7. n_atoms_stereo_centers
8. n_HBA
9. n_hetero_cycles
10. n_saturated_carbocycles

----------------------------

Second param is 8
ROC AUC = 0.8839530375105298

Features

1. n_HBD
2. n_aliphatic_rings
3. m_logp
4. n_O
5. n_aliphatic_heterocycles
6. n_briged_head_atoms
7. n_atoms_stereo_centers
8. n_hetero_cycles
9. n_HBA
10. n_saturated_carbocycles

----------------------------

Second param is 9
ROC AUC = 0.8840530439109393

Features

1. n_HBD
2. n_aliphatic_rings
3. m_logp
4. n_O
5. n_aliphatic_heterocycles
6. n_atoms_stereo_centers
7. n_briged_head_atoms
8. n_HBA
9. n_hetero_cycles
10. n_saturated_carbocycles

----------------------------

Second param is 10
ROC AUC = 0.8837107639405051

Features

1. n_HBD
2. n_aliphatic_rings
3. m_logp
4. n_O
5. n_aliphatic_heterocycles
6. n_atoms_stereo_centers
7. n_briged_head_atoms
8. n_HBA
9. n_hetero_cycles


ROC AUC = 0.8832492118205242

Features

1. n_HBD
2. n_aliphatic_rings
3. n_HBA
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_amide_bonds
7. n_strict_rotable_bonds
8. n_saturated_carbocycles
9. fraction_CSP3
10. n_aliphatic_carbocycles

----------------------------

Second param is 19
ROC AUC = 0.8854806836669805

Features

1. n_HBD
2. n_aliphatic_rings
3. n_HBA
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_amide_bonds
7. n_strict_rotable_bonds
8. n_saturated_carbocycles
9. fraction_CSP3

----------------------------

Second param is 20
ROC AUC = 0.8862376095295905

Features

1. n_HBD
2. n_aliphatic_rings
3. n_HBA
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_amide_bonds
7. n_strict_rotable_bonds
8. n_saturated_carbocycles
9. fraction_CSP3

----------------------------

First param is 17

----------------------------

Second param is 1
ROC AUC = 0.8849393457955502

Features

1. n_HBD
2. n_aliphatic_rings
3. n_HBA
4. n_hetero_cycles
5. n_primary_carbon_atoms
6. n_aliphatic_heterocycle

ROC AUC = 0.8853894133095486

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rotable_bonds
7. n_saturated_carbocycles
8. fraction_CSP3
9. n_amide_bonds
10. n_aliphatic_carbocycles

----------------------------

Second param is 9
ROC AUC = 0.8852633794369291

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rotable_bonds
7. n_saturated_carbocycles
8. fraction_CSP3
9. n_amide_bonds
10. n_aliphatic_carbocycles

----------------------------

Second param is 10
ROC AUC = 0.8852129762111225

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rotable_bonds
7. n_saturated_carbocycles
8. fraction_CSP3
9. n_amide_bonds
10. n_aliphatic_carbocycles

----------------------------

Second param is 11
ROC AUC = 0.884748365830897

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rot

ROC AUC = 0.88413772029797

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rotable_bonds
7. n_aliphatic_carbocycles
8. n_amide_bonds
9. n_aromatic_heterocycles

----------------------------

Second param is 19
ROC AUC = 0.88413772029797

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rotable_bonds
7. n_aliphatic_carbocycles
8. n_amide_bonds
9. n_aromatic_heterocycles

----------------------------

Second param is 20
ROC AUC = 0.8850974591406107

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rotable_bonds
7. n_aliphatic_carbocycles
8. n_amide_bonds
9. fraction_CSP3

----------------------------

First param is 20

----------------------------

Second param is 1
ROC AUC = 0.8825763558545168

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. m_logp
6. n_atoms_stereo_centers
7. n_aliphatic_heterocyc

ROC AUC = 0.8704342368234148

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6. n_HBD
7. n_aromatic_carbocycles

----------------------------

Second param is 11
ROC AUC = 0.8704342368234148

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6. n_HBD
7. n_aromatic_carbocycles

----------------------------

Second param is 12
ROC AUC = 0.8781105216540311

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. fraction_CSP3
5. n_amide_bonds
6. n_atoms_stereo_centers
7. n_briged_head_atoms
8. n_HBD
9. n_saturated_heterocycles

----------------------------

Second param is 13
ROC AUC = 0.8704342368234148

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6. n_HBD
7. n_aromatic_carbocycles

----------------------------

Second param is 14
ROC AUC = 0.8826155970756322

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_

ROC AUC = 0.8841155511372083

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rotable_bonds
7. n_aromatic_heterocycles
8. n_aliphatic_carbocycles
9. n_amide_bonds
10. n_saturated_carbocycles
11. n_aliphatic_heterocycles
12. fraction_CSP3

----------------------------

Second param is 19
ROC AUC = 0.8841830909436268

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rotable_bonds
7. n_aromatic_heterocycles
8. n_aliphatic_carbocycles
9. n_amide_bonds
10. n_saturated_carbocycles
11. n_aliphatic_heterocycles
12. fraction_CSP3

----------------------------

Second param is 20
ROC AUC = 0.884841249194787

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_non_strict_rotable_bonds
7. n_aromatic_heterocycles
8. n_aliphatic_carbocycles
9. n_amide_bonds
10. n_saturated_carbocycles
11. n_aliphatic_heterocycles
12. fraction_CSP3

---------------

ROC AUC = 0.8790849711155708

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. m_logp
5. n_aromatic_carbocycles
6. n_atoms_stereo_centers
7. n_briged_head_atoms

----------------------------

Second param is 2
ROC AUC = 0.875062933059974

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. fraction_CSP3
5. n_amide_bonds
6. n_saturated_carbocycles
7. n_briged_head_atoms

----------------------------

Second param is 3
ROC AUC = 0.8745291569628199

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. fraction_CSP3
5. n_atoms_stereo_centers
6. n_amide_bonds
7. n_aromatic_carbocycles
8. n_briged_head_atoms
9. n_saturated_carbocycles

----------------------------

Second param is 4
ROC AUC = 0.8746881477833937

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. fraction_CSP3
5. n_atoms_stereo_centers
6. n_amide_bonds
7. n_aromatic_carbocycles
8. n_briged_head_atoms

----------------------------

Second param is 5
ROC AUC = 0.8747088071701106

Features

1. n_HOH
2. n_a

ROC AUC = 0.8796639630097618

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. fraction_CSP3
5. n_atoms_stereo_centers
6. n_briged_head_atoms
7. n_amide_bonds
8. n_hetero_cycles
9. n_HBD
10. n_non_strict_rotable_bonds
11. n_aliphatic_carbocycles
12. n_saturated_carbocycles

----------------------------

Second param is 12
ROC AUC = 0.8795623306988422

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. fraction_CSP3
5. n_atoms_stereo_centers
6. n_briged_head_atoms
7. n_HBD
8. n_amide_bonds
9. n_hetero_cycles
10. n_non_strict_rotable_bonds
11. n_aliphatic_carbocycles
12. n_saturated_carbocycles

----------------------------

Second param is 13
ROC AUC = 0.8790279223031565

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. fraction_CSP3
5. n_atoms_stereo_centers
6. n_briged_head_atoms
7. n_HBD
8. n_amide_bonds
9. n_hetero_cycles
10. n_non_strict_rotable_bonds

----------------------------

Second param is 14
ROC AUC = 0.8728573080290042

Features

1. n_HOH
2. n_alipha

ROC AUC = 0.8850566049130372

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_aromatic_carbocycles
7. n_non_strict_rotable_bonds
8. n_aliphatic_heterocycles
9. n_amide_bonds
10. n_saturated_carbocycles
11. fraction_CSP3
12. n_aliphatic_carbocycles

----------------------------

Second param is 3
ROC AUC = 0.8817840341781874

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_aromatic_carbocycles
7. n_saturated_carbocycles
8. n_amide_bonds
9. n_non_strict_rotable_bonds
10. n_aliphatic_heterocycles
11. fraction_CSP3
12. n_aliphatic_carbocycles

----------------------------

Second param is 4
ROC AUC = 0.8809262115356029

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_aromatic_carbocycles
7. n_saturated_carbocycles
8. n_amide_bonds
9. n_non_strict_rotable_bonds
10. n_aliphatic_heterocycles
11. fraction_CSP3

----------------------------

Second param is 5

ROC AUC = 0.8801745892382276

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_saturated_carbocycles
7. n_aromatic_carbocycles
8. n_amide_bonds
9. n_non_strict_rotable_bonds
10. n_aliphatic_heterocycles
11. fraction_CSP3
12. n_aliphatic_carbocycles

----------------------------

Second param is 7
ROC AUC = 0.880075382889021

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_saturated_carbocycles
7. n_aromatic_carbocycles
8. n_amide_bonds
9. n_non_strict_rotable_bonds
10. n_aliphatic_heterocycles
11. fraction_CSP3
12. n_aliphatic_carbocycles

----------------------------

Second param is 8
ROC AUC = 0.8801497876509258

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_saturated_carbocycles
7. n_aromatic_carbocycles
8. n_amide_bonds
9. fraction_CSP3
10. n_non_strict_rotable_bonds
11. n_aliphatic_heterocycles
12. n_aliphatic_carbocycles

--------------------

ROC AUC = 0.8791827193069389

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_aromatic_carbocycles
7. n_saturated_carbocycles
8. n_amide_bonds
9. n_non_strict_rotable_bonds
10. fraction_CSP3
11. n_aliphatic_heterocycles
12. n_aliphatic_carbocycles

----------------------------

Second param is 12
ROC AUC = 0.8791672473489915

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_aromatic_carbocycles
7. n_saturated_carbocycles
8. n_amide_bonds
9. n_non_strict_rotable_bonds
10. fraction_CSP3
11. n_aliphatic_heterocycles
12. n_aliphatic_carbocycles

----------------------------

Second param is 13
ROC AUC = 0.8787550532266323

Features

1. n_HBD
2. n_aliphatic_rings
3. n_O
4. n_hetero_cycles
5. n_briged_head_atoms
6. n_aromatic_carbocycles
7. n_saturated_carbocycles
8. n_amide_bonds
9. n_non_strict_rotable_bonds
10. fraction_CSP3
11. n_aliphatic_heterocycles
12. n_aliphatic_carbocycles

-----------------

'04:07:20'

# Comparison

In [31]:
# Collect the scores for the "normal way" feature set (g_data) so they can be
# printed next to the forward-partition scores (fp_rslts) in the following cell.
# NOTE(review): for_TUKEY_TEST is defined outside this view; judging by the
# captured output it prints an array of 10 scores (presumably one per
# cross-validation fold) and returns it — confirm against its definition.
# NOTE(review): g_data[1] / g_data[2] are presumably the two tuned parameters
# that belong to the g_data[0] feature frame — TODO confirm.
g_rslts = for_TUKEY_TEST(g_data[0],y,g_data[1],g_data[2])

[0.87227183 0.89781746 0.91865079 0.90203373 0.90700605 0.90675403
 0.85197711 0.90166493 0.94120708 0.88865765]


In [32]:
# Show the two score series side by side, one row per paired entry.
# zip() stops at the shorter sequence, so any unmatched trailing score is omitted.
print("forward partiton | normal way")
for fp, g in zip(fp_rslts, g_rslts):
    print(f"{fp} | {g}")

forward partiton | normal way
0.876984126984127 | 0.8722718253968254
0.900049603174603 | 0.8978174603174603
0.873015873015873 | 0.9186507936507937
0.8854166666666667 | 0.9020337301587301
0.8878528225806451 | 0.9070060483870968
0.896421370967742 | 0.9067540322580645
0.8478147762747139 | 0.8519771071800208
0.8800728407908429 | 0.9016649323621229
0.9383454734651405 | 0.9412070759625389
0.8753902185223724 | 0.8886576482830384


# Selected features

In [33]:
# Feature columns selected by the nested binary search approach.
# NOTE(review): assumes best_data[0] is the DataFrame holding the selected
# features — confirm where best_data is built.
best_data[0].columns

Index(['n_HBD', 'n_aliphatic_rings', 'n_O', 'n_hetero_cycles',
       'n_briged_head_atoms', 'n_non_strict_rotable_bonds',
       'n_aliphatic_carbocycles', 'n_amide_bonds', 'fraction_CSP3'],
      dtype='object')

In [34]:
# Feature columns selected the "normal way" (the baseline the nested binary
# search is compared against in the Comparison section).
# NOTE(review): assumes g_data[0] is the DataFrame holding the selected
# features — confirm where g_data is built.
g_data[0].columns

Index(['n_hetero_atoms', 'n_aliphatic_heterocycles', 'n_O', 'n_HBD',
       'n_non_strict_rotable_bonds', 'n_atoms_stereo_centers',
       'n_hetero_cycles', 'n_amide_bonds', 'n_briged_head_atoms',
       'n_aromatic_carbocycles', 'fraction_CSP3'],
      dtype='object')