# Basic imports

In [22]:
import pandas as pd
import os
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

In [23]:
current_path = os.getcwd()
root_path=current_path.replace('\\forward_feature_selection','')

In [27]:
mixed_df=pd.read_csv(root_path+"\\molecules.csv",sep="\t")

# Functions

In [28]:
def save_df_to_disk(df,name:str,separator="\t"):
    df.to_csv(name,sep=separator,index = False, header=True)

In [29]:
def get_data_and_true_prediction(df,not_wanted_features:list):
        temp_df=df.drop(not_wanted_features,axis=1)
        y=temp_df[temp_df.columns[-1]]
        x=temp_df.drop([temp_df.columns[-1]],axis=1)
        
        return x,y   

In [30]:
def get_df_with_name_and_prediction(df,true_prediction,big_df):
    new_df=df
    new_df.insert(0,"m_name",big_df["m_name"].values)
    new_df=new_df.join(true_prediction)
    
    return new_df

# Forward selection

In [31]:
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier

In [32]:
def get_roc_auc_score(x,y,model): # gets roc auc average
        cv_results = cross_validate(model, x, y, cv=10,scoring=('roc_auc'))
        roc_auc_avrg=cv_results['test_score'].mean()
        
        return roc_auc_avrg 

In [33]:
def forward_selection(x,y,model): # O(n) worst case scenario, where n depends on len(x.columns)
    
    def first_iteration(x,y,model):
        score_lst=[]
        for i in range(len(x.columns)):
            k=x.columns[i]
            temp_x=x[[k]]
            score=get_roc_auc_score(temp_x,y,model)
            score_lst.append(score)

        max_score = max(score_lst) # best score
        max_score_index=[i for i, j in enumerate(score_lst) if j == max_score] # indx with best score 
        new_features=[x.columns[i] for i in max_score_index]
        top_new_features=new_features[0]
        best_x=x[top_new_features]
        new_x=x.drop(top_new_features,axis=1)
        
        return best_x,new_x,max_score
    
    def else_iteration(best_x,x,y,model,actual_score):     
        new_x_lenght = len(x.columns)
        if (new_x_lenght > 0):
            score_lst=[]
            for i in range(new_x_lenght):
                k=x.columns[i]
                temp_x=x[[k]]
                temp_new_x=pd.concat([best_x,temp_x],axis=1, ignore_index=True)
                score=get_roc_auc_score(temp_new_x,y,model)
                score_lst.append(score)

            new_max_score = max(score_lst) # best score
            actual_best_score = actual_score # score passed from parameters

            if(new_max_score<actual_best_score):
                return best_x,actual_best_score # break condition, recursive function

            max_score_index=[i for i, j in enumerate(score_lst) if j == new_max_score] # indx with best score 

            new_features=[x.columns[i] for i in max_score_index]
            top_new_features=new_features[0]
            temp_x=x[top_new_features]
            best_x=pd.concat([best_x,temp_x],axis=1)
            new_x=x.drop(top_new_features,axis=1)

            return else_iteration(best_x,new_x,y,model,new_max_score)
        
        return best_x,actual_score
    
    best_x,new_x,score=first_iteration(x,y,model)
    best_x,score=else_iteration(best_x,new_x,y,model,score)
    
    return best_x,score

In [34]:
def forward_partition_KNN(x,y,lst:list,other,mt='minkowski'): # list must be greater than len 2    
    print("\nfirst param list len is {}".format(len(lst)))
    print("first param list -> {}".format(lst))
    
    if (len(lst) == 2):
        first = lst[0]
        print("first is {}".format(first))
        last = lst[1]
        print("last is {}".format(last))
        clf_first = KNeighborsClassifier(n_neighbors=first,p=other,metric=mt)
        first_df,first_score = forward_selection(x,y,clf_first)
        clf_last = KNeighborsClassifier(n_neighbors=last,p=other,metric=mt)
        last_df,last_score = forward_selection(x,y,clf_last)
        
        if (last_score > first_score):
            print("last score is greater than first score")
            return last_df,last,last_score
        
        print("first score is greater than last score")
        return first_df,first,first_score
    
    elif(len(lst) == 3):
        first = lst[0]
        print("first is {}".format(first))
        middle = lst[1]
        print("middle is {}".format(middle))
        last = lst[-1]
        print("last is {}".format(last))
        clf_first = KNeighborsClassifier(n_neighbors=first,p=other,metric=mt)
        _,first_score = forward_selection(x,y,clf_first)       
        clf_middle = KNeighborsClassifier(n_neighbors=middle,p=other,metric=mt)
        _,middle_score = forward_selection(x,y,clf_middle)
        clf_last = KNeighborsClassifier(n_neighbors=last,p=other,metric=mt)
        _,last_score = forward_selection(x,y,clf_last)
        
        if ((first_score > middle_score) and (middle_score > last_score)):
            print("sub list taken: left")
            return forward_partition_KNN(x,y,lst[:2],other)  
        
        print("sub list taken: right")            
        return forward_partition_KNN(x,y,lst[1:],other)
    
    else:          
        first = lst[0]
        print("first is {}".format(first))
        middle = lst[(len(lst)//2)-1]
        print("middle is {}".format(middle))
        last = lst[-1]
        print("last is {}".format(last))
        clf_first = KNeighborsClassifier(n_neighbors=first,p=other,metric=mt)
        _,first_score = forward_selection(x,y,clf_first)       
        clf_middle = KNeighborsClassifier(n_neighbors=middle,p=other,metric=mt)
        _,middle_score = forward_selection(x,y,clf_middle)
        clf_last = KNeighborsClassifier(n_neighbors=last,p=other,metric=mt)
        _,last_score = forward_selection(x,y,clf_last)
        
        if ((first_score > middle_score) and (middle_score > last_score)):
            print("sub list taken: left")
            return forward_partition_KNN(x,y,lst[:(len(lst)//2)],other)
        
        print("sub list taken: right")            
        return forward_partition_KNN(x,y,lst[(len(lst)//2)-1:],other)    

In [35]:
def best_KNN(x,y,K:list,P:list):
    
    def get_best_score_index(score_lst:list):
        max_score = max(score_lst) # best score
        max_score_index=[i for i, j in enumerate(score_lst) if j == max_score] # indx with best score 
        
        return max_score_index[0]
    
    best_prm_lst = []
    for i in P:
        print("Seccond parameter is: {}".format(i))
        best_df,best_k,best_score = forward_partition_KNN(x,y,K,i)      
        print("\nBest first parameter is: {}".format(best_k))
        print("ROC AUC = {}".format(best_score))
        print("\nFeatures\n")
        for c,i in enumerate(best_df.columns):
            print("{}. {}".format(c+1,i))
        best_prm_lst.append([best_df,best_k,i,best_score])
        print("")
        
    score_lst = [d  for [a,b,c,d] in best_prm_lst]
    index = get_best_score_index(score_lst)
    print("Final results")
    print("Best model when first param = {} ,second param = {} ,ROC AUC = {}".format(best_prm_lst[index][1],
                                                                 best_prm_lst[index][2],
                                                                 best_prm_lst[index][3]))
    return best_prm_lst[index][0]

In [48]:
def guarantee_results(x,y,K:list,P:list,mt='minkowski'):
    
    def get_best_score_index(score_lst:list):
        max_score = max(score_lst) # best score
        max_score_index=[i for i, j in enumerate(score_lst) if j == max_score] # indx with best score 
        
        return max_score_index[0]
    
    best_outer_prm_lst = []
    for i in K:
        print("First param is {}\n".format(i))
        best_inner_prm_lst = []
        for j in P:
            print("Second param is {}".format(j))
            clf = KNeighborsClassifier(n_neighbors=i,p=j,metric=mt)
            best_df,best_score = forward_selection(x,y,clf)
            print("ROC AUC = {}".format(best_score))
            print("\nFeatures\n")
            for c,d in enumerate(best_df.columns):
                print("{}. {}".format(c+1,d))       
            print("")
            best_inner_prm_lst.append([best_df,i,j,best_score])
            
        score_lst = [d  for [a,b,c,d] in best_inner_prm_lst]
        index = get_best_score_index(score_lst)
        best_outer_prm_lst.append([best_inner_prm_lst[index][0],best_inner_prm_lst[index][1],best_inner_prm_lst[index][2],best_inner_prm_lst[index][3]])
        
    score_lst = [d  for [a,b,c,d] in best_outer_prm_lst]
    index = get_best_score_index(score_lst)
    print("Final results")
    print("Best model when first param = {} ,second param = {} ,ROC AUC = {}".format(best_outer_prm_lst[index][1],
                                                                 best_outer_prm_lst[index][2],
                                                                 best_outer_prm_lst[index][3]))
    return best_outer_prm_lst[index][0]

# Data

In [37]:
unnecesary_features=["m_name"]
x,y=get_data_and_true_prediction(mixed_df,unnecesary_features)

In [38]:
max_k = 20
max_p = 10

K = [i for i in range(1,max_k+1)]
P = [i for i in range(1,max_p+1)]

p = 1, Manhattan Distance
p = 2, Euclidean Distance
p = ∞, Chebychev Distance

In [14]:
start = time.time()
best_df = best_KNN(x,y,K,P)
end = time.time()

time.strftime('%H:%M:%S', time.gmtime(end-start))

Seccond parameter is: 1

first param list len is 20
first param list -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
first is 1
middle is 10
last is 20
sub list taken: right

first param list len is 11
first param list -> [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
first is 10
middle is 14
last is 20
sub list taken: left

first param list len is 5
first param list -> [10, 11, 12, 13, 14]
first is 10
middle is 11
last is 14
sub list taken: right

first param list len is 4
first param list -> [11, 12, 13, 14]
first is 11
middle is 12
last is 14
sub list taken: right

first param list len is 3
first param list -> [12, 13, 14]
first is 12
middle is 13
last is 14
sub list taken: right

first param list len is 2
first param list -> [13, 14]
first is 13
last is 14
last score is greater than first score

Best first parameter is: 14
ROC AUC = 0.8859599014336919

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6.

sub list taken: right

first param list len is 3
first param list -> [12, 13, 14]
first is 12
middle is 13
last is 14
sub list taken: right

first param list len is 2
first param list -> [13, 14]
first is 13
last is 14
first score is greater than last score

Best first parameter is: 13
ROC AUC = 0.8932354380357763

Features

1. n_HBD
2. n_hetero_atoms
3. n_aliphatic_heterocycles
4. n_non_strict_rotable_bonds
5. n_amide_bonds
6. n_atoms_stereo_centers
7. n_briged_head_atoms
8. n_aromatic_carbocycles
9. n_hetero_cycles
10. n_O
11. fraction_CSP3

Seccond parameter is: 9

first param list len is 20
first param list -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
first is 1
middle is 10
last is 20
sub list taken: right

first param list len is 11
first param list -> [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
first is 10
middle is 14
last is 20
sub list taken: left

first param list len is 5
first param list -> [10, 11, 12, 13, 14]
first is 10
middle is 11
last i

'00:56:05'

In [46]:
k_pivot = 13
p_pivot = 10
new_K = [i for i in range(k_pivot-3,k_pivot+4)]
new_P = [i for i in range(p_pivot-3,p_pivot+4)]

In [49]:
start = time.time()
g_df = guarantee_results(x,y,new_K,new_P)
end = time.time()

time.strftime('%H:%M:%S', time.gmtime(end-start))

First param is 10

Second param is 7
ROC AUC = 0.8985329899823267

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. n_HBD
5. n_non_strict_rotable_bonds
6. n_atoms_stereo_centers
7. n_hetero_cycles
8. n_amide_bonds
9. n_briged_head_atoms
10. n_aromatic_carbocycles
11. fraction_CSP3

Second param is 8
ROC AUC = 0.8986866385255109

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. n_HBD
5. n_non_strict_rotable_bonds
6. n_atoms_stereo_centers
7. n_hetero_cycles
8. n_amide_bonds
9. n_briged_head_atoms
10. n_aromatic_carbocycles
11. fraction_CSP3

Second param is 9
ROC AUC = 0.8987858448747172

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. n_HBD
5. n_non_strict_rotable_bonds
6. n_atoms_stereo_centers
7. n_hetero_cycles
8. n_amide_bonds
9. n_briged_head_atoms
10. n_aromatic_carbocycles
11. fraction_CSP3

Second param is 10
ROC AUC = 0.8987858448747172

Features

1. n_hetero_atoms
2. n_aliphatic_heterocycles
3. n_O
4. n_HBD
5. n_non_strict_r

ROC AUC = 0.8849615020522604

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6. n_HBA
7. n_HBD
8. n_amide_bonds
9. n_aromatic_carbocycles
10. n_aliphatic_rings
11. n_hetero_cycles
12. n_aromatic_rings

Second param is 11
ROC AUC = 0.8849615020522604

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6. n_HBA
7. n_HBD
8. n_amide_bonds
9. n_aromatic_carbocycles
10. n_aliphatic_rings
11. n_hetero_cycles
12. n_aromatic_rings

Second param is 12
ROC AUC = 0.8849615020522604

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6. n_HBA
7. n_HBD
8. n_amide_bonds
9. n_aromatic_carbocycles
10. n_aliphatic_rings
11. n_hetero_cycles
12. n_aromatic_rings

Second param is 13
ROC AUC = 0.8849615020522604

Features

1. n_HOH
2. n_aliphatic_heterocycles
3. n_O
4. n_saturated_carbocycles
5. n_briged_head_atoms
6. n_HBA
7. n_HBD
8. n_amide_bon

'00:18:15'

In [50]:
new_df = get_df_with_name_and_prediction(g_df,y,mixed_df)
save_df_to_disk(new_df,"best_classifier_KNN_minkowski.csv")

not guarantee

K = 13
P = 10
ROC AUC = 0.8935

guarantee

K = 10
P = 9
ROC AUC = 0.8987

In [51]:
new_df.head()

Unnamed: 0,m_name,n_hetero_atoms,n_aliphatic_heterocycles,n_O,n_HBD,n_non_strict_rotable_bonds,n_atoms_stereo_centers,n_hetero_cycles,n_amide_bonds,n_briged_head_atoms,n_aromatic_carbocycles,fraction_CSP3,is_cns_molecule
0,BUMETANIDE,8,0,7,3,8,0,0,0,0,2,0.235294,0
1,BACLOFEN,4,0,3,2,4,1,0,0,0,1,0.3,1
2,METYRAPONE,3,0,3,0,3,0,2,0,0,0,0.214286,0
3,METHYLPHENOBARBITAL,5,1,5,1,2,1,1,4,0,1,0.307692,1
4,DULOXETINE,3,0,2,1,6,1,1,0,0,2,0.222222,1
