# Basic imports

In [1]:
import pandas as pd
import numpy as np
import os
import operator
import statistics
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
current_path = os.getcwd()
# The notebook runs from the "forward_feature_selection" sub-folder; the project
# root is its parent. Use os.path so the strip works with any path separator
# (the previous string replace only matched Windows-style backslashes).
root_path = (
    os.path.dirname(current_path)
    if os.path.basename(current_path) == 'forward_feature_selection'
    else current_path
)

# Loading dataframe

In [3]:
# Build the path with os.path.join: the previous "\molecules.csv" literal relied on
# "\m" being an unrecognised escape sequence and hard-coded the Windows separator.
mixed_df = pd.read_csv(os.path.join(root_path, "molecules.csv"), sep="\t")
mixed_df.head()

Unnamed: 0,m_name,n_atoms_without_Hydrogen,n_atoms_with_Hydrogen,m_weight,m_avg_weigth,m_weigth_without_Hydrogen,n_radical_electrons,n_valence_electrons,n_aliphatic_carbocycles,n_aliphatic_heterocycles,...,n_Hydrogen_acceptors,n_Hydrogen_donnors,n_briged_head_atoms,n_atoms_stereo_centers,n_atoms_unspecified_stereo_centers,n_spiro_atoms,m_logp,m_mr,fraction_CSP3,is_cns_molecule
0,BUMETANIDE,25,45,364.109293,344.263,364.423,0,134,0,0,...,5,3,0,0,0,0,3.0365,94.6882,0.235294,0
1,BACLOFEN,14,26,213.055656,201.568,213.664,0,76,0,0,...,2,2,0,1,1,0,1.857,55.5002,0.3,1
2,METYRAPONE,17,31,226.110613,212.167,226.279,0,86,0,0,...,3,0,0,0,0,0,2.6371,65.8305,0.214286,0
3,METHYLPHENOBARBITAL,18,32,246.100442,232.154,246.266,0,94,0,1,...,3,1,0,1,1,0,1.0426,64.7197,0.307692,1
4,DULOXETINE,21,40,297.118735,278.271,297.423,0,108,0,0,...,3,1,0,1,1,0,4.6309,90.1797,0.222222,1


# Functions

In [4]:
def save_df_to_disk(df,name:str,separator="\t"):
    """Write ``df`` to the file ``name`` as delimited text.

    The header row is written, the index is not, and fields are separated
    by ``separator`` (a tab unless overridden).
    """
    df.to_csv(path_or_buf=name, sep=separator, header=True, index=False)

In [5]:
# Show the last column of the frame (the label column).
mixed_df.loc[:, mixed_df.columns[-1]]

0      0
1      1
2      0
3      1
4      1
      ..
935    0
936    1
937    0
938    0
939    0
Name: is_cns_molecule, Length: 940, dtype: int64

In [6]:
def get_data_and_true_prediction(df,not_wanted_features:list):
    """Split ``df`` into a feature frame and a target column.

    The columns listed in ``not_wanted_features`` are dropped first; of what
    remains, the last column is returned as ``y`` and every other column as
    ``x``. ``df`` itself is not modified.
    """
    kept = df.drop(columns=not_wanted_features)
    target_name = kept.columns[-1]

    return kept.drop(columns=[target_name]), kept[target_name]

In [7]:
# Drop the "m_name" column, then split the frame into features (x) and label (y).
unnecesary_features = ["m_name"]
x, y = get_data_and_true_prediction(df=mixed_df, not_wanted_features=unnecesary_features)

# Forward selection

In [8]:
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC

In [9]:
def get_roc_auc_score(x,y,model,cv=10): # gets roc auc average
    """Return the mean cross-validated ROC AUC of ``model`` on ``(x, y)``.

    Parameters
    ----------
    x : feature data passed to ``cross_validate``.
    y : target values passed to ``cross_validate``.
    model : estimator passed to ``cross_validate``.
    cv : cross-validation setting forwarded to ``cross_validate``; defaults
        to 10, the value that was previously hard-coded.

    Returns
    -------
    The mean of the per-fold ``'test_score'`` values.
    """
    # A single scorer name is a plain string; with one scorer the per-fold
    # results are stored under the 'test_score' key.
    cv_results = cross_validate(model, x, y, cv=cv, scoring='roc_auc')

    return cv_results['test_score'].mean()

In [10]:
# Model instance passed to every get_roc_auc_score / forward_selection call.
clf = LinearSVC(dual=False, tol=1e-5, random_state=0)

In [11]:
# Score each feature on its own, in column order.
score_lst = [
    get_roc_auc_score(x[[column_name]], y, clf)
    for column_name in x.columns
]

# Best single-feature score and the positions of every feature that reaches it.
max_score = max(score_lst)
max_score_index = [
    position for position, value in enumerate(score_lst) if value == max_score
]

#### Several features tie for the best score; we can take any of them, since the resulting model will be the same

In [12]:
# Names of the features that tied for the best single-feature score.
best_features = list(x.columns[max_score_index])

# Each of the first two tied features alone, all tied features together,
# and the frame that is left once the tied features are removed.
best_x_1 = x.loc[:, [best_features[0]]]
best_x_2 = x.loc[:, [best_features[1]]]
best_x = x.loc[:, best_features]
new_x = x.drop(columns=best_features)

print(best_features)
for feature_subset in (best_x_1, best_x_2, best_x):
    print(get_roc_auc_score(feature_subset, y, clf))

['n_HBD', 'n_Hydrogen_donnors']
0.7445237527459821
0.7445237527459821
0.7445237527459821


In [13]:
# Put the two tied features side by side to compare their values.
ndf = pd.concat([best_x_1, best_x_2], axis="columns")
ndf

Unnamed: 0,n_HBD,n_Hydrogen_donnors
0,3,3
1,2,2
2,0,0
3,1,1
4,1,1
...,...,...
935,6,6
936,2,2
937,4,4
938,3,3


In [20]:
def forward_selection(x,y,model):
    """Greedy forward feature selection scored with ``get_roc_auc_score``.

    The search starts from the single column of ``x`` with the highest score,
    then repeatedly adds the remaining column that scores highest together
    with the columns already selected. It stops when the best candidate
    scores strictly lower than the current subset, or when no candidate
    columns are left.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate feature columns (column names are assumed to be unique).
    y : target values, passed through to ``get_roc_auc_score``.
    model : estimator, passed through to ``get_roc_auc_score``.

    Returns
    -------
    (pandas.DataFrame, score)
        The selected columns of ``x`` in selection order (always a
        DataFrame, even when only one column is selected) and the score of
        that subset as returned by ``get_roc_auc_score``.

    Raises
    ------
    ValueError
        If ``x`` has no columns.
    """
    if len(x.columns) == 0:
        raise ValueError("forward_selection needs at least one feature column")

    def best_addition(selected, remaining):
        """Return (score, column) for the remaining column that scores best with ``selected``."""
        scores = [
            get_roc_auc_score(x[selected + [column]], y, model)
            for column in remaining
        ]
        top_score = max(scores)
        # list.index returns the first match, so ties go to the earliest column.
        return top_score, remaining[scores.index(top_score)]

    remaining = list(x.columns)

    # First round: best single feature.
    score, first_feature = best_addition([], remaining)
    selected = [first_feature]
    remaining.remove(first_feature)

    # Later rounds: stop cleanly once the candidate pool is exhausted instead
    # of calling max() on an empty score list.
    while remaining:
        candidate_score, candidate = best_addition(selected, remaining)
        # Only a strictly lower score stops the search; a tie still adds the feature.
        if candidate_score < score:
            break
        selected.append(candidate)
        remaining.remove(candidate)
        score = candidate_score

    return x[selected], score

In [21]:
# Run the forward selection on the full feature frame and show what it returns.
mm = forward_selection(x, y, clf)
mm

Unnamed: 0,n_HBD,n_O,n_aliphatic_heterocycles,m_logp,n_aromatic_rings,n_amide_bonds,n_briged_head_atoms,n_strict_rotable_bonds,n_saturated_rings,n_rings,n_HOH,n_non_strict_rotable_bonds,n_atoms_with_Hydrogen,n_primary_carbon_atoms,n_aromatic_heterocycles,n_saturated_heterocycles,m_avg_weigth,n_saturated_carbocycles,n_Hydrogen_donnors
0,3,7,0,3.0365,2,0,0,8,0,2,4,8,45,17,0,0,344.263,0,3
1,2,3,0,1.8570,1,0,0,4,0,1,3,4,26,10,0,0,201.568,0,2
2,0,3,0,2.6371,2,0,0,3,0,2,0,3,31,14,2,0,212.167,0,0
3,1,5,1,1.0426,1,4,0,2,1,2,1,2,32,13,0,1,232.154,0,1
4,1,2,0,4.6309,3,0,0,6,0,3,1,6,40,18,1,0,278.271,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,6,8,1,-1.7002,1,0,0,2,0,2,7,2,32,9,1,0,226.131,0,6
936,2,2,0,0.6924,1,0,0,3,0,1,3,3,22,8,0,0,124.102,0,2
937,4,4,0,1.1292,1,0,0,4,0,1,4,4,32,11,0,0,194.125,0,4
938,3,6,0,3.5072,3,0,0,6,0,3,3,8,46,20,1,0,389.224,0,3
