# Basic imports

In [1]:
import pandas as pd
import numpy as np
import os
import operator
import statistics
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
# Resolve the project root. When the notebook is run from inside the
# "forward_feature_selection" folder, the root is that folder's parent.
# os.path is used instead of replacing a '\\'-prefixed substring so the
# lookup also works on systems whose path separator is not a backslash.
current_path = os.getcwd()
if os.path.basename(current_path) == 'forward_feature_selection':
    root_path = os.path.dirname(current_path)
else:
    # Not inside the expected sub-folder: keep the working directory as root.
    root_path = current_path

# Loading dataframe

In [3]:
# Read the three tab-separated input tables relative to the project root.
# os.path.join builds the paths with the platform's separator; the previous
# hard-coded "\\" only produced valid paths on Windows.
mixed_df = pd.read_csv(os.path.join(root_path, "molecules.csv"), sep="\t")
f_classif_df = pd.read_csv(
    os.path.join(root_path, "f_classif", "f_classif_best.csv"), sep="\t"
)
mic_df = pd.read_csv(
    os.path.join(root_path, "mutual_info_classif", "mic_best.csv"), sep="\t"
)

# Functions

In [4]:
def save_df_to_disk(df,name:str,separator="\t"):
    """Write ``df`` to the file ``name`` as delimited text.

    The header row is written, the index is not. ``separator`` is the
    field delimiter (tab by default).
    """
    csv_options = {"sep": separator, "header": True, "index": False}
    df.to_csv(name, **csv_options)

In [5]:
def get_data_and_true_prediction(df,not_wanted_features:list):
    """Split ``df`` into a feature frame and a target column.

    The columns listed in ``not_wanted_features`` are dropped first. Of the
    remaining columns, the last one is returned as the target ``y`` and all
    the others as the feature frame ``x``. ``df`` itself is not modified.
    """
    remaining = df.drop(columns=not_wanted_features)
    target_name = remaining.columns[-1]  # the last column holds the target
    return remaining.drop(columns=[target_name]), remaining[target_name]

In [6]:
def get_df_with_name_and_prediction(df,true_prediction,big_df):
    """Return a new frame: ``m_name`` first, then ``df``'s columns, then the target.

    Parameters
    ----------
    df : DataFrame (or a single-column Series) holding the selected features.
    true_prediction : named Series with the target; joined on the index.
    big_df : DataFrame providing the ``m_name`` column. Its values are taken
        positionally, so it must have as many rows as ``df``.

    Returns
    -------
    A new DataFrame. ``df`` is left untouched.
    """
    # DataFrame.insert works in place. Operating on a copy keeps the caller's
    # frame unchanged and lets the function be called again on the same
    # input without failing because "m_name" already exists.
    new_df = df.to_frame() if isinstance(df, pd.Series) else df.copy()
    new_df.insert(0,"m_name",big_df["m_name"].values)
    new_df=new_df.join(true_prediction)

    return new_df

# Data

In [7]:
# "m_name" is dropped from every table before the feature/target split.
unnecesary_features = ["m_name"]
(x, y), (f_classif_x, f_classif_y), (mic_x, mic_y) = (
    get_data_and_true_prediction(frame, unnecesary_features)
    for frame in (mixed_df, f_classif_df, mic_df)
)

# Forward selection

In [8]:
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC

In [9]:
def get_roc_auc_score(x,y,model):
    """Return the mean ROC AUC of ``model`` on ``x``/``y`` over 10 cross-validation folds."""
    fold_scores = cross_validate(model, x, y, cv=10, scoring='roc_auc')['test_score']
    return fold_scores.mean()

In [10]:
# Linear SVC used as the model for the linear forward selection runs below.
clf = LinearSVC(dual=False, tol=1e-5, random_state=0)

#### We can take any of the results, model will be the same

In [11]:
def forward_selection(x,y,model):
    """Greedy forward feature selection driven by cross-validated ROC AUC.

    Starting from no features, each round scores every remaining column of
    ``x`` together with the columns already selected (via
    ``get_roc_auc_score``) and keeps the best-scoring one. Ties between
    candidates go to the column that comes first in ``x``.

    The search stops when the best candidate scores strictly lower than the
    current score, or when no columns are left. A candidate that merely
    equals the current score is still added.

    Parameters
    ----------
    x : DataFrame of candidate features (column labels are assumed unique).
    y : target passed through to ``get_roc_auc_score``.
    model : estimator passed through to ``get_roc_auc_score``.

    Returns
    -------
    (best_x, score) : a DataFrame with the selected columns in selection
        order (always a DataFrame, even when a single feature is kept) and
        the score reached with them.

    Raises
    ------
    ValueError : if ``x`` has no columns.
    """
    if len(x.columns) == 0:
        raise ValueError("forward_selection needs at least one feature column")

    def best_candidate(selected, remaining):
        # Score each remaining column on top of the already selected ones.
        scores = [
            get_roc_auc_score(x[selected + [name]], y, model)
            for name in remaining
        ]
        top_score = max(scores)
        # list.index returns the first position, i.e. the earliest column wins ties.
        return remaining[scores.index(top_score)], top_score

    remaining = list(x.columns)
    first_feature, score = best_candidate([], remaining)
    selected = [first_feature]
    remaining.remove(first_feature)

    # Loop instead of recursing. An exhausted candidate list ends the search
    # cleanly rather than calling max() on an empty list.
    while remaining:
        candidate, candidate_score = best_candidate(selected, remaining)
        if candidate_score < score:
            break
        selected.append(candidate)
        remaining.remove(candidate)
        score = candidate_score

    # Return an independent frame so callers can modify it without touching x.
    best_x = x[selected].copy()

    return best_x,score

# Comparison linear

# Normal

In [12]:
# Forward selection on the full feature table with the linear model.
df_1, score = forward_selection(x, y, clf)
df_1.shape

(940, 19)

In [13]:
# Show the cross-validated ROC AUC of the selected subset as a percentage.
score_percent = score * 100
print("Model score: {}%".format(score_percent))

Model score: 89.05732547528203%


# F classif

In [14]:
# Forward selection on the f_classif feature table with the linear model.
df_2, score = forward_selection(f_classif_x, f_classif_y, clf)
df_2.shape

(940, 12)

In [15]:
# Show the cross-validated ROC AUC of the selected subset as a percentage.
score_percent = score * 100
print("Model score: {}%".format(score_percent))

Model score: 88.00086044216506%


# MIC

In [16]:
# Forward selection on the MIC feature table with the linear model.
df_3, score = forward_selection(mic_x, mic_y, clf)
df_3.shape

(940, 19)

In [17]:
# Show the cross-validated ROC AUC of the selected subset as a percentage.
score_percent = score * 100
print("Model score: {}%".format(score_percent))

Model score: 88.9372971483078%


### MIC and Normal seem the same; let's see their differences

In [18]:
# Column labels selected from the full table (df_1) and from the MIC table (df_3).
normal_best_columns, mic_best_columns = df_1.columns.values, df_3.columns.values

In [19]:
# Compare the two selections as sets: exclusive to each side, and shared.
normal_set, mic_set = set(normal_best_columns), set(mic_best_columns)
diff_1 = list(normal_set - mic_set)
diff_2 = list(mic_set - normal_set)
same = list(normal_set & mic_set)

In [20]:
# Features selected only from the full table.
normal_only_message = "Normal has the next features, but mic_best doesn't {}"
print(normal_only_message.format(diff_1))

Normal has the next features, but mic_best doesn't ['n_primary_carbon_atoms', 'n_aromatic_heterocycles', 'm_avg_weigth', 'n_saturated_carbocycles']


In [21]:
# Features selected only from the MIC table.
mic_only_message = "mic_best has the next features, but normal doesn't {}"
print(mic_only_message.format(diff_2))

mic_best has the next features, but normal doesn't ['n_atoms_without_Hydrogen', 'n_aromatic_carbocycles', 'n_valence_electrons', 'n_radical_electrons']


In [22]:
# Features selected from both tables.
shared_message = "Normal and mic_best have in common: {}"
print(shared_message.format(same))

Normal and mic_best have in common: ['n_Hydrogen_donnors', 'n_strict_rotable_bonds', 'n_saturated_heterocycles', 'n_HOH', 'n_atoms_with_Hydrogen', 'n_aromatic_rings', 'n_HBD', 'n_amide_bonds', 'n_saturated_rings', 'n_briged_head_atoms', 'n_non_strict_rotable_bonds', 'n_aliphatic_heterocycles', 'n_O', 'n_rings', 'm_logp']


# Best linear

In [23]:
# Attach the molecule names and the target to the linear model's selection.
best_df = get_df_with_name_and_prediction(
    df=df_1,
    true_prediction=y,
    big_df=mixed_df,
)

In [24]:
# Persist the linear selection (tab-separated, the helper's default).
save_df_to_disk(df=best_df, name="best_df_linear.csv")

# KNN

In [25]:
from sklearn.neighbors import KNeighborsClassifier

The default metric is minkowski, which with p=2 is equivalent to the standard Euclidean metric.

In [26]:
def KNN_forward_selection(x,y,k,pw,mt):
    """Run forward selection with a KNN classifier for every neighbour count 1..k.

    For each ``n_neighbors`` from 1 to ``k`` (inclusive) a
    ``KNeighborsClassifier`` is built with ``p=pw`` and ``metric=mt`` and
    handed to ``forward_selection``. The (features, score) pair with the
    highest score is returned; on ties the smallest neighbour count wins.
    """
    runs = []
    for n_neighbors in range(1, k + 1):
        knn_model = KNeighborsClassifier(n_neighbors=n_neighbors, p=pw, metric=mt)
        selected, selected_score = forward_selection(x, y, knn_model)
        runs.append((selected, selected_score))

    all_scores = [run_score for _, run_score in runs]
    top_score = max(all_scores)
    # First position holding the maximum, i.e. the lowest n_neighbors among ties.
    winner = [pos for pos, value in enumerate(all_scores) if value == top_score][0]

    best_features, best_score = runs[winner]
    return best_features, best_score

# Comparison KNN

## Normal

In [27]:
# KNN forward selection on the full table: 1..10 neighbours, Minkowski with p=2.
best_df_KNN_normal, score = KNN_forward_selection(
    x, y, k=10, pw=2, mt='minkowski'
)

In [28]:
# Shape of the selected frame, then its ROC AUC as a percentage.
print(
    best_df_KNN_normal.shape,
    "ROC AUC score: {}%".format(score*100),
    sep="\n",
)

(940, 14)
ROC AUC score: 88.94066381538741%


## F_classif

In [29]:
# KNN forward selection on the f_classif table: 1..10 neighbours, Minkowski with p=2.
best_df_KNN_f_classif, score = KNN_forward_selection(
    f_classif_x, f_classif_y, k=10, pw=2, mt='minkowski'
)

In [30]:
# Shape of the selected frame, then its ROC AUC as a percentage.
print(
    best_df_KNN_f_classif.shape,
    "ROC AUC score: {}%".format(score*100),
    sep="\n",
)

(940, 10)
ROC AUC score: 87.92740025684225%


## MIC

In [31]:
# KNN forward selection on the MIC table: 1..10 neighbours, Minkowski with p=2.
best_df_KNN_mic, score = KNN_forward_selection(
    mic_x, mic_y, k=10, pw=2, mt='minkowski'
)

In [32]:
# Shape of the selected frame, then its ROC AUC as a percentage.
print(
    best_df_KNN_mic.shape,
    "ROC AUC score: {}%".format(score*100),
    sep="\n",
)

(940, 10)
ROC AUC score: 89.26791185603619%


# Best KNN

In [34]:
# The selected features and the molecule names both come from mic_df, so the
# target is taken from the same table (mic_y) rather than from the unfiltered
# one; this keeps all three pieces row-consistent.
best_df = get_df_with_name_and_prediction(best_df_KNN_mic,mic_y,mic_df)

In [35]:
# Persist the KNN selection (tab-separated, the helper's default).
save_df_to_disk(df=best_df, name="best_df_KNN.csv")

In [36]:
# Preview the first five rows of the saved KNN frame.
best_df.head(n=5)

Unnamed: 0,m_name,n_hetero_atoms,n_aliphatic_heterocycles,n_O,n_HBD,n_non_strict_rotable_bonds,n_atoms_stereo_centers,n_amide_bonds,n_briged_head_atoms,n_aromatic_carbocycles,n_radical_electrons,is_cns_molecule
0,BUMETANIDE,8.0,0.0,7.0,3.0,8.0,0.0,0.0,0.0,2.0,0.0,0
1,BACLOFEN,4.0,0.0,3.0,2.0,4.0,1.0,0.0,0.0,1.0,0.0,1
2,METYRAPONE,3.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0
3,METHYLPHENOBARBITAL,5.0,1.0,5.0,1.0,2.0,1.0,4.0,0.0,1.0,0.0,1
4,DULOXETINE,3.0,0.0,2.0,1.0,6.0,1.0,0.0,0.0,2.0,0.0,1
