# Basic imports

In [1]:
import pandas as pd
import os
import time
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
# Resolve the project root. The notebook is presumably run from the
# "forward_feature_selection" sub-folder, whose parent holds the input CSVs.
current_path = os.getcwd()
# os.path helpers honour the platform separator, unlike a hard-coded '\\' replace
root_path = (
    os.path.dirname(current_path)
    if os.path.basename(current_path) == 'forward_feature_selection'
    else current_path
)

# Loading dataframe

In [3]:
# Input tables are tab-separated. Paths are built with os.path.join so the
# notebook also runs on platforms whose path separator is not '\\'.
mixed_df=pd.read_csv(os.path.join(root_path,"molecules.csv"),sep="\t")
f_classif_df=pd.read_csv(os.path.join(root_path,"f_classif","f_classif_best.csv"),sep="\t")
mic_df=pd.read_csv(os.path.join(root_path,"mutual_info_classif","mic_best.csv"),sep="\t")

# Functions

In [4]:
def save_df_to_disk(df,name:str,separator="\t"):
    """Write ``df`` to ``name`` as delimited text.

    The header row is written and the index is omitted; ``separator`` is the
    field delimiter (tab by default).
    """
    csv_options = {"sep": separator, "header": True, "index": False}
    df.to_csv(name, **csv_options)

In [5]:
def get_data_and_true_prediction(df,not_wanted_features:list):
    """Split ``df`` into a feature frame and a target column.

    The columns listed in ``not_wanted_features`` are dropped first; the last
    remaining column is returned as the target and everything before it as
    the features. ``df`` itself is left untouched.

    Returns
    -------
    (x, y) : the feature columns and the last column of the reduced frame.
    """
    kept = df.drop(columns=not_wanted_features)
    target_name = kept.columns[-1]
    features = kept.drop(columns=[target_name])
    target = kept[target_name]
    return features, target

In [6]:
def get_df_with_name_and_prediction(df,true_prediction,big_df):
    """Return a new frame: ``m_name`` first, then the columns of ``df``, then the target.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Selected feature column(s). A Series is promoted to a one-column frame.
    true_prediction : pandas.Series
        Target column, joined on the index of ``df``.
    big_df : pandas.DataFrame
        Frame providing the ``m_name`` column; its values are inserted
        positionally, so it must have as many rows as ``df``.

    Returns
    -------
    pandas.DataFrame
        A fresh frame; ``df`` is never modified.
    """
    # Work on a copy: DataFrame.insert mutates in place, so aliasing the input
    # would alter the caller's frame and make a second call raise ValueError
    # because "m_name" would already exist.
    new_df = df.to_frame() if isinstance(df, pd.Series) else df.copy()
    new_df.insert(0,"m_name",big_df["m_name"].values)
    new_df = new_df.join(true_prediction)

    return new_df

In [7]:
def save_best_model(df_lst:list,score_lst:list,switch:dict,true_prediction,big_df,new_df_name:str):
    """Report the best-scoring candidate and write its frame to disk.

    Parameters
    ----------
    df_lst : list
        Candidate feature frames, parallel to ``score_lst``.
    score_lst : list
        One score per candidate; the highest wins, the first one on ties.
    switch : dict
        Maps a candidate's position to the label that is printed.
    true_prediction, big_df
        Forwarded to ``get_df_with_name_and_prediction`` to add the
        ``m_name`` column and the target to the winning frame.
    new_df_name : str
        Output path handed to ``save_df_to_disk``.

    Raises
    ------
    ValueError
        If ``score_lst`` is empty (raised by ``max``).
    """
    # list.index returns the first match, so ties resolve to the earliest candidate
    top_score_index = score_lst.index(max(score_lst))

    print(switch[top_score_index]+" is the best")
    print("score: {}%".format(score_lst[top_score_index]*100))
    best_df = get_df_with_name_and_prediction(df_lst[top_score_index],true_prediction,big_df)

    print("\nFEATURES\n")
    try:
        for c,i in enumerate(best_df.columns):
            print("{}. {}".format(c+1,i))
        save_df_to_disk(best_df,new_df_name)
    except Exception as err:
        # Still best-effort, but KeyboardInterrupt/SystemExit now propagate and
        # the real cause is reported instead of a fixed, misleading message.
        print("Data frame could not be listed or saved: {!r}".format(err))

# Data

In [8]:
# Drop the identifier column, then split each table into features and target
unnecesary_features=["m_name"]
(x,y),(f_classif_x,f_classif_y),(mic_x,mic_y)=[
    get_data_and_true_prediction(frame,unnecesary_features)
    for frame in (mixed_df,f_classif_df,mic_df)
]

In [9]:
# Position -> label of the candidate feature sets, in the order they are compared
df_types=dict(enumerate(('normal','f_classif','mic')))

# Forward selection

In [10]:
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC

In [11]:
def get_roc_auc_score(x,y,model,cv=10): # gets roc auc average
    """Return the mean cross-validated ROC AUC of ``model`` on ``(x, y)``.

    Parameters
    ----------
    x, y
        Features and target, passed to ``cross_validate`` unchanged.
    model
        Estimator to evaluate.
    cv
        Cross-validation setting forwarded to ``cross_validate``; the default
        of 10 keeps the original behaviour.

    Returns
    -------
    The mean of the per-fold ``test_score`` values.
    """
    # 'roc_auc' is a single scorer name; the former ('roc_auc') was a plain
    # string in parentheses, not a tuple.
    cv_results = cross_validate(model, x, y, cv=cv, scoring='roc_auc')
    roc_auc_avrg=cv_results['test_score'].mean()

    return roc_auc_avrg

In [12]:
# Linear model used for the first forward-selection comparison
clf = LinearSVC(
    dual=False,
    tol=1e-5,
    random_state=0,
)

#### We can take any of the results; the model will be the same

In [13]:
def forward_selection(x,y,model): # O(n^2) model evaluations in the worst case, n = len(x.columns)
    """Greedy forward feature selection scored by ``get_roc_auc_score``.

    The single best-scoring column is selected first. Each following round
    scores every remaining column together with the columns already selected
    and keeps the best one. Selection stops when the best candidate would
    lower the score (a tie still adds the feature) or when no columns remain.
    When several candidates share the top score, the first one in column
    order wins.

    Parameters
    ----------
    x : pandas.DataFrame
        Candidate feature columns.
    y
        Target, passed to ``get_roc_auc_score`` unchanged.
    model
        Estimator, passed to ``get_roc_auc_score`` unchanged.

    Returns
    -------
    (best_x, score)
        ``best_x`` is always a DataFrame holding the selected columns in
        selection order (also when only one column is selected); ``score`` is
        the score reached with exactly those columns.

    Raises
    ------
    ValueError
        If ``x`` has no columns.
    """
    if len(x.columns) == 0:
        raise ValueError("forward_selection needs at least one feature column")

    def best_candidate(selected, candidates):
        # Score each candidate column, alone or appended to the selected ones
        scores=[]
        for name in candidates.columns:
            trial=candidates[[name]]
            if selected is not None:
                trial=pd.concat([selected,trial],axis=1, ignore_index=True)
            scores.append(get_roc_auc_score(trial,y,model))
        top_score=max(scores)
        # list.index gives the first position, i.e. the earliest column on ties
        return candidates.columns[scores.index(top_score)],top_score

    first_name,score=best_candidate(None,x)
    # Double brackets keep a DataFrame even when a single feature is selected
    best_x=x[[first_name]]
    new_x=x.drop(first_name,axis=1)

    # Iterative form of the former recursion: no recursion-depth limit on wide frames
    while len(new_x.columns) > 0:
        next_name,new_score=best_candidate(best_x,new_x)
        if new_score<score:
            break # adding any remaining feature would lower the score
        best_x=pd.concat([best_x,new_x[[next_name]]],axis=1)
        new_x=new_x.drop(next_name,axis=1)
        score=new_score

    return best_x,score

# Comparison linear

# Normal

In [14]:
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()
df_1,score_normal=forward_selection(x,y,clf)
end = time.perf_counter()

time.strftime('%H:%M:%S', time.gmtime(end-start))

'00:00:24'

# F classif

In [15]:
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()
df_2,score_f_classif=forward_selection(f_classif_x,f_classif_y,clf)
end = time.perf_counter()

time.strftime('%H:%M:%S', time.gmtime(end-start))

'00:00:11'

# MIC

In [16]:
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()
df_3,score_mic=forward_selection(mic_x,mic_y,clf)
end = time.perf_counter()

time.strftime('%H:%M:%S', time.gmtime(end-start))

'00:00:17'

# Best linear

In [17]:
# Report the best of the three linear-model feature sets and write it to disk
save_best_model(
    df_lst=[df_1,df_2,df_3],
    score_lst=[score_normal,score_f_classif,score_mic],
    switch=df_types,
    true_prediction=y,
    big_df=mixed_df,
    new_df_name="best_classifier_linear.csv",
)

normal is the best
score: 89.04773002246337%

FEATURES

1. m_name
2. n_HBD
3. n_O
4. n_aliphatic_heterocycles
5. m_logp
6. n_aromatic_rings
7. n_amide_bonds
8. n_briged_head_atoms
9. n_strict_rotable_bonds
10. n_saturated_rings
11. n_rings
12. n_HOH
13. n_non_strict_rotable_bonds
14. n_atoms_with_Hydrogen
15. n_primary_carbon_atoms
16. n_aromatic_heterocycles
17. n_saturated_heterocycles
18. m_avg_weigth
19. n_saturated_carbocycles
20. n_aliphatic_carbocycles
21. is_cns_molecule


# KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier

In [19]:
def KNN_forward_selection_K_and_P_changes(x,y,k:list,pw:list,mt,alfa=6):
    """Search k and p for a KNN classifier, running forward selection for each pair.

    For every value in ``k`` (in order), forward selection is run once per
    value in ``pw`` and the best-scoring p is kept. A k counts as an
    improvement when its best score beats the best score seen so far. The
    search stops early after ``max(1, int(len(k)/alfa))`` consecutive k values
    without an improvement.

    Parameters
    ----------
    x, y
        Features and target, forwarded to ``forward_selection``.
    k : list
        ``n_neighbors`` values to try, in order.
    pw : list
        ``p`` values to try for each k.
    mt
        ``metric`` passed to ``KNeighborsClassifier``.
    alfa
        Divisor that turns ``len(k)`` into the number of spare tries.

    Returns
    -------
    (best_df, best_score)
        The selected feature columns and score of the best (k, p) pair found.

    Raises
    ------
    ValueError
        If ``k`` or ``pw`` is empty, or if no iteration produced a score that
        could be recorded (for example when every score is NaN).
    """
    if len(k) == 0 or len(pw) == 0:
        raise ValueError("k and pw must each contain at least one value")

    def get_best_score_index(score_lst):
        # list.index returns the first position, so ties keep the earliest entry
        return score_lst.index(max(score_lst))

    def inner_iteration(x,y,k,pw:list,mt):
        print("k is now: {}".format(k))
        df_score_lst = []
        for p_value in pw:
            KNB_clf = KNeighborsClassifier(n_neighbors=k,p=p_value,metric=mt) # KNN model
            df,score = forward_selection(x,y,KNB_clf)
            df_score_lst.append([df,score])

        score_lst = [score for _,score in df_score_lst]
        best_index = get_best_score_index(score_lst)
        print("Best inner model when p = {}".format(pw[best_index]))
        return df_score_lst[best_index][0],df_score_lst[best_index][1],pw[best_index]

    print(f"Metric: {mt}\n")

    outer_df_score_lst = []
    # -inf instead of 0 so that the first k is always recorded
    temp_best_score = float("-inf")

    # Spare tries if at some point the performance decreases. At least one is
    # needed: with zero, nothing was ever recorded and the final max() failed.
    max_tries = max(1, int((len(k)/alfa)))
    tries = max_tries

    for cnt,i in enumerate(k):
        best_inner_df,best_inner_score,best_p_value = inner_iteration(x,y,i,pw,mt)
        print("Spare tries: {}".format(tries))

        if (best_inner_score > temp_best_score):
            temp_best_score = best_inner_score
            outer_df_score_lst.append([best_inner_df,best_inner_score,i,best_p_value])
            if (cnt > 0): print("This iteration had an improvement\n")
            elif (cnt == 0): print("")
            tries = max_tries
        else:
            print("This iteration didn't have an improvement\n")
            tries = tries-1

        if (tries <= 0):
            break

    if not outer_df_score_lst:
        raise ValueError("no (k, p) combination produced a comparable score")

    outer_score_lst = [entry[1] for entry in outer_df_score_lst]
    best_outer_index = get_best_score_index(outer_score_lst)
    best_df,best_score,best_k,best_p = outer_df_score_lst[best_outer_index]

    print("Final results")
    print("Best model when k = {} ,p = {} ,roc_auc = {}%".format(best_k,
                                                                 best_p,
                                                                 best_score*100))

    return best_df,best_score

# Comparison KNN

In [22]:
# Upper bounds (inclusive) of the KNN search grid
max_k = 50
max_p = 10

# Candidate n_neighbors and Minkowski p values: 1..max_k and 1..max_p
k_lst = list(range(1,max_k+1))
p_lst = list(range(1,max_p+1))

p = 1, Manhattan Distance
p = 2, Euclidean Distance
p = ∞, Chebyshev Distance

## Normal

In [23]:
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()
df_1,score_normal=KNN_forward_selection_K_and_P_changes(x,y,k_lst,p_lst,'minkowski')
end = time.perf_counter()

time.strftime('%H:%M:%S', time.gmtime(end-start))

Metric: minkowski

k is now: 1
Best inner model when p = 3
Spare tries: 8

k is now: 2
Best inner model when p = 1
Spare tries: 8
This iteration had an improvement

k is now: 3
Best inner model when p = 3
Spare tries: 8
This iteration had an improvement

k is now: 4
Best inner model when p = 1
Spare tries: 8
This iteration didn't have an improvement

k is now: 5
Best inner model when p = 3
Spare tries: 7
This iteration had an improvement

k is now: 6
Best inner model when p = 4
Spare tries: 8
This iteration didn't have an improvement

k is now: 7
Best inner model when p = 2
Spare tries: 7
This iteration didn't have an improvement

k is now: 8
Best inner model when p = 6
Spare tries: 6
This iteration didn't have an improvement

k is now: 9
Best inner model when p = 10
Spare tries: 5
This iteration didn't have an improvement

k is now: 10
Best inner model when p = 9
Spare tries: 4
This iteration had an improvement

k is now: 11
Best inner model when p = 3
Spare tries: 8
This iteration di

'01:06:40'

## F_classif

In [24]:
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()
df_2,score_f_classif=KNN_forward_selection_K_and_P_changes(f_classif_x,f_classif_y,k_lst,p_lst,'minkowski')
end = time.perf_counter()

time.strftime('%H:%M:%S', time.gmtime(end-start))

Metric: minkowski

k is now: 1
Best inner model when p = 1
Spare tries: 8

k is now: 2
Best inner model when p = 1
Spare tries: 8
This iteration had an improvement

k is now: 3
Best inner model when p = 1
Spare tries: 8
This iteration had an improvement

k is now: 4
Best inner model when p = 7
Spare tries: 8
This iteration had an improvement

k is now: 5
Best inner model when p = 3
Spare tries: 8
This iteration had an improvement

k is now: 6
Best inner model when p = 1
Spare tries: 8
This iteration didn't have an improvement

k is now: 7
Best inner model when p = 4
Spare tries: 7
This iteration had an improvement

k is now: 8
Best inner model when p = 10
Spare tries: 8
This iteration had an improvement

k is now: 9
Best inner model when p = 1
Spare tries: 8
This iteration didn't have an improvement

k is now: 10
Best inner model when p = 5
Spare tries: 7
This iteration had an improvement

k is now: 11
Best inner model when p = 4
Spare tries: 8
This iteration didn't have an improvement

'00:35:49'

## MIC

In [25]:
# perf_counter is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
start = time.perf_counter()
df_3,score_mic=KNN_forward_selection_K_and_P_changes(mic_x,mic_y,k_lst,p_lst,'minkowski')
end = time.perf_counter()

time.strftime('%H:%M:%S', time.gmtime(end-start))

Metric: minkowski

k is now: 1
Best inner model when p = 1
Spare tries: 8

k is now: 2
Best inner model when p = 1
Spare tries: 8
This iteration had an improvement

k is now: 3
Best inner model when p = 2
Spare tries: 8
This iteration had an improvement

k is now: 4
Best inner model when p = 1
Spare tries: 8
This iteration had an improvement

k is now: 5
Best inner model when p = 8
Spare tries: 8
This iteration didn't have an improvement

k is now: 6
Best inner model when p = 10
Spare tries: 7
This iteration didn't have an improvement

k is now: 7
Best inner model when p = 4
Spare tries: 6
This iteration had an improvement

k is now: 8
Best inner model when p = 7
Spare tries: 8
This iteration had an improvement

k is now: 9
Best inner model when p = 5
Spare tries: 8
This iteration had an improvement

k is now: 10
Best inner model when p = 5
Spare tries: 8
This iteration had an improvement

k is now: 11
Best inner model when p = 2
Spare tries: 8
This iteration didn't have an improvement

'00:48:01'

# Best KNN “minkowski”

In [26]:
# Report the best of the three KNN (minkowski) feature sets and write it to disk
save_best_model(
    df_lst=[df_1,df_2,df_3],
    score_lst=[score_normal,score_f_classif,score_mic],
    switch=df_types,
    true_prediction=y,
    big_df=mixed_df,
    new_df_name="best_classifier_knn_minkowski.csv",
)

normal is the best
score: 89.87858448747173%

FEATURES

1. m_name
2. n_hetero_atoms
3. n_aliphatic_heterocycles
4. n_O
5. n_HBD
6. n_non_strict_rotable_bonds
7. n_atoms_stereo_centers
8. n_hetero_cycles
9. n_amide_bonds
10. n_briged_head_atoms
11. n_aromatic_carbocycles
12. fraction_CSP3
13. is_cns_molecule


The best KNN model above uses k=10 and p=9.