# Basic imports

In [1]:
import pandas as pd
import os
import time
import math
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
current_path = os.getcwd()
root_path=current_path.replace('\\forward_feature_selection','')

# Loading dataframe

In [3]:
mixed_df=pd.read_csv(root_path+"\\molecules.csv",sep="\t")

# Functions

In [4]:
def save_df_to_disk(df,name:str,separator="\t"):
    df.to_csv(name,sep=separator,index = False, header=True)

In [5]:
def get_data_and_true_prediction(df,not_wanted_features:list):
        temp_df=df.drop(not_wanted_features,axis=1)
        y=temp_df[temp_df.columns[-1]]
        x=temp_df.drop([temp_df.columns[-1]],axis=1)
        
        return x,y   

# Forward selection

In [7]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

In [8]:
def get_roc_auc_score(x,y,model): # gets roc auc average
        cv_results = cross_validate(model, x, y, cv=10,scoring=('roc_auc'))
        roc_auc_avrg=cv_results['test_score'].mean()
        
        return roc_auc_avrg 

In [9]:
def get_best_score_index(score_lst:list):
    max_score = max(score_lst) # best score
    max_score_index=[i for i, j in enumerate(score_lst) if j == max_score] # indx with best score 
        
    return max_score_index[0]

In [10]:
def show_results(df,roc,f_var,s_var):
    print("\n------------Results-------------\n")
    print("Best model whene first param is {} and second param is {}".format(f_var,s_var))
    print("Roc auc = {}".format(roc))
    print("\nFeatures\n")
    for cnt,col in enumerate(df.columns):
        print("{}. {}".format(cnt+1,col))    

In [11]:
def forward_selection(x,y,model): # O(n) worst case scenario, where n depends on len(x.columns)
    
    def first_iteration(x,y,model):
        score_lst = []
        for i in range(len(x.columns)):
            k = x.columns[i]
            temp_x = x[[k]]
            score = get_roc_auc_score(temp_x,y,model)
            score_lst.append(score)
    
        max_score = max(score_lst) # best score
        max_score_index = [i for i, j in enumerate(score_lst) if j == max_score] # indx with best score
        top_score_index = max_score_index[0]
        new_feature = x.columns[top_score_index] 
        best_x = x[new_feature]
        new_x = x.drop(new_feature,axis=1)
        
        return best_x,new_x,max_score
    
    def else_iteration(best_x,x,y,model,actual_score):     
        new_x_lenght = len(x.columns)
        if (new_x_lenght > 0):
            score_lst = []
            for i in range(new_x_lenght):
                k = x.columns[i]
                temp_x = x[[k]]
                temp_new_x = pd.concat([best_x,temp_x],axis=1, ignore_index=True)
                score = get_roc_auc_score(temp_new_x,y,model)
                score_lst.append(score)

            max_score = max(score_lst) # best score

            if(max_score<actual_score):
                return best_x,actual_score # break condition, recursive function

            max_score_index = [i for i, j in enumerate(score_lst) if j == max_score] # indx with best score
            top_score_index = max_score_index[0]

            new_feature = x.columns[top_score_index]
            temp_x = x[new_feature]
            best_x = pd.concat([best_x,temp_x],axis=1)
            new_x = x.drop(new_feature,axis=1)

            return else_iteration(best_x,new_x,y,model,max_score)
        
        return best_x,actual_score
    
    f_best_x,new_x,f_score = first_iteration(x,y,model)
    best_x,best_score = else_iteration(f_best_x,new_x,y,model,f_score)
    
    return best_x,best_score

In [12]:
def inner_forward_partition_SVC(x,y,variable_prm_lst:list,static_prm): # (training data, prediction, variable list, static var)

    def else_iteration(x,y,variable_prm_lst:list,first_score,last_score,static_prm):       
        print("\ninner variable param list -> {}".format(variable_prm_lst))  
                           
        if (len(variable_prm_lst) == 2):
            print("\n---------inner param last iteration---------\n")
            first = variable_prm_lst[0]
            print("first is {}".format(first))
            last = variable_prm_lst[-1]
            print("last is {}".format(last))
            
            clf_first = SVC(C=static_prm,kernel='rbf',gamma=first,random_state=0)
            first_df,first_score = forward_selection(x,y,clf_first)
            clf_last = SVC(C=static_prm,kernel='rbf',gamma=last,random_state=0)
            last_df,last_score = forward_selection(x,y,clf_last)

            if (last_score > first_score):
                print("last score is greater than first score")
                show_results(last_df,last_score,static_prm,last)
                return last_df,last_score,last

            print("first score is greater than last score")
            show_results(first_df,first_score,static_prm,first)
            return first_df,first_score,first


        else:          
            first = variable_prm_lst[0]
            print("first is {}".format(first))
            middle = variable_prm_lst[(len(variable_prm_lst)//2)]
            print("middle is {}".format(middle))
            last = variable_prm_lst[-1]
            print("last is {}".format(last))
            
            clf_middle = SVC(C=static_prm,kernel='rbf',gamma=middle,random_state=0)
            _,new_middle_score = forward_selection(x,y,clf_middle)

            if ((first_score >= new_middle_score) and (last_score <= new_middle_score)):           
                print("sub list taken: left")
                print("\nout prms from this iteration => variable_prm_lst: {} | left score: {} | right score: {}".format(variable_prm_lst[:(len(variable_prm_lst)//2)+1],first_score,new_middle_score))
                return else_iteration(x,y,variable_prm_lst[:(len(variable_prm_lst)//2)+1],first_score,new_middle_score,static_prm)

            print("sub list taken: right")
            print("\nout prms from this iteration => variable_prm_lst: {} | left score: {} | right score: {}".format(variable_prm_lst[(len(variable_prm_lst)//2):],new_middle_score,last_score))
            return else_iteration(x,y,variable_prm_lst[(len(variable_prm_lst)//2):],new_middle_score,last_score,static_prm)
        
    def first_iteration(x,y,variable_prm_lst:list,static_prm):
        print("\ninner variable param list -> {}".format(variable_prm_lst))  
                 
        first = variable_prm_lst[0]
        print("first is {}".format(first))
        middle = variable_prm_lst[(len(variable_prm_lst)//2)]
        print("middle is {}".format(middle))
        last = variable_prm_lst[-1]
        print("last is {}".format(last))
        clf_first = SVC(C=static_prm,kernel='rbf',gamma=first,random_state=0)
        _,first_score = forward_selection(x,y,clf_first)       
        clf_middle = SVC(C=static_prm,kernel='rbf',gamma=middle,random_state=0)
        _,middle_score = forward_selection(x,y,clf_middle)
        clf_last = SVC(C=static_prm,kernel='rbf',gamma=last,random_state=0)
        _,last_score = forward_selection(x,y,clf_last)

        if ((first_score >= middle_score) and (last_score <= middle_score)):
            print("sub list taken: left")
            return variable_prm_lst[:(len(variable_prm_lst)//2)+1],first_score,middle_score

        print("sub list taken: right")
        return variable_prm_lst[(len(variable_prm_lst)//2):],middle_score,last_score
        
    if(len(variable_prm_lst) <= 1):
        print("Error, list len must be greater than 2")
        return None,0
    elif(len(variable_prm_lst) == 2):
        print("\nfirst param is {}".format(static_prm))
        best_df,best_score,best_variable_prm = else_iteration(x,y,variable_prm_lst,0,0,static_prm)
        return best_df,best_score,best_variable_prm                          
    else:
        print("\nouter param is {}".format(static_prm))
        new_variable_prm_lst,first_score,last_score = first_iteration(x,y,variable_prm_lst,static_prm)
        print("\nout prms from inner first iteration => variable_prm_lst: {} | left score: {} | right score: {}".format(new_variable_prm_lst,first_score,last_score))
        best_df,best_score,best_variable_prm = else_iteration(x,y,new_variable_prm_lst,first_score,last_score,static_prm)
        return best_df,best_score,best_variable_prm                          

In [13]:
def forward_partition_SVC(x,y,K:list,P:list): # (training data, prediction, outer variable list, inner variable list)
    
    def first_iteration(x,y,K:list,P:list):
        print("outer param list -> {}".format(K))  
                 
        first = K[0]
        print("first is {}".format(first))
        middle = K[(len(K)//2)]
        print("middle is {}".format(middle))
        last = K[-1]
        print("last is {}".format(last))
        print("\n**************************")
        _,first_score,first_best_p = inner_forward_partition_SVC(x,y,P,first)
        print("\n**************************")
        _,middle_score,middle_best_p = inner_forward_partition_SVC(x,y,P,middle)
        print("\n**************************")
        _,last_score,last_best_p = inner_forward_partition_SVC(x,y,P,last) 
        
        if ((first_score >= middle_score) and (last_score <= middle_score)):
            print("sub list taken: left")
            return K[:(len(K)//2)+1],first_score,middle_score

        print("sub list taken: right")
        return K[(len(K)//2):],middle_score,last_score
    
    def else_iteration(x,y,K:list,P:list,first_score,last_score):
        if(len(K) == 2):
            print("\n---------outer param last iteration---------\n")
            first = K[0]
            print("first is {}".format(first))
            last = K[-1]
            print("last is {}".format(last))
            print("\n**************************")
            first_df,first_score,first_best_p = inner_forward_partition_SVC(x,y,P,first)
            print("\n**************************")
            last_df,last_score,last_best_p = inner_forward_partition_SVC(x,y,P,last) 
            print("\n**************************")
            if (last_score > first_score):
                print("last score is greater than first score")
                return last_df,last_score,last,last_best_p

            print("first score is greater than last score")
            return first_df,first_score,first,first_best_p            
        else:
            print("first param list -> {}".format(K))  

            first = K[0]
            print("first is {}".format(first))
            middle = K[(len(K)//2)]
            print("middle is {}".format(middle))
            last = K[-1]
            print("last is {}".format(last))
            print("\n**************************")
            _,middle_score,middle_best_p = inner_forward_partition_SVC(x,y,P,middle) 
            print("\n**************************")
            if ((first_score >= middle_score) and (last_score <= middle_score)):
                print("sub list taken: left")
                return else_iteration(x,y,K[:(len(K)//2)+1],P,first_score,middle_score)

        print("sub list taken: right")
        return else_iteration(x,y,K[(len(K)//2):],P,middle_score,last_score)
    
    if(len(K) > 0 and len(K) <= 2):
        best_df,best_score,best_first_var,best_second_var = else_iteration(x,y,K,P,0,0)
        show_results(best_df,best_score,best_first_var,best_second_var)
        return best_df,best_score,best_first_var,best_second_var                          
    else:
        new_lst,first_score,last_score = first_iteration(x,y,K,P)
        print("\nout prms from outer first iteration => lst: {} | left score: {} | right score: {}".format(new_lst,first_score,last_score))
        best_df,best_score,best_first_var,best_second_var = else_iteration(x,y,new_lst,P,first_score,last_score)
        show_results(best_df,best_score,best_first_var,best_second_var)
        return best_df,best_score,best_first_var,best_second_var  

In [14]:
def guarantee_results(x,y,K:list,P:list):
    best_outer_prm_lst = []
    for i in K:
        print("First param is {}\n".format(i))
        best_inner_prm_lst = []
        for j in P:
            print("Second param is {}".format(j))
            clf = SVC(random_state=0, C=i,gamma=j) # linear model
            best_df,best_score = forward_selection(x,y,clf)
            print("ROC AUC = {}".format(best_score))
            print("\nFeatures\n")
            for c,d in enumerate(best_df.columns):
                print("{}. {}".format(c+1,d))       
            print("")
            best_inner_prm_lst.append([best_df,i,j,best_score])
            
        score_lst = [d  for [a,b,c,d] in best_inner_prm_lst]
        index = get_best_score_index(score_lst)
        best_outer_prm_lst.append([best_inner_prm_lst[index][0],best_inner_prm_lst[index][1],best_inner_prm_lst[index][2],best_inner_prm_lst[index][3]])
        
    score_lst = [d  for [a,b,c,d] in best_outer_prm_lst]
    index = get_best_score_index(score_lst)
    print("Final results")
    print("Best model when first param = {} ,second param = {} ,ROC AUC = {}".format(best_outer_prm_lst[index][1],best_outer_prm_lst[index][2],best_outer_prm_lst[index][3]))
    return best_outer_prm_lst[index][0]

# Data

In [15]:
unnecesary_features=["m_name"]
x,y = get_data_and_true_prediction(mixed_df,unnecesary_features)

In [16]:
C = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
Gamma = ['scale','auto',0.000001,0.00001,0.0001,0.001,0.01,0.1,1,5,10,15,20]

In [17]:
start = time.time()

best_df = forward_partition_SVC(x,y,C,Gamma)
end = time.time()

time.strftime('%H:%M:%S', time.gmtime(end-start))

outer param list -> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
first is 1
middle is 16
last is 30

**************************

outer param is 1

inner variable param list -> ['scale', 'auto', 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 15, 20]
first is scale
middle is 0.01
last is 20
sub list taken: left

out prms from inner first iteration => variable_prm_lst: ['scale', 'auto', 1e-06, 1e-05, 0.0001, 0.001, 0.01] | left score: 0.8815535807194885 | right score: 0.8742225566952415

inner variable param list -> ['scale', 'auto', 1e-06, 1e-05, 0.0001, 0.001, 0.01]
first is scale
middle is 1e-05
last is 0.01
sub list taken: right

out prms from this iteration => variable_prm_lst: [1e-05, 0.0001, 0.001, 0.01] | left score: 0.8672450262623259 | right score: 0.8742225566952415

inner variable param list -> [1e-05, 0.0001, 0.001, 0.01]
first is 1e-05
middle is 0.001
last is 0.01
sub list taken: right

out prms from this 

sub list taken: right

out prms from inner first iteration => variable_prm_lst: [0.01, 0.1, 1, 5, 10, 15, 20] | left score: 0.9051250505838826 | right score: 0.7821103002205044

inner variable param list -> [0.01, 0.1, 1, 5, 10, 15, 20]
first is 0.01
middle is 5
last is 20
sub list taken: left

out prms from this iteration => variable_prm_lst: [0.01, 0.1, 1, 5] | left score: 0.9051250505838826 | right score: 0.8017883209041508

inner variable param list -> [0.01, 0.1, 1, 5]
first is 0.01
middle is 1
last is 5
sub list taken: left

out prms from this iteration => variable_prm_lst: [0.01, 0.1, 1] | left score: 0.9051250505838826 | right score: 0.8214707031572601

inner variable param list -> [0.01, 0.1, 1]
first is 0.01
middle is 0.1
last is 1
sub list taken: left

out prms from this iteration => variable_prm_lst: [0.01, 0.1] | left score: 0.9051250505838826 | right score: 0.8794495492872834

inner variable param list -> [0.01, 0.1]

---------inner param last iteration---------

first is

'01:20:13'

In [42]:
#new_df=get_df_with_name_and_prediction(best_df,y,mixed_df)
#save_df_to_disk(new_df,"best_classifier_SVC.csv")

C = 17
Gamma = 0.01
Tol = 0.001
ROC AUC = 0.9022

C = 29
Gamma = 0.01
Tol = 0.001
ROC AUC = 0.9051