# Basic imports

In [1]:
import pandas as pd
import numpy as np
import os
import operator
import statistics
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
current_path = os.getcwd()
root_path=current_path.replace('\\mutual_info_classif','')

# Loading dataframe

In [3]:
mixed_df=pd.read_csv(root_path+"\molecules.csv",sep="\t"); mixed_df.head()

Unnamed: 0,m_name,n_atoms_without_Hydrogen,n_atoms_with_Hydrogen,m_weight,m_avg_weigth,m_weigth_without_Hydrogen,n_radical_electrons,n_valence_electrons,n_aliphatic_carbocycles,n_aliphatic_heterocycles,...,n_Hydrogen_acceptors,n_Hydrogen_donnors,n_briged_head_atoms,n_atoms_stereo_centers,n_atoms_unspecified_stereo_centers,n_spiro_atoms,m_logp,m_mr,fraction_CSP3,is_cns_molecule
0,BUMETANIDE,25,45,364.109293,344.263,364.423,0,134,0,0,...,5,3,0,0,0,0,3.0365,94.6882,0.235294,0
1,BACLOFEN,14,26,213.055656,201.568,213.664,0,76,0,0,...,2,2,0,1,1,0,1.857,55.5002,0.3,1
2,METYRAPONE,17,31,226.110613,212.167,226.279,0,86,0,0,...,3,0,0,0,0,0,2.6371,65.8305,0.214286,0
3,METHYLPHENOBARBITAL,18,32,246.100442,232.154,246.266,0,94,0,1,...,3,1,0,1,1,0,1.0426,64.7197,0.307692,1
4,DULOXETINE,21,40,297.118735,278.271,297.423,0,108,0,0,...,3,1,0,1,1,0,4.6309,90.1797,0.222222,1


# Model and metrics import

In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import *

In [5]:
data_frame=mixed_df.drop(["m_name"],axis=1)
y=data_frame["is_cns_molecule"]
x=data_frame.drop(["is_cns_molecule"],axis=1)

# Kbest from k=1 to k=all-1

In [9]:
def save_df_to_disk(df,name:str,separator="\t"):
    df.to_csv(name,sep=separator,index = False, header=True)

In [10]:
def get_df_with_name_and_prediction(df,true_prediction,big_df):
    new_df=df
    new_df.insert(0,"m_name",big_df["m_name"].values)
    new_df=new_df.join(true_prediction)
    
    return new_df

In [11]:
def get_new_df(features:list,data=x,true_prediction=y,big_df=mixed_df):
    
    def get_indexes(selection=features,df=data): 
        index_list=df.columns.get_indexer(selection)
        return index_list
    
    index_list=get_indexes()
    new_df=data.iloc[:, index_list]
    new_df=get_df_with_name_and_prediction(new_df,true_prediction,big_df)
       
    return new_df

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC

In [13]:
def selector_scores(selection_type,best_k,
                    data,true_prediction):
    
    selector = SelectKBest(selection_type, k=best_k)
    selector.fit(data,true_prediction)
    new_data=selector.transform(data)
    labels=list(data.columns[selector.get_support(indices=True)])
    new_df=pd.DataFrame(new_data,columns=labels)
    
    return new_df

In [14]:
def get_data_from_1_to_k_features(selection_type,best_k_limit,data,true_prediction):        
    data_lst=[]
    for i in range(1,best_k_limit+1):
        new_df=selector_scores(selection_type,i,data,true_prediction)
        data_lst.append(new_df)
        
    return data_lst

In [15]:
def get_best_model_index(data_lst,true_prediction,model):
    
    def get_roc_auc_result(data,true_prediction,model):
        cv_results = cross_validate(clf, data, true_prediction, cv=10,scoring=('roc_auc'))
        roc_auc_avrg=cv_results['test_score'].mean()
        
        return roc_auc_avrg    
    
    model_results_lst=[]
    for i,data in enumerate(data_lst):
        actual_data=data
        roc_auc_avrg=get_roc_auc_result(actual_data,true_prediction,model)
        print("Model #{} roc auc result = {}%".format(i,roc_auc_avrg*100))
        model_results_lst.append(roc_auc_avrg)
    
    max_value = max(model_results_lst)
    index=[i for i, j in enumerate(model_results_lst) if j == max_value]
    print("\nBest model roc auc = {}%".format(max_value*100))
    
    return index

In [16]:
clf = LinearSVC(random_state=0, tol=1e-5, dual=False)

In [17]:
data_lst=get_data_from_1_to_k_features(mutual_info_classif,len(x.columns)-1,x,y)

In [18]:
best_model_index=get_best_model_index(data_lst,y,clf)

Model #0 roc auc result = 74.45237527459821%
Model #1 roc auc result = 76.63790224509852%
Model #2 roc auc result = 78.72098911104504%
Model #3 roc auc result = 78.89676681243084%
Model #4 roc auc result = 78.73392542283997%
Model #5 roc auc result = 79.02345492046975%
Model #6 roc auc result = 78.9282281807971%
Model #7 roc auc result = 78.75975288224899%
Model #8 roc auc result = 78.90412857390616%
Model #9 roc auc result = 81.23214337330494%
Model #10 roc auc result = 80.96691865698759%
Model #11 roc auc result = 80.34333036023982%
Model #12 roc auc result = 80.23723066663364%
Model #13 roc auc result = 86.28382151941597%
Model #14 roc auc result = 85.87416949523478%
Model #15 roc auc result = 82.27080855755412%
Model #16 roc auc result = 86.47536151991147%
Model #17 roc auc result = 86.71891001024065%
Model #18 roc auc result = 84.74467011049998%
Model #19 roc auc result = 86.76667306707631%
Model #20 roc auc result = 87.15210418628082%
Model #21 roc auc result = 86.88891572931637%

In [19]:
best_model_index

[29]

In [20]:
for i in best_model_index:
   display(data_lst[i].head())      

Unnamed: 0,n_atoms_without_Hydrogen,n_atoms_with_Hydrogen,m_weight,m_avg_weigth,m_weigth_without_Hydrogen,n_radical_electrons,n_valence_electrons,n_aliphatic_heterocycles,n_aliphatic_rings,n_amide_bonds,...,n_HOH,n_O,n_Hydrogen_acceptors,n_Hydrogen_donnors,n_briged_head_atoms,n_atoms_stereo_centers,n_atoms_unspecified_stereo_centers,m_logp,m_mr,fraction_CSP3
0,25.0,45.0,364.109293,344.263,364.423,0.0,134.0,0.0,0.0,0.0,...,4.0,7.0,5.0,3.0,0.0,0.0,0.0,3.0365,94.6882,0.235294
1,14.0,26.0,213.055656,201.568,213.664,0.0,76.0,0.0,0.0,0.0,...,3.0,3.0,2.0,2.0,0.0,1.0,1.0,1.857,55.5002,0.3
2,17.0,31.0,226.110613,212.167,226.279,0.0,86.0,0.0,0.0,0.0,...,0.0,3.0,3.0,0.0,0.0,0.0,0.0,2.6371,65.8305,0.214286
3,18.0,32.0,246.100442,232.154,246.266,0.0,94.0,1.0,1.0,4.0,...,1.0,5.0,3.0,1.0,0.0,1.0,1.0,1.0426,64.7197,0.307692
4,21.0,40.0,297.118735,278.271,297.423,0.0,108.0,0.0,0.0,0.0,...,1.0,2.0,3.0,1.0,0.0,1.0,1.0,4.6309,90.1797,0.222222


In [21]:
best_df=get_df_with_name_and_prediction(data_lst[best_model_index[0]],y,mixed_df); best_df.head()

Unnamed: 0,m_name,n_atoms_without_Hydrogen,n_atoms_with_Hydrogen,m_weight,m_avg_weigth,m_weigth_without_Hydrogen,n_radical_electrons,n_valence_electrons,n_aliphatic_heterocycles,n_aliphatic_rings,...,n_O,n_Hydrogen_acceptors,n_Hydrogen_donnors,n_briged_head_atoms,n_atoms_stereo_centers,n_atoms_unspecified_stereo_centers,m_logp,m_mr,fraction_CSP3,is_cns_molecule
0,BUMETANIDE,25.0,45.0,364.109293,344.263,364.423,0.0,134.0,0.0,0.0,...,7.0,5.0,3.0,0.0,0.0,0.0,3.0365,94.6882,0.235294,0
1,BACLOFEN,14.0,26.0,213.055656,201.568,213.664,0.0,76.0,0.0,0.0,...,3.0,2.0,2.0,0.0,1.0,1.0,1.857,55.5002,0.3,1
2,METYRAPONE,17.0,31.0,226.110613,212.167,226.279,0.0,86.0,0.0,0.0,...,3.0,3.0,0.0,0.0,0.0,0.0,2.6371,65.8305,0.214286,0
3,METHYLPHENOBARBITAL,18.0,32.0,246.100442,232.154,246.266,0.0,94.0,1.0,1.0,...,5.0,3.0,1.0,0.0,1.0,1.0,1.0426,64.7197,0.307692,1
4,DULOXETINE,21.0,40.0,297.118735,278.271,297.423,0.0,108.0,0.0,0.0,...,2.0,3.0,1.0,0.0,1.0,1.0,4.6309,90.1797,0.222222,1


In [23]:
#save_df_to_disk(best_df,"mic_best.csv") # 88.5803