# Basic imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

# Loading dataframes

In [2]:
e_df=pd.read_csv("e_molecules.csv", sep="\t")
em_df=pd.read_csv("em_molecules.csv", sep="\t")
emp_df=pd.read_csv("emp_molecules.csv", sep="\t")
best_k10_df=pd.read_csv("best_k10.csv", sep="\t")
best_k15_df=pd.read_csv("best_k15.csv", sep="\t")
best_k20_df=pd.read_csv("best_k20.csv", sep="\t")
best_k25_df=pd.read_csv("best_k25.csv", sep="\t")

# Model imports

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Linear model

 When dual=False the underlying implementation of LinearSVC is not random and random_state has no effect on the results.

In [4]:
clf = LinearSVC(random_state=0, tol=1e-5, dual=False)

In [5]:
def linear_model(model,data,prediction,testing_size=0.2):
    
    def print_model_scores(y_test,predicted):
        matrix=confusion_matrix(y_test,predicted)
        matrix_labels=[["True positive","False positive"],
                       ["False negative","True negative"]]

        for i in range(2):
            for j in range(2):
                print("{} {}".format(matrix_labels[i][j],matrix[i][j]))

        print("\nf1 score: {}%".format(f1_score(y_test,predicted)*100)) 
        print("accuracy score: {}%".format(accuracy_score(y_test,predicted)*100)) 
    
    x_train, x_test, y_train, y_test=train_test_split(data,prediction,random_state=0,test_size=testing_size)    
    clf.fit(x_train,y_train)
    predicted=clf.predict(x_test)
    print_model_scores(y_test,predicted)   

## Experimental features

In [6]:
temp_e_df=e_df.drop(["m_name"],axis=1)
e_y=temp_e_df["is_cns_molecule"]
e_x=temp_e_df.drop(["is_cns_molecule"],axis=1)

In [7]:
linear_model(clf,e_x,e_y)

True positive 100
False positive 13
False negative 42
True negative 33

f1 score: 54.54545454545454%
accuracy score: 70.74468085106383%


## Experimental + must have features

In [8]:
temp_em_df=em_df.drop(["m_name"],axis=1)
em_y=temp_em_df["is_cns_molecule"]
em_x=temp_em_df.drop(["is_cns_molecule"],axis=1)

In [9]:
linear_model(clf,em_x,em_y)

True positive 101
False positive 12
False negative 30
True negative 45

f1 score: 68.18181818181819%
accuracy score: 77.6595744680851%


##  Experimental + must have features + possible features

In [10]:
temp_emp_df=emp_df.drop(["m_name"],axis=1)
emp_y=temp_emp_df["is_cns_molecule"]
emp_x=temp_emp_df.drop(["is_cns_molecule"],axis=1)

In [11]:
linear_model(clf,emp_x,emp_y)

True positive 99
False positive 14
False negative 29
True negative 46

f1 score: 68.14814814814815%
accuracy score: 77.12765957446808%


## k10 features

In [12]:
temp_k10_df=best_k10_df.drop(["m_name"],axis=1)
k10_y=temp_k10_df["is_cns_molecule"]
k10_x=temp_k10_df.drop(["is_cns_molecule"],axis=1)

In [13]:
linear_model(clf,k10_x,k10_y)

True positive 101
False positive 12
False negative 40
True negative 35

f1 score: 57.377049180327866%
accuracy score: 72.3404255319149%


## k15 features

In [14]:
temp_k15_df=best_k15_df.drop(["m_name"],axis=1)
k15_y=temp_k15_df["is_cns_molecule"]
k15_x=temp_k15_df.drop(["is_cns_molecule"],axis=1)

In [15]:
linear_model(clf,k15_x,k15_y)

True positive 102
False positive 11
False negative 36
True negative 39

f1 score: 62.4%
accuracy score: 75.0%


## k20 features

In [16]:
temp_k20_df=best_k20_df.drop(["m_name"],axis=1)
k20_y=temp_k20_df["is_cns_molecule"]
k20_x=temp_k20_df.drop(["is_cns_molecule"],axis=1)

In [17]:
linear_model(clf,k20_x,k20_y)

True positive 105
False positive 8
False negative 30
True negative 45

f1 score: 70.31249999999999%
accuracy score: 79.7872340425532%


## k25 features

In [18]:
temp_k25_df=best_k25_df.drop(["m_name"],axis=1)
k25_y=temp_k25_df["is_cns_molecule"]
k25_x=temp_k25_df.drop(["is_cns_molecule"],axis=1)

In [19]:
linear_model(clf,k25_x,k25_y)

True positive 104
False positive 9
False negative 29
True negative 46

f1 score: 70.76923076923075%
accuracy score: 79.7872340425532%


# CV linear model

In [20]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [21]:
def print_results(cv):
    keys=list(sorted(cv.keys()))
    for i in keys:
        print("Average: {}".format(round(cv[i].mean(),4)))
        print("{} -> {}".format(i,cv[i]),end="\n\n")

In [22]:
clf = LinearSVC(random_state=0, tol=1e-5, dual=False)

## Experimental features

In [23]:
cv_results_e = cross_validate(clf, e_x, e_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_e)

Average: 0.0043
fit_time -> [0.00598836 0.00498629 0.0039866  0.00399041 0.00397372 0.00295281
 0.00399065 0.00498581 0.0039897  0.00398827]

Average: 0.0107
score_time -> [0.01196384 0.01197028 0.01097083 0.00897717 0.00897694 0.01096869
 0.00997329 0.00997305 0.01196885 0.01097059]

Average: 0.7127
test_accuracy -> [0.70526316 0.72631579 0.72631579 0.68421053 0.74468085 0.75531915
 0.64516129 0.70967742 0.72043011 0.70967742]

Average: 0.6259
test_average_precision -> [0.60377854 0.69527613 0.55523182 0.62929819 0.65572762 0.69821848
 0.560444   0.67506146 0.63457986 0.55150016]

Average: 0.5299
test_f1 -> [0.5        0.53571429 0.58064516 0.46428571 0.52       0.59649123
 0.42105263 0.55737705 0.55172414 0.57142857]

Average: 0.3621
test_jaccard -> [0.33333333 0.36585366 0.40909091 0.30232558 0.35135135 0.425
 0.26666667 0.38636364 0.38095238 0.4       ]

Average: 0.4845
test_recall -> [0.4375     0.46875    0.5625     0.40625    0.40625    0.53125
 0.38709677 0.5483871  0.51612903 

##  Experimental + must have features

In [24]:
cv_results_em = cross_validate(clf, em_x, em_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_em)

Average: 0.0049
fit_time -> [0.00799012 0.0039885  0.004987   0.00501704 0.004951   0.00497699
 0.0039897  0.00399041 0.00402045 0.00495577]

Average: 0.0089
score_time -> [0.0109973  0.00898004 0.00893354 0.00898719 0.00800896 0.00898695
 0.00997734 0.00894189 0.00795031 0.00702047]

Average: 0.7617
test_accuracy -> [0.76842105 0.72631579 0.77894737 0.75789474 0.74468085 0.79787234
 0.7311828  0.76344086 0.79569892 0.75268817]

Average: 0.6924
test_average_precision -> [0.70780134 0.75452623 0.59872542 0.61764345 0.68471678 0.74175445
 0.68665269 0.70384293 0.74563918 0.68249822]

Average: 0.6154
test_f1 -> [0.63333333 0.53571429 0.69565217 0.62295082 0.55555556 0.6984127
 0.52830189 0.60714286 0.66666667 0.61016949]

Average: 0.447
test_jaccard -> [0.46341463 0.36585366 0.53333333 0.45238095 0.38461538 0.53658537
 0.35897436 0.43589744 0.5        0.43902439]

Average: 0.5756
test_recall -> [0.59375    0.46875    0.75       0.59375    0.46875    0.6875
 0.4516129  0.5483871  0.6129032

## Experimental + must have features + possible features

In [25]:
cv_results_emp = cross_validate(clf, emp_x, emp_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_emp)

Average: 0.007
fit_time -> [0.01000476 0.00797796 0.00698423 0.00596809 0.00698113 0.00695562
 0.00598335 0.00598359 0.00698113 0.00598359]

Average: 0.0077
score_time -> [0.00898123 0.00698304 0.00798011 0.0079782  0.00698113 0.00698161
 0.00798059 0.00797892 0.00797772 0.00698185]

Average: 0.7767
test_accuracy -> [0.73684211 0.77894737 0.8        0.74736842 0.73404255 0.82978723
 0.75268817 0.79569892 0.80645161 0.78494624]

Average: 0.715
test_average_precision -> [0.71722328 0.76990851 0.62842304 0.60069288 0.70756533 0.79553289
 0.6836331  0.74325698 0.81444399 0.68931132]

Average: 0.644
test_f1 -> [0.56140351 0.63157895 0.71641791 0.64705882 0.54545455 0.73333333
 0.58181818 0.6779661  0.67857143 0.66666667]

Average: 0.4779
test_jaccard -> [0.3902439  0.46153846 0.55813953 0.47826087 0.375      0.57894737
 0.41025641 0.51282051 0.51351351 0.5       ]

Average: 0.6076
test_recall -> [0.5        0.5625     0.75       0.6875     0.46875    0.6875
 0.51612903 0.64516129 0.61290323

## k10 features

In [26]:
cv_results_k10 = cross_validate(clf, k10_x, k10_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_k10)

Average: 0.0044
fit_time -> [0.00598478 0.00299144 0.00498843 0.00395679 0.0039885  0.00399041
 0.00398993 0.00399375 0.00502062 0.00501823]

Average: 0.008
score_time -> [0.00900173 0.00796294 0.00799966 0.00800395 0.00698209 0.00699329
 0.00696802 0.00994515 0.00794458 0.00797987]

Average: 0.7309
test_accuracy -> [0.70526316 0.70526316 0.74736842 0.73684211 0.75531915 0.75531915
 0.7311828  0.72043011 0.74193548 0.70967742]

Average: 0.6412
test_average_precision -> [0.55818172 0.71480925 0.58186288 0.60618361 0.68220642 0.73140935
 0.60708669 0.67354885 0.68975452 0.56685164]

Average: 0.5661
test_f1 -> [0.51724138 0.5        0.61290323 0.57627119 0.54901961 0.58181818
 0.54545455 0.59375    0.6        0.58461538]

Average: 0.3956
test_jaccard -> [0.34883721 0.33333333 0.44186047 0.4047619  0.37837838 0.41025641
 0.375      0.42222222 0.42857143 0.41304348]

Average: 0.5259
test_recall -> [0.46875    0.4375     0.59375    0.53125    0.4375     0.5
 0.48387097 0.61290323 0.58064516 

## k15 features

In [27]:
cv_results_k15 = cross_validate(clf, k15_x, k15_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_k15)

Average: 0.0087
fit_time -> [0.00896978 0.00793958 0.00897598 0.0089457  0.00897574 0.00797939
 0.00797915 0.00793266 0.00997257 0.00897598]

Average: 0.0081
score_time -> [0.00801754 0.00901461 0.00797915 0.00801015 0.00797772 0.00797772
 0.00798488 0.00897717 0.00797868 0.00694871]

Average: 0.7724
test_accuracy -> [0.8        0.74736842 0.75789474 0.77894737 0.75531915 0.78723404
 0.75268817 0.78494624 0.79569892 0.76344086]

Average: 0.6986
test_average_precision -> [0.65365005 0.75341956 0.64649845 0.66426913 0.67217258 0.74917118
 0.68426946 0.69157904 0.77649357 0.69430372]

Average: 0.6355
test_f1 -> [0.65454545 0.53846154 0.64615385 0.6557377  0.59649123 0.6875
 0.61016949 0.64285714 0.6779661  0.64516129]

Average: 0.4671
test_jaccard -> [0.48648649 0.36842105 0.47727273 0.48780488 0.425      0.52380952
 0.43902439 0.47368421 0.51282051 0.47619048]

Average: 0.5952
test_recall -> [0.5625     0.4375     0.65625    0.625      0.53125    0.6875
 0.58064516 0.58064516 0.64516129 

## k20 features

In [28]:
cv_results_k20 = cross_validate(clf, k20_x, k20_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_k20)

Average: 0.0087
fit_time -> [0.01201057 0.00893974 0.00897622 0.00797701 0.00897527 0.00797844
 0.00698042 0.0079782  0.00897551 0.00797868]

Average: 0.0073
score_time -> [0.00797272 0.00797987 0.00797844 0.00698376 0.00797868 0.00698042
 0.00698209 0.00698185 0.00698161 0.00598407]

Average: 0.8106
test_accuracy -> [0.76842105 0.85263158 0.78947368 0.83157895 0.82978723 0.79787234
 0.76344086 0.84946237 0.83870968 0.78494624]

Average: 0.774
test_average_precision -> [0.7816315  0.81619779 0.68153881 0.76559053 0.80196239 0.71553541
 0.75039022 0.84187963 0.82289736 0.76195225]

Average: 0.7016
test_f1 -> [0.60714286 0.75       0.6969697  0.76470588 0.71428571 0.6984127
 0.62068966 0.75       0.73684211 0.67741935]

Average: 0.5427
test_jaccard -> [0.43589744 0.6        0.53488372 0.61904762 0.55555556 0.53658537
 0.45       0.6        0.58333333 0.51219512]

Average: 0.6644
test_recall -> [0.53125    0.65625    0.71875    0.8125     0.625      0.6875
 0.58064516 0.67741935 0.6774193

## k25 features

In [29]:
cv_results_k25 = cross_validate(clf, k25_x, k25_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_k25)

Average: 0.0127
fit_time -> [0.01396132 0.01096892 0.01397181 0.01296473 0.0119679  0.01296496
 0.01296353 0.01196551 0.01196766 0.01296473]

Average: 0.0085
score_time -> [0.00801444 0.00698066 0.00793767 0.00997257 0.00797868 0.00797892
 0.00897908 0.00997329 0.00797892 0.00897813]

Average: 0.8192
test_accuracy -> [0.78947368 0.83157895 0.83157895 0.8        0.79787234 0.81914894
 0.79569892 0.88172043 0.84946237 0.79569892]

Average: 0.7904
test_average_precision -> [0.79760761 0.86590267 0.71693667 0.68819067 0.79921214 0.71234196
 0.77896491 0.88099437 0.85173296 0.81229655]

Average: 0.7175
test_f1 -> [0.66666667 0.72413793 0.75757576 0.72463768 0.66666667 0.73015873
 0.66666667 0.8        0.75       0.68852459]

Average: 0.5612
test_jaccard -> [0.5        0.56756757 0.6097561  0.56818182 0.5        0.575
 0.5        0.66666667 0.6        0.525     ]

Average: 0.6834
test_recall -> [0.625      0.65625    0.78125    0.78125    0.59375    0.71875
 0.61290323 0.70967742 0.67741935 