# Basic imports

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
current_path = os.getcwd()
root_path=current_path.replace('\\f_classif','')

# Loading dataframes

In [3]:
e_df=pd.read_csv(root_path+"\e_molecules.csv", sep="\t")
em_df=pd.read_csv("f_classif_em_molecules.csv", sep="\t")
emp_df=pd.read_csv("f_classif_emp_molecules.csv", sep="\t")
best_df=pd.read_csv("f_classif_best.csv", sep="\t")

# Model imports

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Linear model

 When dual=False the underlying implementation of LinearSVC is not random and random_state has no effect on the results.

In [5]:
clf = LinearSVC(random_state=0, tol=1e-5, dual=False)

In [6]:
def linear_model(model,data,prediction,testing_size=0.2):
    
    def print_model_scores(y_test,predicted):
        matrix=confusion_matrix(y_test,predicted)
        matrix_labels=[["True positive","False positive"],
                       ["False negative","True negative"]]

        for i in range(2):
            for j in range(2):
                print("{} {}".format(matrix_labels[i][j],matrix[i][j]))

        print("\nf1 score: {}%".format(f1_score(y_test,predicted)*100)) 
        print("accuracy score: {}%".format(accuracy_score(y_test,predicted)*100)) 
    
    x_train, x_test, y_train, y_test=train_test_split(data,prediction,random_state=0,test_size=testing_size)    
    clf.fit(x_train,y_train)
    predicted=clf.predict(x_test)
    print_model_scores(y_test,predicted)   

## Experimental features

In [7]:
temp_e_df=e_df.drop(["m_name"],axis=1)
e_y=temp_e_df["is_cns_molecule"]
e_x=temp_e_df.drop(["is_cns_molecule"],axis=1)

In [8]:
linear_model(clf,e_x,e_y)

True positive 100
False positive 13
False negative 42
True negative 33

f1 score: 54.54545454545454%
accuracy score: 70.74468085106383%


## Experimental + must have features

In [9]:
temp_em_df=em_df.drop(["m_name"],axis=1)
em_y=temp_em_df["is_cns_molecule"]
em_x=temp_em_df.drop(["is_cns_molecule"],axis=1)

In [10]:
linear_model(clf,em_x,em_y)

True positive 101
False positive 12
False negative 30
True negative 45

f1 score: 68.18181818181819%
accuracy score: 77.6595744680851%


##  Experimental + must have features + possible features

In [11]:
temp_emp_df=emp_df.drop(["m_name"],axis=1)
emp_y=temp_emp_df["is_cns_molecule"]
emp_x=temp_emp_df.drop(["is_cns_molecule"],axis=1)

In [12]:
linear_model(clf,emp_x,emp_y)

True positive 99
False positive 14
False negative 29
True negative 46

f1 score: 68.14814814814815%
accuracy score: 77.12765957446808%


# Best

In [13]:
temp_best_df=best_df.drop(["m_name"],axis=1)
best_y=temp_best_df["is_cns_molecule"]
best_x=temp_best_df.drop(["is_cns_molecule"],axis=1)

In [14]:
linear_model(clf,best_x,best_y)

True positive 106
False positive 7
False negative 28
True negative 47

f1 score: 72.86821705426357%
accuracy score: 81.38297872340425%


# CV linear model

In [15]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [16]:
def print_results(cv):
    keys=list(sorted(cv.keys()))
    for i in keys:
        print("Average: {}".format(round(cv[i].mean(),4)))
        print("{} -> {}".format(i,cv[i]),end="\n\n")

In [17]:
clf = LinearSVC(random_state=0, tol=1e-5, dual=False)

## Experimental features

In [18]:
cv_results_e = cross_validate(clf, e_x, e_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_e)

Average: 0.0032
fit_time -> [0.00402975 0.00299239 0.00299478 0.00394988 0.00297546 0.00299191
 0.00299239 0.00299239 0.00299215 0.00299168]

Average: 0.0074
score_time -> [0.00897145 0.00797915 0.00698185 0.00700617 0.00698185 0.00797796
 0.00698233 0.00698256 0.00695634 0.00698018]

Average: 0.7127
test_accuracy -> [0.70526316 0.72631579 0.72631579 0.68421053 0.74468085 0.75531915
 0.64516129 0.70967742 0.72043011 0.70967742]

Average: 0.6259
test_average_precision -> [0.60377854 0.69527613 0.55523182 0.62929819 0.65572762 0.69821848
 0.560444   0.67506146 0.63457986 0.55150016]

Average: 0.5299
test_f1 -> [0.5        0.53571429 0.58064516 0.46428571 0.52       0.59649123
 0.42105263 0.55737705 0.55172414 0.57142857]

Average: 0.3621
test_jaccard -> [0.33333333 0.36585366 0.40909091 0.30232558 0.35135135 0.425
 0.26666667 0.38636364 0.38095238 0.4       ]

Average: 0.4845
test_recall -> [0.4375     0.46875    0.5625     0.40625    0.40625    0.53125
 0.38709677 0.5483871  0.51612903 

##  Experimental + must have features

In [19]:
cv_results_em = cross_validate(clf, em_x, em_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_em)

Average: 0.0044
fit_time -> [0.00700498 0.00502753 0.0039916  0.00397849 0.00400257 0.00397682
 0.00398898 0.00398946 0.00398946 0.00398993]

Average: 0.0073
score_time -> [0.00795436 0.00797319 0.0069797  0.0079782  0.00698113 0.00698137
 0.00698113 0.00698113 0.00698113 0.00698137]

Average: 0.7617
test_accuracy -> [0.76842105 0.72631579 0.77894737 0.75789474 0.74468085 0.79787234
 0.7311828  0.76344086 0.79569892 0.75268817]

Average: 0.6924
test_average_precision -> [0.70780134 0.75452623 0.59872542 0.61764345 0.68471678 0.74175445
 0.68665269 0.70384293 0.74563918 0.68249822]

Average: 0.6154
test_f1 -> [0.63333333 0.53571429 0.69565217 0.62295082 0.55555556 0.6984127
 0.52830189 0.60714286 0.66666667 0.61016949]

Average: 0.447
test_jaccard -> [0.46341463 0.36585366 0.53333333 0.45238095 0.38461538 0.53658537
 0.35897436 0.43589744 0.5        0.43902439]

Average: 0.5756
test_recall -> [0.59375    0.46875    0.75       0.59375    0.46875    0.6875
 0.4516129  0.5483871  0.6129032

## Experimental + must have features + possible features

In [20]:
cv_results_emp = cross_validate(clf, emp_x, emp_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_emp)

Average: 0.0067
fit_time -> [0.01000667 0.00798631 0.00594735 0.00598526 0.00598431 0.00598502
 0.00598192 0.00598335 0.00698113 0.00598097]

Average: 0.0075
score_time -> [0.00897455 0.00797486 0.00801969 0.0069685  0.00698113 0.00797868
 0.00698161 0.00698137 0.00698161 0.00698066]

Average: 0.7767
test_accuracy -> [0.73684211 0.77894737 0.8        0.74736842 0.73404255 0.82978723
 0.75268817 0.79569892 0.80645161 0.78494624]

Average: 0.715
test_average_precision -> [0.71722328 0.76990851 0.62842304 0.60069288 0.70756533 0.79553289
 0.6836331  0.74325698 0.81444399 0.68931132]

Average: 0.644
test_f1 -> [0.56140351 0.63157895 0.71641791 0.64705882 0.54545455 0.73333333
 0.58181818 0.6779661  0.67857143 0.66666667]

Average: 0.4779
test_jaccard -> [0.3902439  0.46153846 0.55813953 0.47826087 0.375      0.57894737
 0.41025641 0.51282051 0.51351351 0.5       ]

Average: 0.6076
test_recall -> [0.5        0.5625     0.75       0.6875     0.46875    0.6875
 0.51612903 0.64516129 0.6129032

# Best

In [21]:
cv_results_best = cross_validate(clf, best_x, best_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_best)

Average: 0.0119
fit_time -> [0.01299214 0.01196814 0.01196671 0.01200771 0.01296496 0.01396465
 0.01097322 0.01095772 0.009974   0.01097107]

Average: 0.0072
score_time -> [0.00797939 0.00797963 0.00894976 0.00897694 0.00697875 0.00696731
 0.00598383 0.00598383 0.00598812 0.00598502]

Average: 0.8234
test_accuracy -> [0.78947368 0.85263158 0.83157895 0.84210526 0.80851064 0.82978723
 0.76344086 0.84946237 0.86021505 0.80645161]

Average: 0.7902
test_average_precision -> [0.80737829 0.82520484 0.71593138 0.71134844 0.79673592 0.72099678
 0.76280949 0.88538235 0.85271397 0.82354427]

Average: 0.7207
test_f1 -> [0.64285714 0.75862069 0.75757576 0.78873239 0.68965517 0.75
 0.60714286 0.74074074 0.77192982 0.7       ]

Average: 0.5663
test_jaccard -> [0.47368421 0.61111111 0.6097561  0.65116279 0.52631579 0.6
 0.43589744 0.58823529 0.62857143 0.53846154]

Average: 0.6862
test_recall -> [0.5625     0.6875     0.78125    0.875      0.625      0.75
 0.5483871  0.64516129 0.70967742 0.67741935]