# Basic imports

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 
plt.rcParams.update({'figure.max_open_warning': 0})

In [2]:
current_path = os.getcwd()
root_path=current_path.replace('\\f_classif','')

# Loading dataframes

In [3]:
e_df=pd.read_csv(root_path+"\e_molecules.csv", sep="\t")
em_df=pd.read_csv("f_classif_em_molecules.csv", sep="\t")
emp_df=pd.read_csv("f_classif_emp_molecules.csv", sep="\t")
best_df=pd.read_csv("f_classif_best.csv", sep="\t")

# Model imports

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Linear model

 When dual=False the underlying implementation of LinearSVC is not random and random_state has no effect on the results.

In [5]:
clf = LinearSVC(random_state=0, tol=1e-5, dual=False)

In [6]:
def linear_model(model,data,prediction,testing_size=0.2):
    
    def print_model_scores(y_test,predicted):
        matrix=confusion_matrix(y_test,predicted)
        matrix_labels=[["True positive","False positive"],
                       ["False negative","True negative"]]

        for i in range(2):
            for j in range(2):
                print("{} {}".format(matrix_labels[i][j],matrix[i][j]))

        print("\nf1 score: {}%".format(f1_score(y_test,predicted)*100)) 
        print("accuracy score: {}%".format(accuracy_score(y_test,predicted)*100)) 
    
    x_train, x_test, y_train, y_test=train_test_split(data,prediction,random_state=0,test_size=testing_size)    
    clf.fit(x_train,y_train)
    predicted=clf.predict(x_test)
    print_model_scores(y_test,predicted)   

## Experimental features

In [7]:
temp_e_df=e_df.drop(["m_name"],axis=1)
e_y=temp_e_df["is_cns_molecule"]
e_x=temp_e_df.drop(["is_cns_molecule"],axis=1)

In [8]:
linear_model(clf,e_x,e_y)

True positive 101
False positive 12
False negative 30
True negative 45

f1 score: 68.18181818181819%
accuracy score: 77.6595744680851%


## Experimental + must have features

In [9]:
temp_em_df=em_df.drop(["m_name"],axis=1)
em_y=temp_em_df["is_cns_molecule"]
em_x=temp_em_df.drop(["is_cns_molecule"],axis=1)

In [10]:
linear_model(clf,em_x,em_y)

True positive 101
False positive 12
False negative 30
True negative 45

f1 score: 68.18181818181819%
accuracy score: 77.6595744680851%


##  Experimental + must have features + possible features

In [11]:
temp_emp_df=emp_df.drop(["m_name"],axis=1)
emp_y=temp_emp_df["is_cns_molecule"]
emp_x=temp_emp_df.drop(["is_cns_molecule"],axis=1)

In [12]:
linear_model(clf,emp_x,emp_y)

True positive 98
False positive 15
False negative 28
True negative 47

f1 score: 68.61313868613139%
accuracy score: 77.12765957446808%


# Best

In [13]:
temp_best_df=best_df.drop(["m_name"],axis=1)
best_y=temp_best_df["is_cns_molecule"]
best_x=temp_best_df.drop(["is_cns_molecule"],axis=1)

In [14]:
linear_model(clf,best_x,best_y)

True positive 106
False positive 7
False negative 28
True negative 47

f1 score: 72.86821705426357%
accuracy score: 81.38297872340425%


# CV linear model

In [15]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [16]:
def print_results(cv):
    keys=list(sorted(cv.keys()))
    for i in keys:
        print("Average: {}".format(round(cv[i].mean(),4)))
        print("{} -> {}".format(i,cv[i]),end="\n\n")

In [17]:
clf = LinearSVC(random_state=0, tol=1e-5, dual=False)

## Experimental features

In [18]:
cv_results_e = cross_validate(clf, e_x, e_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_e)

Average: 0.0035
fit_time -> [0.00497794 0.00299191 0.00398874 0.00299168 0.00299191 0.00299168
 0.0039897  0.00299168 0.00299191 0.00398898]

Average: 0.0076
score_time -> [0.00897646 0.00797892 0.00698161 0.00797868 0.00797844 0.00698137
 0.00797892 0.00698161 0.00698137 0.00698185]

Average: 0.7606
test_accuracy -> [0.76842105 0.72631579 0.77894737 0.75789474 0.74468085 0.79787234
 0.72043011 0.76344086 0.79569892 0.75268817]

Average: 0.693
test_average_precision -> [0.71120921 0.75452623 0.60094624 0.6205017  0.68512058 0.74060313
 0.697981   0.70164351 0.74325273 0.67446673]

Average: 0.6126
test_f1 -> [0.63333333 0.53571429 0.69565217 0.62295082 0.55555556 0.6984127
 0.5        0.60714286 0.66666667 0.61016949]

Average: 0.4444
test_jaccard -> [0.46341463 0.36585366 0.53333333 0.45238095 0.38461538 0.53658537
 0.33333333 0.43589744 0.5        0.43902439]

Average: 0.5724
test_recall -> [0.59375    0.46875    0.75       0.59375    0.46875    0.6875
 0.41935484 0.5483871  0.6129032

##  Experimental + must have features

In [19]:
cv_results_em = cross_validate(clf, em_x, em_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_em)

Average: 0.0036
fit_time -> [0.00597286 0.00399089 0.00398922 0.00299191 0.0029912  0.00299191
 0.00398803 0.00299168 0.00299168 0.00299191]

Average: 0.0076
score_time -> [0.00797749 0.00797725 0.00800467 0.00795364 0.00698161 0.00897717
 0.00698161 0.00698161 0.00698161 0.00698137]

Average: 0.7617
test_accuracy -> [0.76842105 0.72631579 0.77894737 0.75789474 0.74468085 0.79787234
 0.7311828  0.76344086 0.79569892 0.75268817]

Average: 0.6922
test_average_precision -> [0.70631775 0.75452623 0.59872542 0.61764345 0.68471678 0.74175445
 0.68622402 0.70384293 0.74563918 0.68249822]

Average: 0.6154
test_f1 -> [0.63333333 0.53571429 0.69565217 0.62295082 0.55555556 0.6984127
 0.52830189 0.60714286 0.66666667 0.61016949]

Average: 0.447
test_jaccard -> [0.46341463 0.36585366 0.53333333 0.45238095 0.38461538 0.53658537
 0.35897436 0.43589744 0.5        0.43902439]

Average: 0.5756
test_recall -> [0.59375    0.46875    0.75       0.59375    0.46875    0.6875
 0.4516129  0.5483871  0.6129032

## Experimental + must have features + possible features

In [20]:
cv_results_emp = cross_validate(clf, emp_x, emp_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_emp)

Average: 0.0078
fit_time -> [0.01096988 0.0069809  0.0079782  0.00598383 0.0069809  0.00897527
 0.0079782  0.00698066 0.00700784 0.0079782 ]

Average: 0.0072
score_time -> [0.00698137 0.00698185 0.00698161 0.00797892 0.00797939 0.00698161
 0.00698209 0.00698137 0.00698161 0.0069828 ]

Average: 0.782
test_accuracy -> [0.74736842 0.76842105 0.78947368 0.78947368 0.72340426 0.81914894
 0.76344086 0.82795699 0.80645161 0.78494624]

Average: 0.7135
test_average_precision -> [0.70973517 0.77556357 0.60977542 0.60683099 0.70256433 0.77345534
 0.69101679 0.74918824 0.81991303 0.69715822]

Average: 0.6533
test_f1 -> [0.5862069  0.60714286 0.70588235 0.70588235 0.51851852 0.73846154
 0.59259259 0.73333333 0.67857143 0.66666667]

Average: 0.489
test_jaccard -> [0.41463415 0.43589744 0.54545455 0.54545455 0.35       0.58536585
 0.42105263 0.57894737 0.51351351 0.5       ]

Average: 0.6234
test_recall -> [0.53125    0.53125    0.75       0.75       0.4375     0.75
 0.51612903 0.70967742 0.61290323 

# Best

In [21]:
cv_results_best = cross_validate(clf, best_x, best_y, cv=10,scoring=('accuracy','average_precision',
                                                           'f1','jaccard','recall','roc_auc'))
print_results(cv_results_best)

Average: 0.0106
fit_time -> [0.01495671 0.01097012 0.00997281 0.01098013 0.01097035 0.01001382
 0.00897598 0.00997186 0.00897646 0.00997329]

Average: 0.0064
score_time -> [0.00698161 0.00598454 0.00598454 0.00597453 0.00698161 0.00698018
 0.00598526 0.00598407 0.00598311 0.00698066]

Average: 0.8234
test_accuracy -> [0.78947368 0.85263158 0.83157895 0.84210526 0.80851064 0.82978723
 0.76344086 0.84946237 0.86021505 0.80645161]

Average: 0.7903
test_average_precision -> [0.80737829 0.82520484 0.71593138 0.71134844 0.79673592 0.72086982
 0.76280949 0.88538235 0.85271397 0.82470511]

Average: 0.7207
test_f1 -> [0.64285714 0.75862069 0.75757576 0.78873239 0.68965517 0.75
 0.60714286 0.74074074 0.77192982 0.7       ]

Average: 0.5663
test_jaccard -> [0.47368421 0.61111111 0.6097561  0.65116279 0.52631579 0.6
 0.43589744 0.58823529 0.62857143 0.53846154]

Average: 0.6862
test_recall -> [0.5625     0.6875     0.78125    0.875      0.625      0.75
 0.5483871  0.64516129 0.70967742 0.67741935]