# Basic imports

In [1]:
import pandas as pd

# Reading the csv files

In [2]:
# Both inputs are tab-separated text files despite the .csv extension,
# hence the explicit sep="\t".
molecules_zero_df = pd.read_csv("molecules_v1.csv", sep="\t")
molecules_non_zero_df = pd.read_csv("molecules_v2.csv", sep="\t")

# Model imports

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate

# Splitting the data

In [4]:
# Dataset V1: target column and feature matrix.
# "m_name" is removed along with the target so it is not fed to the model
# (presumably a molecule name/identifier -- confirm against the data files).
y1 = molecules_zero_df["is_cns_molecule"]
x1 = molecules_zero_df.drop(columns=["is_cns_molecule", "m_name"])

# Dataset V2: same target/feature separation as for V1.
y2 = molecules_non_zero_df["is_cns_molecule"]
x2 = molecules_non_zero_df.drop(columns=["is_cns_molecule", "m_name"])

# Model

In [5]:
# Linear support-vector classifier shared by every experiment below.
clf = LinearSVC(
    dual=False,      # solve the primal problem instead of the dual
    tol=1e-5,        # stricter than scikit-learn's documented default of 1e-4
    random_state=0,  # NOTE(review): per the scikit-learn docs this only matters when dual=True
)

# Print results function

In [6]:
def print_results(cv):
    """Print every entry of a cross-validation result mapping.

    Entries are visited in sorted key order. For each one, the mean of its
    values (rounded to 4 decimals) is printed on an "Average:" line, followed
    by a "<key> -> <values>" line and a blank line.

    Parameters
    ----------
    cv : mapping
        Maps a metric/timing name to an object exposing ``.mean()``
        (e.g. the arrays returned by ``cross_validate``).
    """
    for name in sorted(cv):
        scores = cv[name]
        mean_score = round(scores.mean(), 4)
        print("Average: {}".format(mean_score))
        print("{} -> {}".format(name, scores), end="\n\n")
        

# Molecules_V1: original dataset, including zero-weight features

In [7]:
# 80/20 hold-out split with a fixed seed.
# NOTE(review): x_train/x_test/y_train/y_test are not used by the
# cross-validation below, which runs on the full x1/y1.
x_train, x_test, y_train, y_test = train_test_split(
    x1, y1, test_size=0.2, random_state=0
)

# 10-fold cross-validation of the shared classifier on dataset V1.
cv_results_v1 = cross_validate(
    clf,
    x1,
    y1,
    cv=10,
    scoring=("accuracy", "average_precision", "f1"),
)

In [8]:
print_results(cv_results_v1)

Average: 0.0129
fit_time -> [0.01595545 0.0119679  0.01396275 0.01199436 0.01196694 0.01296687
 0.01196694 0.01297045 0.0119679  0.0129652 ]

Average: 0.0035
score_time -> [0.00299215 0.00398946 0.0039897  0.0039897  0.00299215 0.00299072
 0.00398993 0.00298619 0.00398993 0.00299239]

Average: 0.8266
test_accuracy -> [0.81052632 0.84210526 0.83157895 0.82105263 0.81914894 0.82978723
 0.74193548 0.86021505 0.87096774 0.83870968]

Average: 0.7967
test_average_precision -> [0.79964809 0.8057368  0.75012269 0.68717177 0.79405783 0.76562297
 0.75245923 0.88246008 0.87505704 0.85507058]

Average: 0.7328
test_f1 -> [0.68965517 0.74576271 0.75       0.75362319 0.71186441 0.75
 0.6        0.77192982 0.79310345 0.76190476]



# Molecules_V2: altered dataset with only non-zero-weight features

In [9]:
# 80/20 hold-out split with a fixed seed.
# NOTE(review): x_train/x_test/y_train/y_test are not used by the
# cross-validation below, which runs on the full x2/y2.
x_train, x_test, y_train, y_test = train_test_split(
    x2, y2, test_size=0.2, random_state=0
)

# 10-fold cross-validation of the shared classifier on dataset V2.
cv_results_v2 = cross_validate(
    clf,
    x2,
    y2,
    cv=10,
    scoring=("accuracy", "average_precision", "f1"),
)

In [10]:
print_results(cv_results_v2)

Average: 0.0099
fit_time -> [0.01299977 0.00997877 0.00897884 0.00895143 0.01097035 0.00897551
 0.00997305 0.0079782  0.00897574 0.01097035]

Average: 0.0035
score_time -> [0.00299311 0.00398302 0.00297904 0.00398946 0.00299215 0.00398946
 0.00299239 0.0039897  0.00299239 0.00398946]

Average: 0.8011
test_accuracy -> [0.77894737 0.81052632 0.83157895 0.81052632 0.76595745 0.76595745
 0.79569892 0.86021505 0.8172043  0.77419355]

Average: 0.7477
test_average_precision -> [0.69450193 0.8474754  0.64751731 0.66430227 0.77858618 0.68430192
 0.71700695 0.84592336 0.82715123 0.76987987]

Average: 0.6891
test_f1 -> [0.63157895 0.66666667 0.75       0.74285714 0.64516129 0.65625
 0.66666667 0.76363636 0.70175439 0.66666667]

