In [None]:
from joblib import dump, load

In [None]:
# Load the serialized molecule collections for the train / test / validation splits.
# NOTE(review): joblib.load unpickles its input, so only load files from a trusted source.
# Presumably each file holds RDKit Mol objects (they are passed to
# Chem.RDKFingerprint further down) — confirm against the notebook that wrote them.
x_train_mol = load('x_train_mol.pkl')
x_test_mol = load('x_test_mol.pkl')
x_valid_mol = load('x_valid_mol.pkl')

In [None]:
# Load the target labels that pair with the three molecule splits.
# NOTE(review): joblib.load unpickles its input, so only load files from a trusted source.
# Presumably binary 0/1 labels (pos_label=0 is used for specificity later) — confirm.
y_train = load('y_train.pkl')
y_test = load('y_test.pkl')
y_valid = load('y_valid.pkl')

In [None]:
from rdkit import Chem
from rdkit.Chem import DataStructs
import numpy as np

In [None]:
# RDKit topological fingerprint for every training molecule, in input order.
fp1 = list(map(Chem.RDKFingerprint, x_train_mol))

In [None]:
# RDKit topological fingerprint for every test molecule, in input order.
fp2 = list(map(Chem.RDKFingerprint, x_test_mol))

In [None]:
# RDKit topological fingerprint for every validation molecule, in input order.
fp3 = list(map(Chem.RDKFingerprint, x_valid_mol))

In [None]:
def rdkit_numpy_convert(fp):
    """Turn an iterable of RDKit fingerprints into a single NumPy array.

    Parameters
    ----------
    fp : iterable
        Fingerprint objects accepted by ``DataStructs.ConvertToNumpyArray``.

    Returns
    -------
    numpy.ndarray
        ``np.asarray`` of the per-fingerprint arrays, in input order.
        An empty iterable yields an empty array.
    """
    def _as_array(bit_vect):
        # ConvertToNumpyArray fills the buffer it is given in place; the
        # one-element placeholder is expected to be resized by RDKit.
        buf = np.zeros([1,])
        DataStructs.ConvertToNumpyArray(bit_vect, buf)
        return buf

    return np.asarray([_as_array(bit_vect) for bit_vect in fp])

In [None]:
# Convert the training fingerprints (fp1) into a NumPy array of features.
x_train = rdkit_numpy_convert(fp1)

In [None]:
# Convert the test fingerprints (fp2) into a NumPy array of features.
x_test = rdkit_numpy_convert(fp2)

In [None]:
# Convert the validation fingerprints (fp3) into a NumPy array of features.
x_valid = rdkit_numpy_convert(fp3)

In [None]:
# Persist the (unscaled) validation fingerprint array with joblib, compression level 3.
# NOTE(review): only the validation array is written in this section; the train/test
# arrays are not saved here — confirm that is intentional.
dump(x_valid, 'x_valid_rdkit.pkl', compress=3)

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [None]:
# Single random seed passed to the CV splitter and to every SVC in this notebook.
seed = 42

In [None]:
# Stratified 10-fold splitter reused by the grid search and by every
# cross_val_score call in this notebook.
# random_state only takes effect when shuffle=True; without it, recent
# scikit-learn releases raise ValueError and older ones silently ignore the seed.
# NOTE: enabling shuffle changes the fold assignment compared with the
# unshuffled default, so cross-validated scores may shift slightly.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn per-feature mean/variance from the training split only, then apply the
# same fitted transform to all three splits so no test/validation statistics leak in.
scale = StandardScaler()
scale.fit(x_train)
x_train_scaled, x_test_scaled, x_valid_scaled = (
    scale.transform(split) for split in (x_train, x_test, x_valid)
)

In [None]:
from sklearn.svm import SVC

In [None]:
# Hyper-parameter grid for the RBF SVM search: integer C from 10 to 49 and
# gamma from 1e-5 up to (but nominally excluding) 1e-3 in steps of 1e-5.
# NOTE(review): np.arange with a non-integer step can vary in length by one
# element because of floating-point rounding.
param_grid = dict(
    C=np.arange(10, 50, 1),
    gamma=np.arange(0.00001, 0.001, 0.00001),
)

In [None]:
# Exhaustive search over param_grid for a class-balanced RBF SVM, scored with the
# estimator's default scorer on the shared `cv` splitter, using 10 parallel workers.
svm = GridSearchCV(
    estimator=SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=seed),
    param_grid=param_grid,
    cv=cv,
    n_jobs=10,
    verbose=1,
)

In [None]:
# Run the grid search on the scaled training data.
# NOTE(review): every (C, gamma) pair in param_grid is refit once per CV fold,
# so this cell is likely to be slow — consider timing or caching it.
svm.fit(x_train_scaled, y_train)

In [None]:
# Show the (C, gamma) combination the grid search selected.
svm.best_params_

In [None]:
# Show the mean cross-validated score of the selected combination.
svm.best_score_

In [None]:
# Mean cross-validated score for every grid point, aligned with cv_results_['params'].
svm.cv_results_['mean_test_score']

In [None]:
# Parameter dict for every grid point, in the same order as the score array.
svm.cv_results_['params']

In [None]:
# Final classifier with fixed hyper-parameters.
# NOTE(review): C=36 / gamma=0.0001 are presumably the grid-search winners — confirm
# against svm.best_params_.
svm_2c8 = SVC(
    C=36,
    gamma=0.0001,
    kernel='rbf',
    class_weight='balanced',
    probability=True,
    random_state=seed,
)

In [None]:
# Fit the fixed-parameter classifier on the full scaled training split.
svm_2c8.fit(x_train_scaled, y_train)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import matthews_corrcoef, accuracy_score, roc_auc_score, make_scorer, recall_score

In [None]:
# Custom scorers for cross_val_score:
# - matthews: Matthews correlation coefficient.
# - specificity: recall computed with class 0 as the positive label.
matthews = make_scorer(matthews_corrcoef)
specificity = make_scorer(recall_score, pos_label=0)

In [None]:
# Mean cross-validated accuracy on the training split.
cross_val_score(svm_2c8, x_train_scaled, y_train, cv=cv, scoring='accuracy').mean()

In [None]:
# Mean cross-validated ROC AUC on the training split.
cross_val_score(svm_2c8, x_train_scaled, y_train, cv=cv, scoring='roc_auc').mean()

In [None]:
# Mean cross-validated Matthews correlation coefficient on the training split.
cross_val_score(svm_2c8, x_train_scaled, y_train, cv=cv, scoring=matthews).mean()

In [None]:
# Mean cross-validated recall (sensitivity) on the training split.
cross_val_score(svm_2c8, x_train_scaled, y_train, cv=cv, scoring='recall').mean()

In [None]:
# Mean cross-validated specificity (recall of class 0) on the training split.
cross_val_score(svm_2c8, x_train_scaled, y_train, cv=cv, scoring=specificity).mean()

In [None]:
# Predicted class labels for the scaled test split.
y_pred = svm_2c8.predict(x_test_scaled)

In [None]:
# Display the raw test predictions.
# NOTE(review): this prints the whole array; a slice or class counts may read better.
y_pred

In [None]:
# Test-set accuracy.
accuracy_score(y_test, y_pred)

In [None]:
# Test-set ROC AUC, ranked by the SVM decision-function values rather than hard labels.
roc_auc_score(y_test, svm_2c8.decision_function(x_test_scaled))

In [None]:
# Test-set Matthews correlation coefficient.
matthews_corrcoef(y_test, y_pred)

In [None]:
# Test-set recall (sensitivity) using recall_score's default positive label.
recall_score(y_test, y_pred)

In [None]:
# Test-set specificity: recall with class 0 treated as the positive label.
recall_score(y_test, y_pred, pos_label=0)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Sensitivity sweep over gamma (C fixed at 36): for each gamma, record the mean
# cross-validated accuracy on the training split and the accuracy on the test split.
gamma_grid = np.arange(0.00005, 0.00016, 0.00001)  # built once so loop, plot and ticks agree
training_acc = []
test_acc = []

for gamma in gamma_grid:
    # Use a throwaway estimator: rebinding svm_2c8 (or y_pred) here would replace
    # the fixed C=36 / gamma=0.0001 model that other cells evaluate and pickle.
    sweep_model = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=seed, C=36, gamma=gamma)
    sweep_model.fit(x_train_scaled, y_train)
    training_acc.append(cross_val_score(sweep_model, x_train_scaled, y_train, cv=cv, scoring='accuracy').mean())
    test_acc.append(accuracy_score(y_test, sweep_model.predict(x_test_scaled)))
# Best score and its 1-based position within gamma_grid (not the gamma value itself).
print(max(training_acc),(training_acc.index(max(training_acc)))+1) # index starts from 0
print(max(test_acc),(test_acc.index(max(test_acc)))+1)
plt.figure(figsize=(20, 10))
plt.plot(gamma_grid, training_acc, label="training_acc")
plt.plot(gamma_grid, test_acc, label="test_acc")
plt.xlabel("gamma")  # the swept parameter is gamma; this axis was previously mislabelled "C"
plt.ylabel("Accuracy")
plt.xticks(gamma_grid)
plt.legend()

In [None]:
# Sensitivity sweep over C (gamma fixed at 0.0001): for each C, record the mean
# cross-validated accuracy on the training split and the accuracy on the test split.
c_grid = np.arange(1, 50, 1)  # built once so loop, plot and ticks agree
training_acc = []
test_acc = []

for c_value in c_grid:
    # Use a throwaway estimator: rebinding svm_2c8 (or y_pred) here would leave the
    # last swept model (C=49) in place of the fixed C=36 model that the following
    # cells validate and pickle.
    sweep_model = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=seed, C=c_value, gamma=0.0001)
    sweep_model.fit(x_train_scaled, y_train)
    training_acc.append(cross_val_score(sweep_model, x_train_scaled,
                                        y_train, cv=cv, scoring='accuracy').mean())
    test_acc.append(accuracy_score(y_test, sweep_model.predict(x_test_scaled)))
# Best score and its 1-based position; because c_grid starts at 1 this equals the C value.
print(max(training_acc),(training_acc.index(max(training_acc)))+1) # index starts from 0
print(max(test_acc),(test_acc.index(max(test_acc)))+1)
plt.figure(figsize=(20, 10))
plt.plot(c_grid, training_acc, label="training_acc")
plt.plot(c_grid, test_acc, label="test_acc")
plt.xlabel("C")
plt.ylabel("Accuracy")
plt.xticks(c_grid)
plt.legend()

In [None]:
# Predicted class labels for the scaled validation split.
y_pred_valid = svm_2c8.predict(x_valid_scaled)

In [None]:
# Validation-set accuracy.
accuracy_score(y_valid, y_pred_valid)

In [None]:
# Validation-set ROC AUC, ranked by the SVM decision-function values rather than hard labels.
roc_auc_score(y_valid, svm_2c8.decision_function(x_valid_scaled))

In [None]:
# Validation-set Matthews correlation coefficient.
matthews_corrcoef(y_valid, y_pred_valid)

In [None]:
# Validation-set recall (sensitivity) using recall_score's default positive label.
recall_score(y_valid, y_pred_valid)

In [None]:
# Validation-set specificity: recall with class 0 treated as the positive label.
recall_score(y_valid, y_pred_valid, pos_label=0)

In [None]:
# Persist the fitted classifier with joblib, compression level 3.
# NOTE(review): the StandardScaler (`scale`) is not saved here; anything that
# reloads this model needs the same fitted scaler — confirm it is stored elsewhere.
dump(svm_2c8, '2c8_rdkit_svm.pkl', compress=3)