# Predict Acute Oral Systemic Toxicity                   Nima Vahdat 610397163

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier as RForest
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
import warnings
warnings.filterwarnings("ignore")


index_ = []
accuracy = []
fold = []

Here we import our dataset and prepare it to be a useable dataset.

In [2]:
data=pd.read_csv('qsar_oral_toxicity.csv')
x = []
y = []

datalist = data.values.tolist()
for i in datalist:
    for j in i:
        x.append(j.split(';')[:-1])
        y.append(j.split(';')[-1])


Using PCA to reduce the dimensionality to "512" components.


In [3]:
pca = PCA(n_components=512)
principalComponents = pca.fit_transform(x)

### Encoding categorial data

In [4]:
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

### Splitting the dataset
   The dataset divided into a calibration set (75%) and validation set
(25%) and calibration were further divided into a training and a test set, 
containing respectively 75% and 25% of the molecules included in the calibration set

In [5]:
from sklearn.model_selection import train_test_split
x_calibration, x_validation, y_calibration, y_validation = train_test_split(principalComponents, y,\
                                                                            test_size = 0.25, random_state = 0)
x_train, x_test, y_train, y_test = train_test_split(x_calibration, y_calibration, test_size = 0.25, random_state = 0)

### Feature Scaling

In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_calibration = sc.fit_transform(x_calibration)
x_validation = sc.transform(x_validation)

### Defining score function

In [7]:
def get_score(clf, x_train, x_test, y_train, y_test, x_validation, y_validation):
    global index_, accuracy
    clf.fit(x_train, y_train)
    y_pred_test = clf.predict(x_test)
    y_pred_valid = clf.predict(x_validation)
    index_.append(clf.__class__.__name__)
    accuracy_ = accuracy_score(y_test, y_pred_test) 
    accuracy.append(accuracy_)
    
    print("\n========"+clf.__class__.__name__+" classifying results=======")
    print("\n ON TEST")
    print("Confusion Matrix of test : ")
    print(confusion_matrix(y_test, y_pred_test))
    print("Accuracy on test:",accuracy_)
    print("Classification report of test:")
    print(classification_report(y_test, y_pred_test), "\n")
    print("\n ON VALIDATION")
    print("Confusion Matrix of validation : ")
    print(confusion_matrix(y_validation, y_pred_valid))
    print("Accuracy on validation:",accuracy_score(y_validation, y_pred_valid))
    print("Classification report of validation:")
    print(classification_report(y_validation, y_pred_valid))
    return

### Determinig Score using 10-fold crossvalidation

In [8]:
def score_fold(clf, x, y):
    global fold
    l = cross_val_score(clf, x, y, cv = 10)
    score = 0
    for i in l:
        score += i
    final = score/len(l)
    print(final)
    fold.append(final)
    return

# Classifying with KNN
   * Validation in paper

In [9]:
get_score(KNN(n_neighbors = 3), x_train, x_test, y_train, y_test, x_validation, y_validation)



 ON TEST
Confusion Matrix of test : 
[[1502   38]
 [  82   64]]
Accuracy on test: 0.9288256227758007
Classification report of test:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1540
           1       0.63      0.44      0.52       146

   micro avg       0.93      0.93      0.93      1686
   macro avg       0.79      0.71      0.74      1686
weighted avg       0.92      0.93      0.92      1686
 


 ON VALIDATION
Confusion Matrix of validation : 
[[2023   55]
 [  93   77]]
Accuracy on validation: 0.9341637010676157
Classification report of validation:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2078
           1       0.58      0.45      0.51       170

   micro avg       0.93      0.93      0.93      2248
   macro avg       0.77      0.71      0.74      2248
weighted avg       0.93      0.93      0.93      2248



   * 10-fold crossvalidation
       (The result is showing the avg-Score.)

In [10]:
score_fold(KNN(n_neighbors = 3), pd.DataFrame(x), np.ravel(pd.DataFrame(y), order='C'))

0.9295963416141392


# Classifying with Random Forest
   * Validation in paper

In [11]:
get_score(RForest(max_depth = 300), x_train, x_test, y_train, y_test, x_validation, y_validation)



 ON TEST
Confusion Matrix of test : 
[[1530   10]
 [ 128   18]]
Accuracy on test: 0.9181494661921709
Classification report of test:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      1540
           1       0.64      0.12      0.21       146

   micro avg       0.92      0.92      0.92      1686
   macro avg       0.78      0.56      0.58      1686
weighted avg       0.90      0.92      0.89      1686
 


 ON VALIDATION
Confusion Matrix of validation : 
[[ 942 1136]
 [  78   92]]
Accuracy on validation: 0.4599644128113879
Classification report of validation:
              precision    recall  f1-score   support

           0       0.92      0.45      0.61      2078
           1       0.07      0.54      0.13       170

   micro avg       0.46      0.46      0.46      2248
   macro avg       0.50      0.50      0.37      2248
weighted avg       0.86      0.46      0.57      2248



   * 10-fold crossvalidation
        (The result is showing the avg-Score.)

In [12]:
score_fold(RForest(max_depth = 300), pd.DataFrame(x), np.ravel(pd.DataFrame(y), order='C'))

0.9342680756395995


# Classifying with Gradient Boosting
   * Validation in paper

In [13]:
get_score(GradientBoostingClassifier(random_state = 0), x_train, x_test, y_train, y_test, x_validation, y_validation)



 ON TEST
Confusion Matrix of test : 
[[1526   14]
 [ 114   32]]
Accuracy on test: 0.9240806642941874
Classification report of test:
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      1540
           1       0.70      0.22      0.33       146

   micro avg       0.92      0.92      0.92      1686
   macro avg       0.81      0.61      0.65      1686
weighted avg       0.91      0.92      0.91      1686
 


 ON VALIDATION
Confusion Matrix of validation : 
[[ 109 1969]
 [   4  166]]
Accuracy on validation: 0.12233096085409252
Classification report of validation:
              precision    recall  f1-score   support

           0       0.96      0.05      0.10      2078
           1       0.08      0.98      0.14       170

   micro avg       0.12      0.12      0.12      2248
   macro avg       0.52      0.51      0.12      2248
weighted avg       0.90      0.12      0.10      2248



   * 10-fold crossvalidation
        (The result is showing the avg-Score.)

In [14]:
score_fold(GradientBoostingClassifier(random_state = 0), pd.DataFrame(x), np.ravel(pd.DataFrame(y), order='C'))

0.9318209121245827


# Classifying with Ada Boosting
   * Validation in paper

In [15]:
get_score(AdaBoostClassifier(n_estimators=100, random_state=0), x_train, x_test, y_train, y_test, x_validation, y_validation)



 ON TEST
Confusion Matrix of test : 
[[1512   28]
 [ 109   37]]
Accuracy on test: 0.9187425860023725
Classification report of test:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96      1540
           1       0.57      0.25      0.35       146

   micro avg       0.92      0.92      0.92      1686
   macro avg       0.75      0.62      0.65      1686
weighted avg       0.90      0.92      0.90      1686
 


 ON VALIDATION
Confusion Matrix of validation : 
[[1887  191]
 [ 127   43]]
Accuracy on validation: 0.858540925266904
Classification report of validation:
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      2078
           1       0.18      0.25      0.21       170

   micro avg       0.86      0.86      0.86      2248
   macro avg       0.56      0.58      0.57      2248
weighted avg       0.88      0.86      0.87      2248



   * 10-fold crossvalidation
        (The result is showing the avg-Score.)

In [16]:
score_fold(AdaBoostClassifier(n_estimators=100, random_state=0), pd.DataFrame(x), np.ravel(pd.DataFrame(y), order='C'))

0.92125361512792


# Classifying with MLP
   * Validation in paper

In [17]:
get_score(MLPClassifier(random_state=1, max_iter=300), x_train, x_test, y_train, y_test, x_validation, y_validation)



 ON TEST
Confusion Matrix of test : 
[[1502   38]
 [  82   64]]
Accuracy on test: 0.9288256227758007
Classification report of test:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1540
           1       0.63      0.44      0.52       146

   micro avg       0.93      0.93      0.93      1686
   macro avg       0.79      0.71      0.74      1686
weighted avg       0.92      0.93      0.92      1686
 


 ON VALIDATION
Confusion Matrix of validation : 
[[1768  310]
 [  87   83]]
Accuracy on validation: 0.8233985765124555
Classification report of validation:
              precision    recall  f1-score   support

           0       0.95      0.85      0.90      2078
           1       0.21      0.49      0.29       170

   micro avg       0.82      0.82      0.82      2248
   macro avg       0.58      0.67      0.60      2248
weighted avg       0.90      0.82      0.85      2248



   * 10-fold crossvalidation
        (The result is showing the avg-Score.)

In [18]:
score_fold(MLPClassifier(random_state=1, max_iter=300), pd.DataFrame(x), np.ravel(pd.DataFrame(y), order='C'))

0.9356020269435176


# Classifying with NB
   * Validation in paper

In [19]:
get_score(GaussianNB(), x_train, x_test, y_train, y_test, x_validation, y_validation)



 ON TEST
Confusion Matrix of test : 
[[1489   51]
 [  83   63]]
Accuracy on test: 0.9205219454329775
Classification report of test:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1540
           1       0.55      0.43      0.48       146

   micro avg       0.92      0.92      0.92      1686
   macro avg       0.75      0.70      0.72      1686
weighted avg       0.91      0.92      0.92      1686
 


 ON VALIDATION
Confusion Matrix of validation : 
[[ 702 1376]
 [  29  141]]
Accuracy on validation: 0.375
Classification report of validation:
              precision    recall  f1-score   support

           0       0.96      0.34      0.50      2078
           1       0.09      0.83      0.17       170

   micro avg       0.38      0.38      0.38      2248
   macro avg       0.53      0.58      0.33      2248
weighted avg       0.89      0.38      0.47      2248



   * 10-fold crossvalidation
        (The result is showing the avg-Score.)

In [20]:
score_fold(GaussianNB(), pd.DataFrame(x), np.ravel(pd.DataFrame(y), order='C'))

0.7574273884563094


# Clustering the dataset

In [21]:
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2)
kmeans.fit(x)
print(kmeans)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)


In [22]:
import scipy
def find_permutation(n_clusters, real_labels, labels):
    permutation=[]
    for i in range(n_clusters):
        idx = labels == i
        new_label=scipy.stats.mode(real_labels[idx])[0][0]
        permutation.append(new_label)
    return permutation

In [23]:
permutation = find_permutation(2, y, kmeans.labels_)

In [24]:
new_labels = [ permutation[label] for label in kmeans.labels_]
print("Accuracy score is", accuracy_score(y, new_labels))

Accuracy score is 0.9175842509175842


# Classifications summary result

In [26]:
data = {'Accuracy':accuracy,'10-fold crossvalidation':fold}
re = pd.DataFrame(data, index = index_)
print(re)

                            Accuracy  10-fold crossvalidation
KNeighborsClassifier        0.928826                 0.929596
RandomForestClassifier      0.918149                 0.934268
GradientBoostingClassifier  0.924081                 0.931821
AdaBoostClassifier          0.918743                 0.921254
MLPClassifier               0.928826                 0.935602
GaussianNB                  0.920522                 0.757427
