In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')

In [3]:
X = df.drop('pIC50', axis=1)
X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4615,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4616,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4617,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4618,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
Y = df.pIC50
Y

0       6.124938
1       6.999996
2       4.301030
3       6.522877
4       6.096909
          ...   
4615    6.943091
4616    9.677781
4617    6.562248
4618    4.118045
4619    4.259637
Name: pIC50, Length: 4620, dtype: float64

In [5]:
print(X.shape)
print(Y.shape)


(4620, 881)
(4620,)


**Removing Low Variance features**

In [6]:
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X = selection.fit_transform(X)

In [7]:
X.shape

(4620, 140)

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print(X_train.shape)
print(X_test.shape)

(3696, 140)
(924, 140)


In [9]:
Y_train[0:5]

3358    4.525784
1501    4.162412
526     4.301030
2610    4.595166
2950    8.885723
Name: pIC50, dtype: float64

**Creating an ANN**

In [10]:
import keras
from keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(128, kernel_initializer='normal', input_shape=(140,), activation='relu'))

model.add(Dense(256,kernel_initializer='normal', activation='relu'))
model.add(Dense(256,kernel_initializer='normal', activation='relu'))
model.add(Dense(256,kernel_initializer='normal', activation='relu'))

model.add(Dense(1,kernel_initializer='normal', activation='linear'))

model.compile(loss='mean_squared_error', optimizer='adam',metrics=['mean_absolute_error'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               18048     
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dense_3 (Dense)             (None, 256)               65792     
                                                                 
 dense_4 (Dense)             (None, 1)                 257       
                                                                 
Total params: 182,913
Trainable params: 182,913
Non-trainable params: 0
_________________________________________________________________


In [11]:
from keras.callbacks import ModelCheckpoint
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [12]:
model.fit(X_train, Y_train, epochs=500, batch_size=32, validation_data=(X_test,Y_test), callbacks=callbacks_list)

Epoch 1/500
Epoch 1: val_loss improved from inf to 2.20922, saving model to Weights-001--2.20922.hdf5
Epoch 2/500
Epoch 2: val_loss improved from 2.20922 to 2.05593, saving model to Weights-002--2.05593.hdf5
Epoch 3/500
Epoch 3: val_loss did not improve from 2.05593
Epoch 4/500
Epoch 4: val_loss improved from 2.05593 to 1.68636, saving model to Weights-004--1.68636.hdf5
Epoch 5/500
Epoch 5: val_loss did not improve from 1.68636
Epoch 6/500
Epoch 6: val_loss improved from 1.68636 to 1.57134, saving model to Weights-006--1.57134.hdf5
Epoch 7/500
Epoch 7: val_loss did not improve from 1.57134
Epoch 8/500
Epoch 8: val_loss improved from 1.57134 to 1.52916, saving model to Weights-008--1.52916.hdf5
Epoch 9/500
Epoch 9: val_loss improved from 1.52916 to 1.50724, saving model to Weights-009--1.50724.hdf5
Epoch 10/500
Epoch 10: val_loss improved from 1.50724 to 1.48056, saving model to Weights-010--1.48056.hdf5
Epoch 11/500
Epoch 11: val_loss did not improve from 1.48056
Epoch 12/500
Epoch 12:

<keras.callbacks.History at 0x79772ca7e260>

In [13]:
# Load wights file of the best model :
wights_file = '/content/Weights-025--1.39267.hdf5' # choose the best checkpoint
model.load_weights(wights_file) # load it

In [15]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

Y_test_pred = model.predict(X_test)

r2_score = r2_score(Y_test, Y_test_pred)
print("The accuracy of our model is {}%".format(round(r2_score, 2) *100))

The accuracy of our model is 53.0%
