# EXPLORATORY DATA ANALYSIS

## Packages and Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K

2024-11-09 21:45:10.262209: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-09 21:45:10.536456: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-09 21:45:10.636244: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-09 21:45:10.661112: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-09 21:45:10.837200: I tensorflow/core/platform/cpu_feature_guar

## Data Analysis

In [2]:
# Train and test data from Tox21 dataset
x_train = pd.read_csv('data/tox21_dense_train.csv')
y_train = pd.read_csv('data/tox21_labels_train.csv')

x_test = pd.read_csv('data/tox21_dense_test.csv')
y_test = pd.read_csv('data/tox21_labels_test.csv')

In [3]:
x_train.head()

Unnamed: 0.1,Unnamed: 0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,...,W3D,W3DH,WNSA1,WNSA2,WNSA3,WPSA1,WPSA2,WPSA3,grav,rygr
0,NCGC00178831-03,54367200.0,13.053,2.176,3.194,23.112,15.868,1.496,15.127,12.592,...,2687.469,9241.018,115.371,-915.496,-39.983,290.078,2301.941,59.492,88.147,3.708
1,NCGC00166114-03,12688180.0,22.123,2.065,3.137,21.033,13.718,1.937,13.187,11.951,...,2184.384,3234.199,194.74,-1029.609,-34.205,235.36,1244.323,82.906,134.852,4.131
2,NCGC00263563-01,3076932.0,13.085,2.154,3.207,46.896,29.958,3.806,30.105,25.569,...,13803.524,76582.899,238.004,-4358.946,-106.537,868.685,15909.444,135.335,216.852,5.075
3,NCGC00013058-02,71685690.0,12.832,2.029,3.38,51.086,32.045,1.806,29.09,21.603,...,13807.345,50498.175,226.312,-2785.555,-61.923,763.288,9394.859,125.509,238.265,4.64
4,NCGC00167516-01,7989702.0,12.936,2.124,3.573,70.295,46.402,3.604,42.132,32.57,...,43231.286,163659.229,850.869,-21136.699,-367.122,1798.703,44681.209,362.168,317.901,7.845


In [4]:
# Check out the data
x_train.head()

print(f"(n_samples, d_features): {x_train.shape}")
print(f"{x_train.columns}")

(n_samples, d_features): (12060, 802)
Index(['Unnamed: 0', 'AW', 'AWeight', 'Arto', 'BertzCT', 'Chi0', 'Chi1',
       'Chi10', 'Chi2', 'Chi3',
       ...
       'W3D', 'W3DH', 'WNSA1', 'WNSA2', 'WNSA3', 'WPSA1', 'WPSA2', 'WPSA3',
       'grav', 'rygr'],
      dtype='object', length=802)


In [5]:
y_train.head()

print(f"(n_samples, m_tasks): {y_train.shape}")
print(f"{y_train.columns}")

(n_samples, m_tasks): (12060, 13)
Index(['Unnamed: 0', 'NR.AhR', 'NR.AR', 'NR.AR.LBD', 'NR.Aromatase', 'NR.ER',
       'NR.ER.LBD', 'NR.PPAR.gamma', 'SR.ARE', 'SR.ATAD5', 'SR.HSE', 'SR.MMP',
       'SR.p53'],
      dtype='object')


In [6]:
# Check missing values
print(f"Missing values in x_train: {x_train.isnull().sum().sum()}")
print(f"Missing values in y_train: {y_train.isnull().sum().sum()}")

# Check for missing values in test data
print(f"Missing values in x_test: {x_test.isnull().sum().sum()}")
print(f"Missing values in y_test: {y_test.isnull().sum().sum()}")

Missing values in x_train: 0
Missing values in y_train: 43238
Missing values in x_test: 0
Missing values in y_test: 791


In [7]:
x_train.describe()

Unnamed: 0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,Chi3c,...,W3D,W3DH,WNSA1,WNSA2,WNSA3,WPSA1,WPSA2,WPSA3,grav,rygr
count,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0,...,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0,12060.0
mean,4219191.0,14.510179,2.011927,2.546282,14.001534,9.01234,0.522415,8.201283,6.30759,1.600608,...,1248.363297,5293.483884,85.80564,-555.791432,-30.399123,186.128018,1238.731087,34.164007,123.783499,3.219006
std,11756690.0,4.787198,0.195923,0.435071,8.091058,5.32414,0.826746,5.153103,4.387759,1.33167,...,3180.775783,12823.738015,72.904885,1918.257248,36.25295,165.55181,3232.420359,33.984633,3842.918796,0.994193
min,1.0,11.366,0.0,0.301,0.0,0.0,0.0,0.0,0.0,0.0,...,1.206,1.688,0.0,-120941.468,-1235.074,0.0,0.0,0.0,1.594,0.162
25%,3.077,12.723,1.941,2.323,8.552,5.343,0.0,4.767,3.288,0.731,...,203.31675,915.80525,43.3005,-544.34675,-36.667,89.507,236.48425,16.7365,21.649,2.53
50%,4.199,13.342,2.0,2.631,12.466,8.041,0.217,7.223,5.439,1.28,...,550.826,2236.3115,69.9675,-261.439,-22.159,146.22,561.24,26.9115,36.3195,3.142
75%,6.609,14.583,2.138,2.853,17.22525,11.30725,0.706,10.4265,8.4335,2.07,...,1290.36925,5515.172,107.56775,-112.98925,-13.2485,234.6165,1259.67875,41.569,57.43725,3.746
max,100000000.0,151.466,2.667,3.776,94.695,63.006,12.188,59.172,50.798,19.852,...,93581.972,355470.488,1366.65,0.0,0.0,3353.749,101675.928,1391.785,397154.56,15.511


## Feature Scaling

In [8]:
# Standard Deviation
std_scaler = StandardScaler()

# Fit the scaler on the training data
std_scaler.fit_transform(x_train.iloc[:, 1:])

print(x_train.describe())

                 AW       AWeight          Arto       BertzCT          Chi0  \
count  1.206000e+04  12060.000000  12060.000000  12060.000000  12060.000000   
mean   4.219191e+06     14.510179      2.011927      2.546282     14.001534   
std    1.175669e+07      4.787198      0.195923      0.435071      8.091058   
min    1.000000e+00     11.366000      0.000000      0.301000      0.000000   
25%    3.077000e+00     12.723000      1.941000      2.323000      8.552000   
50%    4.199000e+00     13.342000      2.000000      2.631000     12.466000   
75%    6.609000e+00     14.583000      2.138000      2.853000     17.225250   
max    1.000000e+08    151.466000      2.667000      3.776000     94.695000   

              Chi1         Chi10          Chi2          Chi3         Chi3c  \
count  12060.00000  12060.000000  12060.000000  12060.000000  12060.000000   
mean       9.01234      0.522415      8.201283      6.307590      1.600608   
std        5.32414      0.826746      5.153103      4.

In [9]:
x_train.head()

Unnamed: 0.1,Unnamed: 0,AW,AWeight,Arto,BertzCT,Chi0,Chi1,Chi10,Chi2,Chi3,...,W3D,W3DH,WNSA1,WNSA2,WNSA3,WPSA1,WPSA2,WPSA3,grav,rygr
0,NCGC00178831-03,54367200.0,13.053,2.176,3.194,23.112,15.868,1.496,15.127,12.592,...,2687.469,9241.018,115.371,-915.496,-39.983,290.078,2301.941,59.492,88.147,3.708
1,NCGC00166114-03,12688180.0,22.123,2.065,3.137,21.033,13.718,1.937,13.187,11.951,...,2184.384,3234.199,194.74,-1029.609,-34.205,235.36,1244.323,82.906,134.852,4.131
2,NCGC00263563-01,3076932.0,13.085,2.154,3.207,46.896,29.958,3.806,30.105,25.569,...,13803.524,76582.899,238.004,-4358.946,-106.537,868.685,15909.444,135.335,216.852,5.075
3,NCGC00013058-02,71685690.0,12.832,2.029,3.38,51.086,32.045,1.806,29.09,21.603,...,13807.345,50498.175,226.312,-2785.555,-61.923,763.288,9394.859,125.509,238.265,4.64
4,NCGC00167516-01,7989702.0,12.936,2.124,3.573,70.295,46.402,3.604,42.132,32.57,...,43231.286,163659.229,850.869,-21136.699,-367.122,1798.703,44681.209,362.168,317.901,7.845
