In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

**Load data (tox21)**

In [2]:
loaded_tox21_features_train_val_test = np.load('../../preprocessing/preprocessed_data/tox21_features_train_val_test.npy', allow_pickle=True).item()

tox21_X_train = loaded_tox21_features_train_val_test['train']
tox21_X_val = loaded_tox21_features_train_val_test['validation']
tox21_X_test = loaded_tox21_features_train_val_test['test']

loaded_tox21_labels_train_val_test = np.load('../../preprocessing/preprocessed_data/tox21_labels_train_val_test.npy', allow_pickle=True).item()

tox21_y_train = loaded_tox21_labels_train_val_test['train']
tox21_y_val = loaded_tox21_labels_train_val_test['validation']
tox21_y_test = loaded_tox21_labels_train_val_test['test']

In [3]:
print('Features Train/Val/Test Shapes:')
[i.shape for i in [tox21_X_train, tox21_X_val, tox21_X_test]]

Features Train/Val/Test Shapes:


[(4698, 2248), (1566, 2248), (1567, 2248)]

In [4]:
print('Labels Train/Val/Test Shapes:')
[i.shape for i in [tox21_y_train, tox21_y_val, tox21_y_test]]

Labels Train/Val/Test Shapes:


[(4698, 12), (1566, 12), (1567, 12)]

**Create 12 datasets, each for separate task, delete the rows, where the label is -1**

In [5]:
def create_one_label_datasets(X_train, X_val, X_test, y_train, y_val, y_test):
    datasets = []
    
    X_datasets = [X_train, X_val, X_test]
    y_datasets = [y_train, y_val, y_test]
    
    for x_dataset, y_dataset in zip(X_datasets, y_datasets):
        partitioned_datasets = []
        for i in range(y_dataset.shape[-1]):
            labels = y_dataset[:, i].reshape(-1, 1)
            features_labels = np.concatenate((x_dataset, labels), axis=1)
            cleared_dataset = features_labels[features_labels[:, -1] != -1]
            partitioned_datasets.append(cleared_dataset)
        datasets.append(partitioned_datasets)
    return datasets

In [6]:
train_datasets, val_datasets, test_datasets = create_one_label_datasets(tox21_X_train, 
                                                                        tox21_X_val, 
                                                                        tox21_X_test, 
                                                                        tox21_y_train, 
                                                                        tox21_y_val, 
                                                                        tox21_y_test)

**Check shapes**

In [7]:
[i.shape for i in train_datasets], len(train_datasets)

([(4380, 2249),
  (4047, 2249),
  (3933, 2249),
  (3500, 2249),
  (3738, 2249),
  (4189, 2249),
  (3886, 2249),
  (3484, 2249),
  (4243, 2249),
  (3875, 2249),
  (3488, 2249),
  (4053, 2249)],
 12)

In [8]:
[i.shape for i in val_datasets], len(val_datasets)

([(1446, 2249),
  (1347, 2249),
  (1309, 2249),
  (1146, 2249),
  (1210, 2249),
  (1380, 2249),
  (1270, 2249),
  (1187, 2249),
  (1416, 2249),
  (1311, 2249),
  (1146, 2249),
  (1350, 2249)],
 12)

In [9]:
[i.shape for i in test_datasets], len(test_datasets)

([(1439, 2249),
  (1364, 2249),
  (1307, 2249),
  (1175, 2249),
  (1245, 2249),
  (1386, 2249),
  (1294, 2249),
  (1161, 2249),
  (1413, 2249),
  (1281, 2249),
  (1176, 2249),
  (1371, 2249)],
 12)

**Train RF models on the train datasets and make predictions on val and test datasets**

In [10]:
def make_preds(train_datasets, val_datasets, test_datasets):
    roc_auc_scores = []
    
    for i in range(len(train_datasets)):
        rf_classifier = RandomForestClassifier(n_estimators=1001, random_state=42, n_jobs=-1)
        rf_classifier.fit(train_datasets[i][:, :-1], train_datasets[i][:, -1])
        
        preds_val = rf_classifier.predict_proba(val_datasets[i][:, :-1])
        preds_test = rf_classifier.predict_proba(test_datasets[i][:, :-1])
        
        roc_auc_score_val = roc_auc_score(val_datasets[i][:, -1], preds_val[:, 1])
        roc_auc_score_test = roc_auc_score(test_datasets[i][:, -1], preds_test[:, 1])
        
        roc_auc_scores.append([roc_auc_score_val, roc_auc_score_test])
    
    return np.array(roc_auc_scores)

In [11]:
%%time
roc_auc_scores = make_preds(train_datasets, val_datasets, test_datasets)

CPU times: total: 19min 3s
Wall time: 2min 26s


**Create summaries for roc_auc_scores and their means and stds**

In [13]:
tox21 = pd.read_csv('../../preprocessing/datasets/tox21.csv')

In [14]:
summary = pd.DataFrame(data=roc_auc_scores.T, columns=tox21.columns[:-2], index=['val_scores', 'test_scores'])

In [15]:
summary

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
val_scores,0.831641,0.845743,0.906419,0.873007,0.746943,0.866117,0.893504,0.849144,0.893689,0.790534,0.910768,0.867046
test_scores,0.808014,0.875882,0.932555,0.845152,0.777029,0.887799,0.829793,0.813034,0.885738,0.77859,0.92358,0.861492


In [16]:
mean_std_val_test = np.array([[np.mean(roc_auc_scores[i]), np.std(roc_auc_scores[i])] for i in range(2)])

In [17]:
summary2 = pd.DataFrame(data=mean_std_val_test, columns=['mean', 'std'], index=['val', 'test'])

In [18]:
summary2

Unnamed: 0,mean,std
val,0.819827,0.011814
test,0.860812,0.01507
