In this notebook, we preprocess the data so it can be directly fed to an ML model.

In [428]:
import numpy as np
import pandas as pd

In [429]:
# Load the compiled nanotoxicity table; the bare `df` at the end renders it for inspection.
raw_dataset_csv = 'compiled_datasets/complete_nanotox_dataset.csv'
df = pd.read_csv(raw_dataset_csv)
df

Unnamed: 0,Source,NP Type,Size,Shape,Surface area,Zeta potential,Hydrodynamic size,Molecular weight,Surface charge,Surface charge (categorical),...,QD Surface ligand,Cell origin (species),Cell origin (organ),Cell origin (anatomical),Cell origin (primary or cell-line),Cell name,Dose,Exposure time,Assay,Target
0,"Furxhi, 2020",CuO,40.0,,,-47.6,,,,,...,,Rat,,Endothelial,,BMEC,10.00,4.0,XTT,0.000
1,"Furxhi, 2020",CuO,60.0,,,-36.6,,,,,...,,Rat,,Endothelial,,BMEC,10.00,4.0,XTT,0.000
2,"Furxhi, 2020",CuO,40.0,,,-47.6,,,,,...,,Rat,,Endothelial,,BMEC,1.56,24.0,XTT,0.000
3,"Furxhi, 2020",CuO,60.0,,,-36.6,,,,,...,,Rat,,Endothelial,,BMEC,1.56,24.0,XTT,0.000
4,"Furxhi, 2020",CuO,40.0,,,-47.6,,,,,...,,Rat,,Endothelial,,BMEC,3.13,24.0,XTT,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,"Bilal, 2019",QD,4.9,,,,,,,Zwitterion,...,Aminoacid,Human,Breast,Epithelial,Cell-line,KPL-4,,48.0,MTT,0.426
7363,"Bilal, 2019",QD,5.5,,,,,,,Neutral,...,Lipid,Human,Cervix,Epithelial,Cell-line,HeLa,,24.0,MTT,0.720
7364,"Bilal, 2019",QD,5.5,,,,,,,Neutral,...,Lipid,Human,Cervix,Epithelial,Cell-line,HeLa,,48.0,MTT,0.680
7365,"Bilal, 2019",QD,5.5,,,,,,,Neutral,...,Lipid,Human,Cervix,Epithelial,Cell-line,HeLa,,24.0,MTT,0.730


In [430]:
# Split the raw table into attributes, labels and provenance.
# This relies on the column layout: the first column is the source paper and the last one is the target.

# Our attributes are everything in the DF except the source paper (first column) and the target (last column)
X_pre = df.iloc[:, 1:-1]

# Our target is cell viability, which we binarize to "safe" (0) or "toxic" (1): a viability below 0.5 is toxic.
# Only the target column is pulled out; `df.values` would first cast the whole mixed-dtype frame to an object array.
# Note that NaN < 0.5 is False, so a row with a missing target ends up labeled 0.
y = (df.iloc[:, -1].to_numpy() < 0.5).astype(int)

# We also keep track of the source documents (one entry per row, same order as X_pre and y)
source = df.iloc[:, 0].to_numpy()

In [431]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding of the categorical attributes.
# Example: a variable with choices A, B and C is encoded as A -> [1,0,0], B -> [0,1,0], C -> [0,0,1].
# Missing entries are turned into their own 'missing' category before encoding.

# Boolean mask over the attribute columns: True where the column has object dtype (i.e. holds strings).
# The numeric-feature cell reuses this mask (negated) to pick the remaining columns.
categorical_attrs_mask = X_pre.dtypes == 'object'
X_cat = X_pre.loc[:, categorical_attrs_mask].fillna('missing')

# min_frequency=.01 pools categories seen in fewer than 1% of the rows into one "infrequent" column per attribute;
# drop='if_binary' keeps a single column for attributes that only have two categories.
oh = OneHotEncoder(
    min_frequency=.01,
    handle_unknown='infrequent_if_exist',
    drop='if_binary',
    sparse_output=False,
)
X_oh = pd.DataFrame(oh.fit_transform(X_cat))

# sklearn names the pooled column "<attribute>_infrequent_sklearn"; strip the "_sklearn" marker for readability
oh_columns = [name.replace('_sklearn', '') for name in oh.get_feature_names_out()]
X_oh.columns = oh_columns

X_oh

Unnamed: 0,NP Type_Ag,NP Type_Al2O3,NP Type_Au,NP Type_C56H103N9O39,NP Type_Cu2O,NP Type_CuO,NP Type_Fe3O4,NP Type_Pt,NP Type_QD,NP Type_SiO2,...,Assay_Live/Dead,Assay_MTS,Assay_MTT,Assay_NR,Assay_NRU,Assay_Resazurin,Assay_Trypan Blue,Assay_WST,Assay_missing,Assay_infrequent
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [432]:
# Numerical features are the attribute columns that were not flagged as categorical.
# Each of them gets a companion boolean "<name>_missing" column recording where the value was absent,
# after which the gaps in the numerical features themselves are imputed with 0.0.

X_numeric = X_pre.loc[:, ~categorical_attrs_mask]

# The indicator frame gets a fresh 0..n-1 index, like the one-hot frame it is later concatenated with
X_missing = X_numeric.isna().add_suffix('_missing').reset_index(drop=True)

X_numeric = X_numeric.fillna(0)

In [433]:
# Put the feature frames side by side: numerical values, missing-value indicators, one-hot columns
df_full = pd.concat([X_numeric, X_missing, X_oh], axis=1)

# Paper source goes first (for visualization), the binarized target goes last.
# Note: the 'Cell Viability' column holds the 0/1 label computed earlier (1 = toxic), not the raw viability.
df_full.insert(0, 'Source', source)
df_full['Cell Viability'] = y

# Drop duplicates based on everything but the source; the first occurrence is kept.
# The label is part of the key, so rows with identical features but different labels both survive.
columns_without_source = df_full.columns.drop('Source')
df_full = df_full.drop_duplicates(subset=columns_without_source)
df_full

Unnamed: 0,Source,Size,Surface area,Zeta potential,Hydrodynamic size,Molecular weight,Surface charge,Electronegativity,Ionic radius,QD Concentration,...,Assay_MTS,Assay_MTT,Assay_NR,Assay_NRU,Assay_Resazurin,Assay_Trypan Blue,Assay_WST,Assay_missing,Assay_infrequent,Cell Viability
0,"Furxhi, 2020",40.0,0.0,-47.6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,"Furxhi, 2020",60.0,0.0,-36.6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
2,"Furxhi, 2020",40.0,0.0,-47.6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
3,"Furxhi, 2020",60.0,0.0,-36.6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,"Furxhi, 2020",40.0,0.0,-47.6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7360,"Bilal, 2019",5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5600.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7361,"Bilal, 2019",4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7362,"Bilal, 2019",4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7363,"Bilal, 2019",5.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [434]:
# Write the model-ready table to disk; the row index is not written
features_csv = 'compiled_datasets/nanotox_features_dataset.csv'
df_full.to_csv(features_csv, index=False)