## Development of a Machine Learning Model to Predict the Cytotoxicity of Nanoparticles in Cell Cultures

## 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Preliminary adjustments
### 2.1. Info about dataset

In [2]:
# Path to the dataset
data_path = "C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB.csv"
data = pd.read_csv(data_path)

# Function to display detailed information about the dataset
def dataset_detailed_overview(dataframe):
    # Calculate the percentage of missing values for each column
    missing_percentage = dataframe.isnull().mean() * 100
    
    # Print header
    print("Column Name - Missing Values (%) - Data Type")
    
    # Loop through each column and print the desired information
    for col in dataframe.columns:
        print(f"{col} - {missing_percentage[col]:.2f}% - {dataframe[col].dtype}")

# Call the function with your dataset
dataset_detailed_overview(data)

Column Name - Missing Values (%) - Data Type
Unnamed: 0 - 0.00% - int64
material - 0.00% - object
shape - 49.19% - object
coat/functional group - 69.00% - object
synthesismethod - 49.19% - object
surface charge - 47.60% - object
size in medium (nm) - 73.96% - float64
zeta in medium (mV) - 79.97% - float64
no of cells (cells/well) - 53.89% - float64
human/animal - 16.15% - object
cell source - 7.48% - object
cell tissue - 7.48% - object
cell morphology - 16.15% - object
cell age - 16.15% - object
time (hr) - 7.48% - float64
concentration (ug/ml) - 36.07% - float64
test - 7.48% - object
test indicator - 16.15% - object
viability (%) - 0.00% - float64
DOI - 16.15% - object
core size (nm) - 83.85% - float64
surface area - 83.85% - float64
Hydrodynamic diameter (nm) - 2.23% - float64
Zeta potential (mV) - 41.96% - float64
Cell type - 7.48% - object
Molecular weight (g/mol) - 16.15% - float64


### 2.2. Dropping Unnecessary colomns

In [3]:
# Drop the unnecessary columns
data_cleaned = data.drop(['Unnamed: 0', 'DOI'], axis=1)

# Define the path for the new CSV file
new_data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'

# Save the cleaned dataset to a new CSV file
data_cleaned.to_csv(new_data_path, index=False)

print(f"Cleaned dataset saved to {new_data_path}")

Cleaned dataset saved to C:\Users\tikli\Desktop\NanoToxML\Tox_DB_Cleaned.csv


In [4]:
# Load the dataset
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'
data = pd.read_csv(data_path)

# Get all unique variations in the 'cell source' column
unique_cell_sources = data['cell source'].unique()

# Print all unique cell sources
print("Unique Cell Sources:")
for source in unique_cell_sources:
    print(source)

Unique Cell Sources:
Rat
Mouse
Monkey
Dog
Human
Porcine
Rabbit
Catfish
Hamster
Pig
Monkey (Cercopithecus aethiops)
Canine
hamster
nan


In [5]:
# Load the dataset
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'
data = pd.read_csv(data_path)

# Check for missing values in both columns
missing_human_animal = data['human/animal'].isnull()
missing_cell_source = data['cell source'].isnull()

# Calculate the scenarios
both_missing = data[missing_human_animal & missing_cell_source].shape[0]
only_human_animal_missing = data[missing_human_animal & ~missing_cell_source].shape[0]
only_cell_source_missing = data[~missing_human_animal & missing_cell_source].shape[0]

# Print the results
print(f"Rows where both 'human/animal' and 'cell source' are missing: {both_missing}")
print(f"Rows where only 'human/animal' is missing and 'cell source' is available: {only_human_animal_missing}")
print(f"Rows where only 'cell source' is missing and 'human/animal' is available: {only_cell_source_missing}")

Rows where both 'human/animal' and 'cell source' are missing: 494
Rows where only 'human/animal' is missing and 'cell source' is available: 572
Rows where only 'cell source' is missing and 'human/animal' is available: 0


In [6]:
# Load the dataset
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'
data = pd.read_csv(data_path)

# Drop the 'human/animal' column
data_dropped = data.drop('human/animal', axis=1)

# Define the path for the new CSV file
new_data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'

# Save the updated dataset to a new CSV file
data_dropped.to_csv(new_data_path, index=False)

print(f"Updated dataset saved to {new_data_path}")

Updated dataset saved to C:\Users\tikli\Desktop\NanoToxML\Tox_DB_Updated.csv


### 2.3. Dropping colomns with more than 20% missing values and less meaningful ones

In [7]:
# Load the dataset
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'
data = pd.read_csv(data_path)

# Calculate the percentage of missing values for each column
missing_percentages = data.isnull().mean() * 100

# Identify and drop columns with more than 20% missing values
columns_to_drop = missing_percentages[missing_percentages > 20].index
data_cleaned = data.drop(columns=columns_to_drop)

# Overwrite the original dataset with the cleaned data
data_cleaned.to_csv(data_path, index=False)

print("The dataset has been updated and overwritten at", data_path)

The dataset has been updated and overwritten at C:\Users\tikli\Desktop\NanoToxML\Tox_DB_Updated.csv


In [8]:
# Load the dataset
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'
data = pd.read_csv(data_path)

# Drop the 'Cell Morphology' and 'Cell Age' columns
data_cleaned = data.drop(columns=['cell morphology', 'cell age'])

# Overwrite the original dataset with the updated data
data_cleaned.to_csv(data_path, index=False)

print("Columns 'Cell Morphology' and 'Cell Age' have been removed and the dataset has been updated.")

Columns 'Cell Morphology' and 'Cell Age' have been removed and the dataset has been updated.


### 2.4. Info about data before encoding

In [9]:
# Path to the dataset
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'
data = pd.read_csv(data_path)

# Function to display detailed information about the dataset
def dataset_detailed_overview(dataframe):
    # Calculate the percentage of missing values for each column
    missing_percentage = dataframe.isnull().mean() * 100
    
    # Print header
    print("Column Name - Missing Values (%) - Data Type")
    
    # Loop through each column and print the desired information
    for col in dataframe.columns:
        print(f"{col} - {missing_percentage[col]:.2f}% - {dataframe[col].dtype}")

# Call the function with your dataset
dataset_detailed_overview(data)

Column Name - Missing Values (%) - Data Type
material - 0.00% - object
cell source - 7.48% - object
cell tissue - 7.48% - object
time (hr) - 7.48% - float64
test - 7.48% - object
test indicator - 16.15% - object
viability (%) - 0.00% - float64
Hydrodynamic diameter (nm) - 2.23% - float64
Cell type - 7.48% - object
Molecular weight (g/mol) - 16.15% - float64


## 3. Adding additional database

In [12]:
# Paths to your datasets
path_tox = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'
path_props = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\inorg prop - 1.csv'

# Load the datasets
tox_data = pd.read_csv(path_tox)
props_data = pd.read_csv(path_props)

# Select only the relevant columns from the properties dataset
props_data_selected = props_data[['formula', 'AR', 'IR', 'X', 'E', 'pot']]

# Merging the datasets on 'material' from tox_data and 'formula' from props_data
merged_data = pd.merge(tox_data, props_data_selected, left_on='material', right_on='formula', how='left')

# Drop the 'formula' column as it is redundant after the merge
merged_data.drop(columns=['formula'], inplace=True)

# Save the merged dataset to a new CSV file
merged_data.to_csv('C:\\Users\\tikli\\Desktop\\NanoToxML\\Merged_NanoToxML_Data.csv', index=False)

print("The data has been successfully merged and saved.")

The data has been successfully merged and saved.


## 4. Encoding the data

In [13]:
def encode_column(dataframe, column_name):
    """Encodes the specified column of the dataframe using unique integer labels."""
    unique_values = sorted(dataframe[column_name].dropna().unique())
    mapping_dict = {value: idx for idx, value in enumerate(unique_values)}
    return dataframe[column_name].map(mapping_dict), mapping_dict

def main():
    # Load the dataset
    file_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Merged_NanoToxML_Data.csv' 
    data = pd.read_csv(file_path)
    
    # Encoding categorical columns
    data['material'], material_map = encode_column(data, 'material')
    data['Cell type'], cell_type_map = encode_column(data, 'Cell type')
    data['cell source'], cell_source_map = encode_column(data, 'cell source')
    data['cell tissue'], cell_tissue_map = encode_column(data, 'cell tissue')
    data['test'], test_map = encode_column(data, 'test')
    data['test indicator'], test_indicator_map = encode_column(data, 'test indicator')

    # Save the encoded dataset
    output_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Encoded.csv'
    data.to_csv(output_path, index=False)
    
    # Print the mappings for reference
    print("Material Mapping:", material_map)
    print("Cell Type Mapping:", cell_type_map)
    print("Cell Source Mapping:", cell_source_map)
    print("Cell Tissue Mapping:", cell_tissue_map)
    print("Test Mapping:", test_map)
    print("Test Indicator Mapping:", test_indicator_map)

if __name__ == "__main__":
    main()

Material Mapping: {'Ag': 0, 'Al2O3': 1, 'Au': 2, 'Bi': 3, 'Bi2O3': 4, 'C': 5, 'CaHCO3': 6, 'CdO': 7, 'CeO2': 8, 'Co': 9, 'Co3O4': 10, 'CoO': 11, 'Cr': 12, 'Cu': 13, 'Cu2O': 14, 'CuO': 15, 'CuS': 16, 'Fe2O3': 17, 'Fe3O4': 18, 'Gd2O3': 19, 'HfO2': 20, 'In2O3': 21, 'La2O3': 22, 'MgO': 23, 'Mn2O3': 24, 'MnO': 25, 'Mo': 26, 'Ni': 27, 'NiO': 28, 'Pt': 29, 'Sb2O3': 30, 'Se': 31, 'SiO2': 32, 'Ti': 33, 'TiO2': 34, 'ZnO': 35, 'ZrO2': 36}
Cell Type Mapping: {'143B': 0, '16HBE': 1, '3T3-L1': 2, '95D': 3, 'A2780': 4, 'A431': 5, 'A549': 6, 'AGS': 7, 'ASM': 8, 'Ana-1': 9, 'B cells': 10, 'BALB/c3T3': 11, 'BEAS-2B': 12, 'BEAS_2B': 13, 'BEAS\xad2B': 14, 'BEC': 15, 'BJ': 16, 'C17.2': 17, 'C18–4': 18, 'C3A': 19, 'C6': 20, 'CCL-110': 21, 'CD3+ T cells': 22, 'CD4+T cells': 23, 'CDBgeo': 24, 'CHO-K1': 25, 'CHO22': 26, 'CHO\xadK1': 27, 'Caco-2': 28, 'Caco_2': 29, 'Caco\xad2': 30, 'Chang_Liver': 31, 'EBF': 32, 'ECV304': 33, 'EJ28': 34, 'Fibroblasts': 35, 'GH3': 36, 'H1299': 37, 'H4': 38, 'HAEC': 39, 'HCMEC': 4

## 5. Handling missing values with k-NN