## Development of a Machine Learning Model to Predict the Cytotoxicity of Nanoparticles in Cell Cultures

## 1. Libraries

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Preliminary adjustments

In [22]:
# Location of the Tox_DB.csv input file.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
data_path = "C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB.csv"

# Read the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

# Function to display detailed information about the dataset
def dataset_detailed_overview(dataframe):
    """Print one summary line per column: name, share of missing values, dtype.

    The report is written to standard output as a header line followed by
    one line per column in the form
    ``<name> - <percent missing, two decimals>% - <dtype>``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    # The mean of the boolean null-mask is the fraction of missing entries per column
    missing_percentage = dataframe.isnull().mean() * 100

    # Print header
    print("Column Name - Missing Values (%) - Data Type")

    # Walk the columns positionally rather than looking each one up by label:
    # a label lookup returns several columns when names are duplicated, which
    # breaks both the ``.2f`` formatting and the ``.dtype`` access.
    for col, pct, dtype in zip(dataframe.columns, missing_percentage.to_numpy(), dataframe.dtypes):
        print(f"{col} - {pct:.2f}% - {dtype}")

# Print the per-column missing-value and dtype report for the dataset loaded above
dataset_detailed_overview(data)

Column Name - Missing Values (%) - Data Type
Unnamed: 0 - 0.00% - int64
material - 0.00% - object
shape - 49.19% - object
coat/functional group - 69.00% - object
synthesismethod - 49.19% - object
surface charge - 47.60% - object
size in medium (nm) - 73.96% - float64
zeta in medium (mV) - 79.97% - float64
no of cells (cells/well) - 53.89% - float64
human/animal - 16.15% - object
cell source - 7.48% - object
cell tissue - 7.48% - object
cell morphology - 16.15% - object
cell age - 16.15% - object
time (hr) - 7.48% - float64
concentration (ug/ml) - 36.07% - float64
test - 7.48% - object
test indicator - 16.15% - object
viability (%) - 0.00% - float64
DOI - 16.15% - object
core size (nm) - 83.85% - float64
surface area - 83.85% - float64
Hydrodynamic diameter (nm) - 2.23% - float64
Zeta potential (mV) - 41.96% - float64
Cell type - 7.48% - object
Molecular weight (g/mol) - 16.15% - float64


In [23]:
# Remove the 'Unnamed: 0' and 'DOI' columns; the result is a new frame, `data` is left untouched
data_cleaned = data.drop(columns=['Unnamed: 0', 'DOI'])

# Destination file for the trimmed dataset
new_data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'

# index=False: do not write the row index as an extra column
data_cleaned.to_csv(path_or_buf=new_data_path, index=False)

# Confirm where the file was written
print(f"Cleaned dataset saved to {new_data_path}")

Cleaned dataset saved to C:\Users\tikli\Desktop\NanoToxML\Tox_DB_Cleaned.csv


In [24]:
# Re-read the cleaned CSV from disk
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# Distinct entries of the 'cell source' column (missing values show up as nan)
unique_cell_sources = pd.unique(data['cell source'])

# List every distinct value on its own line
print("Unique Cell Sources:")
for source in unique_cell_sources:
    print(source)

Unique Cell Sources:
Rat
Mouse
Monkey
Dog
Human
Porcine
Rabbit
Catfish
Hamster
Pig
Monkey (Cercopithecus aethiops)
Canine
hamster
nan


In [25]:
# Re-read the cleaned CSV from disk
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# Boolean masks: True on rows where the respective column has no value
missing_human_animal = data['human/animal'].isna()
missing_cell_source = data['cell source'].isna()

# Row counts per missingness combination; summing a boolean mask counts its True entries
both_missing = int((missing_human_animal & missing_cell_source).sum())
only_human_animal_missing = int((missing_human_animal & ~missing_cell_source).sum())
only_cell_source_missing = int((~missing_human_animal & missing_cell_source).sum())

# Report the three counts
print(f"Rows where both 'human/animal' and 'cell source' are missing: {both_missing}")
print(f"Rows where only 'human/animal' is missing and 'cell source' is available: {only_human_animal_missing}")
print(f"Rows where only 'cell source' is missing and 'human/animal' is available: {only_cell_source_missing}")

Rows where both 'human/animal' and 'cell source' are missing: 494
Rows where only 'human/animal' is missing and 'cell source' is available: 572
Rows where only 'cell source' is missing and 'human/animal' is available: 0


In [26]:
# Re-read the cleaned CSV from disk
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# New frame without the 'human/animal' column; `data` itself is not modified
data_dropped = data.drop(columns='human/animal')

# Destination file for the updated dataset
new_data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'

# index=False: do not write the row index as an extra column
data_dropped.to_csv(path_or_buf=new_data_path, index=False)

# Confirm where the file was written
print(f"Updated dataset saved to {new_data_path}")

Updated dataset saved to C:\Users\tikli\Desktop\NanoToxML\Tox_DB_Updated.csv


In [27]:
# Location of the Tox_DB_Updated.csv file written by an earlier cell.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'

# Read the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

# Function to display detailed information about the dataset
def dataset_detailed_overview(dataframe):
    """Print one summary line per column: name, share of missing values, dtype.

    The report is written to standard output as a header line followed by
    one line per column in the form
    ``<name> - <percent missing, two decimals>% - <dtype>``.

    NOTE(review): this re-declares a function with the same name that an
    earlier cell of this notebook already defines; the later definition
    shadows the earlier one. Consider keeping only a single definition.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    # The mean of the boolean null-mask is the fraction of missing entries per column
    missing_percentage = dataframe.isnull().mean() * 100

    # Print header
    print("Column Name - Missing Values (%) - Data Type")

    # Walk the columns positionally rather than looking each one up by label:
    # a label lookup returns several columns when names are duplicated, which
    # breaks both the ``.2f`` formatting and the ``.dtype`` access.
    for col, pct, dtype in zip(dataframe.columns, missing_percentage.to_numpy(), dataframe.dtypes):
        print(f"{col} - {pct:.2f}% - {dtype}")

# Print the per-column missing-value and dtype report for the updated dataset loaded above
dataset_detailed_overview(data)

Column Name - Missing Values (%) - Data Type
material - 0.00% - object
shape - 49.19% - object
coat/functional group - 69.00% - object
synthesismethod - 49.19% - object
surface charge - 47.60% - object
size in medium (nm) - 73.96% - float64
zeta in medium (mV) - 79.97% - float64
no of cells (cells/well) - 53.89% - float64
cell source - 7.48% - object
cell tissue - 7.48% - object
cell morphology - 16.15% - object
cell age - 16.15% - object
time (hr) - 7.48% - float64
concentration (ug/ml) - 36.07% - float64
test - 7.48% - object
test indicator - 16.15% - object
viability (%) - 0.00% - float64
core size (nm) - 83.85% - float64
surface area - 83.85% - float64
Hydrodynamic diameter (nm) - 2.23% - float64
Zeta potential (mV) - 41.96% - float64
Cell type - 7.48% - object
Molecular weight (g/mol) - 16.15% - float64
