## Development of a Machine Learning Model to Predict the Cytotoxicity of Nanoparticles in Cell Cultures

## 1. Libraries

In [42]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymatgen
import seaborn as sns

## 2. Preliminary adjustments
### 2.1. Info about dataset

In [43]:
# Path to the source dataset and the DataFrame loaded from it.
# NOTE(review): this is an absolute, machine-specific Windows path; consider a
# configurable data directory so the notebook can run on other machines.
data_path = "C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB.csv"
data = pd.read_csv(data_path)

# Function to display detailed information about the dataset
def dataset_detailed_overview(dataframe):
    """Print one summary line per column of ``dataframe``.

    The report starts with a fixed header line, followed by
    ``<column> - <missing share in percent, two decimals>% - <dtype>``
    for every column, in column order. Nothing is returned.
    """
    # Share of null cells per column, scaled to percent
    missing_percentage = dataframe.isnull().mean() * 100

    # Build the whole report first, then emit it in a single print call
    report_lines = ["Column Name - Missing Values (%) - Data Type"]
    report_lines.extend(
        f"{col} - {missing_percentage[col]:.2f}% - {dataframe[col].dtype}"
        for col in dataframe.columns
    )
    print("\n".join(report_lines))

# Print the per-column missing-value percentage and dtype summary for the loaded dataset
dataset_detailed_overview(data)

Column Name - Missing Values (%) - Data Type
Unnamed: 0 - 0.00% - int64
material - 0.00% - object
shape - 49.19% - object
coat/functional group - 69.00% - object
synthesismethod - 49.19% - object
surface charge - 47.60% - object
size in medium (nm) - 73.96% - float64
zeta in medium (mV) - 79.97% - float64
no of cells (cells/well) - 53.89% - float64
human/animal - 16.15% - object
cell source - 7.48% - object
cell tissue - 7.48% - object
cell morphology - 16.15% - object
cell age - 16.15% - object
time (hr) - 7.48% - float64
concentration (ug/ml) - 36.07% - float64
test - 7.48% - object
test indicator - 16.15% - object
viability (%) - 0.00% - float64
DOI - 16.15% - object
core size (nm) - 83.85% - float64
surface area - 83.85% - float64
Hydrodynamic diameter (nm) - 2.23% - float64
Zeta potential (mV) - 41.96% - float64
Cell type - 7.48% - object
Molecular weight (g/mol) - 16.15% - float64


### 2.2. Dropping unnecessary columns

In [44]:
# Where the trimmed copy of the dataset is written
new_data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'

# Remove the two columns this notebook treats as unnecessary
data_cleaned = data.drop(columns=['Unnamed: 0', 'DOI'])

# Persist the result without the DataFrame index and report the location
data_cleaned.to_csv(new_data_path, index=False)
print(f"Cleaned dataset saved to {new_data_path}")

Cleaned dataset saved to C:\Users\tikli\Desktop\NanoToxML\Tox_DB_Cleaned.csv


In [45]:
# Re-read the cleaned dataset from disk
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'
data = pd.read_csv(data_path)

# Distinct values of the 'cell source' column
cell_source_column = data['cell source']
unique_cell_sources = cell_source_column.unique()

# Header line, then one value per line
print("Unique Cell Sources:")
for cell_source in unique_cell_sources:
    print(cell_source)

Unique Cell Sources:
Rat
Mouse
Monkey
Dog
Human
Porcine
Rabbit
Catfish
Hamster
Pig
Monkey (Cercopithecus aethiops)
Canine
hamster
nan


In [46]:
# Re-read the cleaned dataset from disk
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'
data = pd.read_csv(data_path)

# Boolean masks marking the rows where each column is empty
missing_human_animal = data['human/animal'].isna()
missing_cell_source = data['cell source'].isna()

# Row counts for the three ways the two masks can disagree or coincide
both_missing = len(data[missing_human_animal & missing_cell_source])
only_human_animal_missing = len(data[missing_human_animal & ~missing_cell_source])
only_cell_source_missing = len(data[missing_cell_source & ~missing_human_animal])

# Report the counts
print(f"Rows where both 'human/animal' and 'cell source' are missing: {both_missing}")
print(f"Rows where only 'human/animal' is missing and 'cell source' is available: {only_human_animal_missing}")
print(f"Rows where only 'cell source' is missing and 'human/animal' is available: {only_cell_source_missing}")

Rows where both 'human/animal' and 'cell source' are missing: 494
Rows where only 'human/animal' is missing and 'cell source' is available: 572
Rows where only 'cell source' is missing and 'human/animal' is available: 0


In [47]:
# Input: the cleaned dataset; output: a new file without 'human/animal'
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Cleaned.csv'
new_data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'

data = pd.read_csv(data_path)

# Remove the 'human/animal' column
data_dropped = data.drop(columns='human/animal')

# Write the result (index omitted) and report where it went
data_dropped.to_csv(new_data_path, index=False)
print(f"Updated dataset saved to {new_data_path}")

Updated dataset saved to C:\Users\tikli\Desktop\NanoToxML\Tox_DB_Updated.csv


### 2.3. Dropping columns with more than 20% missing values and less meaningful ones

In [48]:
# Read the dataset that this cell will overwrite in place
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'
data = pd.read_csv(data_path)

# Percentage of null cells in every column
missing_percentages = data.isna().mean() * 100

# Columns whose missing share is strictly above 20 percent are removed
too_sparse = missing_percentages.gt(20)
columns_to_drop = missing_percentages.loc[too_sparse].index
data_cleaned = data.drop(columns=columns_to_drop)

# Write the reduced table back over the file it was read from
data_cleaned.to_csv(data_path, index=False)

print("The dataset has been updated and overwritten at", data_path)

The dataset has been updated and overwritten at C:\Users\tikli\Desktop\NanoToxML\Tox_DB_Updated.csv


In [49]:
# Load the dataset
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'
data = pd.read_csv(data_path)

# Drop the 'cell morphology' and 'cell age' columns.
# errors='ignore' keeps this cell re-runnable: the file is overwritten in place
# below, so on a second run the columns are already gone and a plain drop()
# would raise KeyError.
data_cleaned = data.drop(columns=['cell morphology', 'cell age'], errors='ignore')

# Overwrite the original dataset with the updated data
data_cleaned.to_csv(data_path, index=False)

print("Columns 'Cell Morphology' and 'Cell Age' have been removed and the dataset has been updated.")

Columns 'Cell Morphology' and 'Cell Age' have been removed and the dataset has been updated.


### 2.4. Info about data before encoding

In [50]:
# Path to the dataset
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'
data = pd.read_csv(data_path)

# Reuse dataset_detailed_overview() from section 2.1 instead of redefining an
# identical copy here; a second definition would silently shadow the first.
dataset_detailed_overview(data)

Column Name - Missing Values (%) - Data Type
material - 0.00% - object
cell source - 7.48% - object
cell tissue - 7.48% - object
time (hr) - 7.48% - float64
test - 7.48% - object
test indicator - 16.15% - object
viability (%) - 0.00% - float64
Hydrodynamic diameter (nm) - 2.23% - float64
Cell type - 7.48% - object
Molecular weight (g/mol) - 16.15% - float64


## 3. Adding additional database

### 3.1. Merging the databases

In [51]:
# Define the file paths
tox_data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Updated.csv'
inorg_prop_data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\inorg prop - 1.csv'
output_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Merged_Tox_Data.csv'

# Load the datasets
tox_data = pd.read_csv(tox_data_path)
inorg_prop_data = pd.read_csv(inorg_prop_data_path)

# Handle duplicates by averaging the values for each formula.
# numeric_only=True restricts the mean to numeric columns, so a text column in
# the properties table cannot make the aggregation raise (pandas >= 2.0 no
# longer drops such columns silently).
inorg_prop_data_aggregated = (
    inorg_prop_data.groupby('formula').mean(numeric_only=True).reset_index()
)

# Create a dictionary {formula: {property: value}} for quick lookup
inorg_prop_dict = inorg_prop_data_aggregated.set_index('formula').to_dict('index')

# Add one column per inorganic property to the tox_data dataframe.
# Materials without an entry (or entries without that property) yield a missing value.
for property_name in ('AR', 'IR', 'X', 'E', 'pot'):
    tox_data[property_name] = tox_data['material'].map(
        # key=property_name binds the current property at definition time
        lambda formula, key=property_name: inorg_prop_dict.get(formula, {}).get(key)
    )

# Save the merged data to a new CSV file
tox_data.to_csv(output_path, index=False)

print(f"Merged data saved to {output_path}")

Merged data saved to C:\Users\tikli\Desktop\NanoToxML\Merged_Tox_Data.csv


In [52]:
# Path to the dataset
data_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Merged_Tox_Data.csv'
data = pd.read_csv(data_path)

# Reuse dataset_detailed_overview() from section 2.1 instead of redefining an
# identical copy here; a second definition would silently shadow the first.
dataset_detailed_overview(data)

Column Name - Missing Values (%) - Data Type
material - 0.00% - object
cell source - 7.48% - object
cell tissue - 7.48% - object
time (hr) - 7.48% - float64
test - 7.48% - object
test indicator - 16.15% - object
viability (%) - 0.00% - float64
Hydrodynamic diameter (nm) - 2.23% - float64
Cell type - 7.48% - object
Molecular weight (g/mol) - 16.15% - float64
AR - 34.22% - float64
IR - 34.22% - float64
X - 34.22% - float64
E - 34.22% - float64
pot - 53.66% - float64


### 3.2. Adjustment to the database

In [53]:
import re
from collections import Counter

# Folder that holds every input file of this step
directory_path = "C:\\Users\\tikli\\Desktop\\NanoToxML"

# Full paths of the four CSV files
tox_db_updated_path = directory_path + "\\Tox_DB_Updated.csv"
inorg_prop_path = directory_path + "\\inorg prop - 1.csv"
redox_redox_path = directory_path + "\\redox - redox.csv"
merged_tox_data_path = directory_path + "\\Merged_Tox_Data.csv"

# Read them in the same order as the paths above.
# NOTE(review): tox_db_updated and inorg_prop are not used by the rest of this
# cell as far as visible here - confirm whether these loads are still needed.
tox_db_updated, inorg_prop, redox_redox, merged_tox_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        tox_db_updated_path,
        inorg_prop_path,
        redox_redox_path,
        merged_tox_data_path,
    )
]

# Function to decompose a chemical formula into its constituent elements
def parse_formula(formula):
    """
    Parse a chemical formula into its constituent elements.

    Parameters
    ----------
    formula : str
        Formula such as "Fe2O3" or "Ca3(PO4)2". An element token is an
        uppercase letter followed by any lowercase letters and an optional
        integer count (default 1). A parenthesised group may be followed by
        an integer that multiplies every count inside the group.
        Characters that are not part of such tokens are ignored.

    Returns
    -------
    collections.Counter
        Mapping element token -> total count. Empty when `formula` is not a
        string (for example NaN or None from a missing cell).
    """
    # Missing cells arrive as float NaN / None; re.findall would raise on them
    if not isinstance(formula, str):
        return Counter()

    # Groups: (element)(count) | (open paren) | (close paren)(group multiplier)
    token_pattern = r'([A-Z][a-z]*)(\d*)|(\()|(\))(\d*)'

    # One Counter per open parenthesis level; index 0 is the whole formula
    stack = [Counter()]
    for element, count, opening, closing, group_count in re.findall(token_pattern, formula):
        if element:
            stack[-1][element] += int(count) if count else 1
        elif opening:
            stack.append(Counter())
        elif closing and len(stack) > 1:
            # Close the innermost group and scale it by its multiplier
            group = stack.pop()
            multiplier = int(group_count) if group_count else 1
            for name, amount in group.items():
                stack[-1][name] += amount * multiplier
        # A ')' without a matching '(' is ignored

    # Groups left unclosed are counted as if they had no multiplier
    while len(stack) > 1:
        group = stack.pop()
        stack[-1].update(group)

    return stack[0]

# Function to calculate the average redox potential based on the constituent elements
def calculate_average_redox_potential(elements, redox_data):
    """
    Calculate the average redox potential of a material based on its constituent elements.

    Parameters
    ----------
    elements : mapping
        Element symbol -> count, as produced by `parse_formula`.
    redox_data : pandas.DataFrame
        Must provide a text column 'from' and a numeric column 'potential (V)'.
        NOTE(review): assumes 'from' holds species written with element
        symbols (e.g. "Cu2+") - confirm against the redox table.

    Returns
    -------
    float or None
        Count-weighted mean of the per-element mean potentials, taken over the
        elements that have at least one non-missing potential; None when no
        element has one.
    """
    total_potential = 0
    total_elements = 0
    for element, count in elements.items():
        # Match the symbol only when no lowercase letter follows it, so "C"
        # does not also select "Ca", "Cu", "Co", ...; re.escape keeps the
        # symbol from being read as a regex.
        symbol_pattern = re.escape(element) + r'(?![a-z])'
        # na=False: rows with a missing 'from' value count as "no match"
        # instead of producing a non-boolean mask that cannot be used for indexing
        matches = redox_data['from'].str.contains(symbol_pattern, regex=True, na=False)
        # Missing potentials are dropped so they cannot turn the total into NaN
        element_potential = redox_data.loc[matches, 'potential (V)'].dropna()
        if not element_potential.empty:
            total_potential += element_potential.mean() * count
            total_elements += count
    if total_elements > 0:
        return total_potential / total_elements
    else:
        return None

# Element counts for every material formula in `merged_tox_data`
merged_tox_data['parsed_formula'] = merged_tox_data['material'].apply(parse_formula)

# Average redox potential derived from those element counts
merged_tox_data['average_redox_potential (V)'] = merged_tox_data['parsed_formula'].apply(
    lambda element_counts: calculate_average_redox_potential(element_counts, redox_data=redox_redox)
)

# Rows whose 'pot' value is absent
missing_pot_indices = merged_tox_data['pot'].isna()

# Only those rows receive the calculated value; existing 'pot' values are kept
merged_tox_data.loc[missing_pot_indices, 'pot'] = merged_tox_data.loc[
    missing_pot_indices, 'average_redox_potential (V)'
]

# The two helper columns are not part of the saved dataset
merged_tox_data = merged_tox_data.drop(columns=['parsed_formula', 'average_redox_potential (V)'])

# Write the result to a new file and report its location
updated_merged_tox_data_path = f"{directory_path}\\Updated_Merged_Tox_Data.csv"
merged_tox_data.to_csv(updated_merged_tox_data_path, index=False)

print(f"Updated dataset saved to {updated_merged_tox_data_path}")

Updated dataset saved to C:\Users\tikli\Desktop\NanoToxML\Updated_Merged_Tox_Data.csv


## 4. Getting additional data from Pymatgen

In [None]:
from pymatgen.ext.matproj import MPRester
from pymatgen.core import Composition

# Read the Materials Project API key from the environment so the secret is
# never stored in the notebook. Either MP_API_KEY or PMG_MAPI_KEY is accepted.
# NOTE(review): a key was previously hardcoded in this cell; treat it as
# leaked and revoke it.
API_KEY = os.environ.get("MP_API_KEY") or os.environ.get("PMG_MAPI_KEY")
if not API_KEY:
    raise RuntimeError(
        "Materials Project API key not found: set the MP_API_KEY "
        "(or PMG_MAPI_KEY) environment variable before running this cell."
    )

# Load the updated dataset
file_path = "C:\\Users\\tikli\\Desktop\\NanoToxML\\Updated_Merged_Tox_Data.csv"
data = pd.read_csv(file_path)

# Initialize MPRester with your API key
mpr = MPRester(API_KEY)

# Function to get properties from Materials Project
def get_material_properties(formula):
    """Look up four Materials Project properties for ``formula``.

    Queries the module-level ``mpr`` client for entries whose
    ``pretty_formula`` equals ``formula`` and returns
    ``(material_id, band_gap, density, formation_energy_per_atom)`` taken from
    the first result. Returns four ``None`` values when the query yields no
    result, or when any exception occurs (the error is printed, not raised).
    """
    requested_fields = ["material_id", "band_gap", "density", "formation_energy_per_atom"]
    no_data = (None, None, None, None)
    try:
        matches = mpr.query({"pretty_formula": formula}, requested_fields)
        if not matches:
            return no_data
        # Only the first hit is used
        first_match = matches[0]
        return tuple(first_match[field] for field in requested_fields)
    except Exception as e:
        # Best effort: report the failure and continue with empty values
        print(f"Error fetching data for {formula}: {e}")
        return no_data

# Add new columns to the DataFrame
data["material_id"] = None
data["band_gap"] = None
data["density"] = None
data["formation_energy_per_atom"] = None

# Fetch properties once per distinct formula instead of once per row: many
# rows share a material, and every lookup is a network request.
# Note that a lookup that returned no data is reused for that formula too.
fetched_properties = {}
for index, formula in data["material"].items():
    if formula not in fetched_properties:
        fetched_properties[formula] = get_material_properties(formula)
    material_id, band_gap, density, formation_energy = fetched_properties[formula]
    data.at[index, "material_id"] = material_id
    data.at[index, "band_gap"] = band_gap
    data.at[index, "density"] = density
    data.at[index, "formation_energy_per_atom"] = formation_energy

# Save the enhanced dataset
enhanced_file_path = "C:\\Users\\tikli\\Desktop\\NanoToxML\\After_Pymatgen_Tox_Data.csv"
data.to_csv(enhanced_file_path, index=False)

print(f"Enhanced dataset saved to {enhanced_file_path}")

## 5. Encoding the data

def encode_column(dataframe, column_name):
    """Map the distinct non-null values of one column to integer codes.

    The distinct values are sorted and numbered from 0 in that order.
    Returns a tuple ``(encoded, mapping_dict)``: the column mapped through the
    value -> code dictionary (values without a code, such as missing entries,
    stay missing) and the dictionary itself. ``dataframe`` is not modified.
    """
    column = dataframe[column_name]
    ordered_values = sorted(column.dropna().unique())
    mapping_dict = dict(zip(ordered_values, range(len(ordered_values))))
    encoded = column.map(mapping_dict)
    return encoded, mapping_dict

def main():
    """Label-encode the categorical columns of the merged dataset.

    Reads the merged CSV, replaces six text columns with integer codes from
    `encode_column`, writes the result to Tox_DB_Encoded.csv and prints each
    value -> code mapping.
    """
    # NOTE(review): this reads Merged_Tox_Data.csv, not the files written later
    # by sections 3.2 and 4 (Updated_Merged_Tox_Data.csv,
    # After_Pymatgen_Tox_Data.csv) - confirm that this is the intended input.
    file_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Merged_Tox_Data.csv'
    data = pd.read_csv(file_path)

    # (column to encode, label used when printing its mapping), in processing order
    categorical_columns = [
        ('material', 'Material'),
        ('Cell type', 'Cell Type'),
        ('cell source', 'Cell Source'),
        ('cell tissue', 'Cell Tissue'),
        ('test', 'Test'),
        ('test indicator', 'Test Indicator'),
    ]

    # Encode each column in place and remember its mapping under the print label
    mappings = {}
    for column, label in categorical_columns:
        data[column], mappings[label] = encode_column(data, column)

    # Save the encoded dataset
    output_path = 'C:\\Users\\tikli\\Desktop\\NanoToxML\\Tox_DB_Encoded.csv'
    data.to_csv(output_path, index=False)

    # Print the mappings for reference, in the order the columns were encoded
    for label, mapping in mappings.items():
        print(f"{label} Mapping:", mapping)

# Run the encoding step when this code is executed directly (script or notebook
# cell, where __name__ is "__main__"), but not when it is imported as a module.
if __name__ == "__main__":
    main()