This notebook takes in the raw data from each paper, and puts everything together in a standard format, so we can run further analysis.

In [202]:
import numpy as np
import pandas as pd

In [203]:
# FINAL LIST
"""
NP Type (includes QD)

QD Concentration (nMol)
QD Core
QD Shell
QD Surface Ligand

Zeta Potential
Hydrodynamic size
Diameter/Size
Surface area
Surface charge
Shape

Surface coating

Surface reactivity
Aggregation

Molecular Weight
Ionic radius
Electronegativity

Emission Wavelength

Conduction band energy
Number of oxygen atoms

Cell origin (species)
Cell origin (organ)
Cell origin (anatomical)
Cell origin (primary or cell-line)
Cell name

Concentration/dose

Assay

Exposure Time
Dose
"""
None

## Pre-processing
We start by extracting and normalizing all the features of relevance for each of our datasets.

In [204]:
# Bilal, 2019
"""
QD-source
Core
Shell
QD Diameter
Emission Wavelength
Surface Ligand
Ligand Chemical
Surface Charge
Surface Modification
Surface Modification Chemical
Cell Anatomical Type
Cell Identification
Cell Source Species
Cell Origin
Cell Tissue/Organ Origin
Assay Type
Delivery Type
Exposure Time
"""
bilal_raw = pd.read_csv('raw_datasets/bilal_2019.csv')
bilal_processed = bilal_raw.rename(columns={
    'QD-diameter-nm': 'Diameter',
    'QD-conc-nanoMolar': 'QD Concentration',
    'Core': 'QD Core',
    'Shell': 'QD Shell',
    'Emission-wavelength-nm': 'QD Emission wavelength',
    'Surface-ligand': 'QD Surface ligand',
    'Surface-charge': 'Surface charge (categorical)',
    'Cell-anatomical-type': 'Cell origin (anatomical)',
    'Cell-identification': 'Cell name',
    'Cell-source-species': 'Cell origin (species)',
    'Cell-origin': 'Cell origin (primary or cell-line)',
    'Cell-tissue-organ-origin': 'Cell origin (organ)',
    'Assay-type': 'Assay',
    'Exposure-time-hrs': 'Exposure time',
    'Cell-viability-percent': 'Target',
})[:3028][[
    'Diameter', 'Surface charge (categorical)', 'Exposure time',
    'Cell origin (species)', 'Cell origin (organ)', 'Cell origin (anatomical)', 'Cell origin (primary or cell-line)', 'Cell name',
    'QD Concentration', 'QD Core', 'QD Shell', 'QD Emission wavelength', 'QD Surface ligand',
    'Assay', 'Target'
]]
bilal_processed['Target'] /= 100.0
bilal_processed['Source'] = 'Bilal, 2019'
bilal_processed['NP Type'] = 'QD'
bilal_processed

Unnamed: 0,Diameter,Surface charge (categorical),Exposure time,Cell origin (species),Cell origin (organ),Cell origin (anatomical),Cell origin (primary or cell-line),Cell name,QD Concentration,QD Core,QD Shell,QD Emission wavelength,QD Surface ligand,Assay,Target,Source,NP Type
0,3.4,Negative,22.0,Human,Bronchial-tracheal,Epithelial,Primary,NHBE,2300.0,CdSe,No-shell,,Alkylthiol,WST,0.250,"Bilal, 2019",QD
1,5.0,Negative,22.0,Human,Bronchial-tracheal,Epithelial,Primary,NHBE,720.0,CdSe,No-shell,,Alkylthiol,WST,0.610,"Bilal, 2019",QD
2,9.5,Negative,22.0,Human,Bronchial-tracheal,Epithelial,Primary,NHBE,100.0,CdSe,No-shell,,Alkylthiol,WST,0.830,"Bilal, 2019",QD
3,3.4,Negative,22.0,Human,Bronchial-tracheal,Epithelial,Primary,NHBE,2300.0,CdSe,No-shell,,Alkylthiol,WST,1.350,"Bilal, 2019",QD
4,5.0,Negative,22.0,Human,Bronchial-tracheal,Epithelial,Primary,NHBE,720.0,CdSe,No-shell,,Alkylthiol,WST,0.830,"Bilal, 2019",QD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3023,4.9,Zwitterion,48.0,Human,Breast,Epithelial,Cell-line,KPL-4,50.0,CdSeTe,CdS,830.0,Aminoacid,MTT,0.426,"Bilal, 2019",QD
3024,5.5,Neutral,24.0,Human,Cervix,Epithelial,Cell-line,HeLa,20.0,CdSe,ZnS,605.0,Lipid,MTT,0.720,"Bilal, 2019",QD
3025,5.5,Neutral,48.0,Human,Cervix,Epithelial,Cell-line,HeLa,20.0,CdSe,ZnS,605.0,Lipid,MTT,0.680,"Bilal, 2019",QD
3026,5.5,Neutral,24.0,Human,Cervix,Epithelial,Cell-line,HeLa,20.0,CdSe,ZnS,605.0,Lipid,MTT,0.730,"Bilal, 2019",QD


In [205]:
# Furxhi, 2020
"""
Dose
Time
Zeta potential in Water
Zeta W Measurement
Zet potential in cellular Media
Zeta M Measurement
Hydrodynamic size in Water
Hydro size W Measurement
Hydrodynamic size in cellular Media
Hydro size M Measurement
Size
Size Measurement
SSA (surface area?)
Surface area Measurement
NP Type
Shape
Shape measurement
Cell origin
Cell name
Cell type
Assay
"""
furxhi_raw = pd.read_csv('raw_datasets/furxhi_2020.csv')
furxhi_processed = furxhi_raw.copy()

def impute_w(w, m):
    from sklearn.linear_model import LinearRegression
    clean_mask = (w != '?') & (m != '?')
    w_clean, m_clean = w[clean_mask].values, m[clean_mask].values
    lr = LinearRegression()
    lr.fit(m_clean[:, None], w_clean)
    new_w = w.values.copy()
    pred_mask =  (w == '?') & (m != '?')
    new_w[pred_mask] = lr.predict(m[pred_mask].values[:, None])
    return new_w

furxhi_processed['Hydrodynamic size'] = impute_w(furxhi_processed['Hydro_W'], furxhi_processed['Hydro_M'])
furxhi_processed['Zeta potential'] = impute_w(furxhi_processed['Zeta_W'], furxhi_processed['Zeta_M'])
furxhi_processed = furxhi_processed.rename(columns={
    'Time': 'Exposure time',
    'Size': 'Diameter',
    'SSA': 'Surface area',
    'shape': 'Shape',
    'C_Origin': 'Cell origin (species)',
    'C_name': 'Cell name',
    'C_Type': 'Cell origin (anatomical)',
    'Cell viability': 'Target',
    'NP type': 'NP Type',
})[[
    'NP Type', 'Dose', 'Exposure time', 'Zeta potential', 'Hydrodynamic size', 'Diameter', 'Surface area',
    'Shape', 'Cell origin (species)', 'Cell origin (anatomical)', 'Cell name', 'Assay', 'Target'
]].replace('?', np.nan)
furxhi_processed['Target'] = furxhi_processed['Target'].map({'Toxic': 0.0, 'Non-Toxic': 1.0})
furxhi_processed['Source'] = 'Furxhi, 2020'
furxhi_processed

Unnamed: 0,NP Type,Dose,Exposure time,Zeta potential,Hydrodynamic size,Diameter,Surface area,Shape,Cell origin (species),Cell origin (anatomical),Cell name,Assay,Target,Source
0,CuO,10.00,4,-47.6,,40.0,,,Rat,Endothelial,BMEC,XTT,0.0,"Furxhi, 2020"
1,CuO,10.00,4,-36.6,,60.0,,,Rat,Endothelial,BMEC,XTT,0.0,"Furxhi, 2020"
2,CuO,1.56,24,-47.6,,40.0,,,Rat,Endothelial,BMEC,XTT,0.0,"Furxhi, 2020"
3,CuO,1.56,24,-36.6,,60.0,,,Rat,Endothelial,BMEC,XTT,0.0,"Furxhi, 2020"
4,CuO,3.13,24,-47.6,,40.0,,,Rat,Endothelial,BMEC,XTT,0.0,"Furxhi, 2020"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,ZnO,4.00,48,,,35.0,,Spherical,Rat,Schwann,RSC96,MTT,1.0,"Furxhi, 2020"
598,ZnO,8.00,48,,,35.0,,Spherical,Rat,Schwann,RSC96,MTT,1.0,"Furxhi, 2020"
599,ZnO,40.00,48,,,35.0,,Spherical,Rat,Schwann,RSC96,MTT,1.0,"Furxhi, 2020"
600,ZnO,80.00,48,,,35.0,,Spherical,Rat,Schwann,RSC96,MTT,0.0,"Furxhi, 2020"


In [206]:
# Marvin, 2017

"""
Shape *
NP Type *
Surface area *
Surface charge *
Surface coatings *
Surface reactivity *
Aggregation *
Particle size *
Administration route
Study type (in vivo/vitro)
"""
marvin_raw = pd.read_csv('raw_datasets/marvin_2017/marvin_2017_1.csv', skiprows=0)
marvin_processed = marvin_raw[
    ~(marvin_raw['Cytotoxicity'].isna() | marvin_raw['Cytotoxicity'].isin(['NaN', 'None']))
].reset_index(drop=True)[[
    'Nanoparticle', 'Shape',
    'Surface area', 'Surface charge', 'Surface coatings', 'Surface reactivity',
    'Aggregation', 'Particle size', 'Cytotoxicity'
]].rename(columns={
    'Nanoparticle': 'NP Type',
    'Surface coatings': 'Surface coating',
    'Particle size': 'Diameter',
    'Cytotoxicity': 'Target',
})[:-1]
marvin_processed['Surface area'] = marvin_processed['Surface area'].map(
    lambda x: x if pd.isna(x) else sum(map(float, x.split(' - '))) * 0.5
)
marvin_processed['Surface charge'] = marvin_processed['Surface charge'].map(
    lambda x: x if pd.isna(x) else sum(map(float, x.split(' to '))) * 0.5
)
marvin_processed['Diameter'] = marvin_processed['Diameter'].map(
    lambda x: x if pd.isna(x) else (float(x[1:]) if x.startswith('>') else (sum(map(float, x.split(' to '))) * 0.5))
)
marvin_processed['Target'] = marvin_processed['Target'].map({'Low': 1.0, 'Medium': 0.5, 'High': 0.0}.get)
marvin_processed['Surface charge (categorical)'] = \
    marvin_processed['Surface charge'].map(lambda x: x if pd.isna(x) else ('Positive' if x > 0.0 else 'Negative'))

marvin_processed = marvin_processed[marvin_processed['NP Type'] != 'C'].reset_index(drop=True)
marvin_processed['Source'] = 'Marvin, 2017'
marvin_processed

Unnamed: 0,NP Type,Shape,Surface area,Surface charge,Surface coating,Surface reactivity,Aggregation,Diameter,Target,Surface charge (categorical),Source
0,Cellulose,Elongated,,,Sulphate,,Low,100.0,1.0,,"Marvin, 2017"
1,Cellulose,Elongated,,,Sulphate,,Low,100.0,1.0,,"Marvin, 2017"
2,Cellulose,Elongated,,,Sulphate,,Low,100.0,1.0,,"Marvin, 2017"
3,Cellulose,Elongated,,,Sulphate,,Low,100.0,1.0,,"Marvin, 2017"
4,Cellulose,Elongated,,,Sulphate,,Low,100.0,1.0,,"Marvin, 2017"
...,...,...,...,...,...,...,...,...,...,...,...
162,Yb2O3,Irregular,,,,,,75.0,1.0,,"Marvin, 2017"
163,ZnO,Irregular,,,,Low,,30.0,0.0,,"Marvin, 2017"
164,ZnO,Irregular,,,,Low,,30.0,0.0,,"Marvin, 2017"
165,ZnO,Irregular,,,,Low,,30.0,0.0,,"Marvin, 2017"


In [207]:
# Shirokii, 2023
"""
NP Type *
Diameter *
Zeta potential *
Cell type * Cell name
Concentration *
Animal * Cell origin (species)
Cell morphology * Cell origin (anatomical)
Cell organ * Cell origin (organ)
Line/primary cell * Cell origin (primary or cell-line)
Test * (assay)
Test indicator
Coat * (Surface coating)
Time * (Exposure time)
Molecular Weight *
Ionic radius *
Electronegativity *
"""

shirokii_raw = pd.read_csv('raw_datasets/shirokii_2023.csv')
shirokii_processed = shirokii_raw.drop(columns=['Test_indicator', 'Unnamed: 0']).rename(columns={
    'Material': 'NP Type',
    'Diameter (nm)': 'Diameter',
    'Zeta potential (mV)': 'Zeta potential',
    'Cell_type': 'Cell name',
    'Concentration (g/L)': 'Dose',
    'Viability (%)': 'Target',
    'Coat': 'Surface coating',
    'Line_Primary_Cell': 'Cell origin (primary or cell-line)',
    'Animal': 'Cell origin (species)',
    'Cell_morphology': 'Cell origin (anatomical)',
    'Cell_organ': 'Cell origin (organ)',
    'Time (h)': 'Exposure time',
    'Test': 'Assay',
    'Molecular weight (g/mol)': 'Molecular weight',
}).drop(columns=['Cell_age', 'Elements'])
shirokii_processed['Dose'] *= 1000
shirokii_processed['Target'] /= 100
shirokii_processed['Surface coating'] = shirokii_processed['Surface coating'].map(lambda x: np.nan if x == '0' else x)
shirokii_processed['Cell origin (primary or cell-line)'] = shirokii_processed['Cell origin (primary or cell-line)']\
    .map({'P': 'Primary', 'L': 'Cell-line'}.get)
shirokii_processed['Source'] = 'Shirokii, 2023'
shirokii_processed

Unnamed: 0,NP Type,Diameter,Zeta potential,Cell name,Dose,Target,Surface coating,Cell origin (primary or cell-line),Cell origin (species),Cell origin (anatomical),Cell origin (organ),Exposure time,Assay,Molecular weight,Electronegativity,Ionic radius,Source
0,Ag,28.41,-33.33,HeLa,0.000356,0.036610,,Cell-line,human,Epithelial,cervix,72.0,MTT,107.8682,1.930000,1.086667,"Shirokii, 2023"
1,Ag,28.41,-33.33,HeLa,0.000028,0.484768,,Cell-line,human,Epithelial,cervix,48.0,NR,107.8682,1.930000,1.086667,"Shirokii, 2023"
2,Ag,28.41,-33.33,HeLa,0.000114,0.659603,,Cell-line,human,Epithelial,cervix,24.0,NR,107.8682,1.930000,1.086667,"Shirokii, 2023"
3,Ag,28.41,-33.33,HaCat,0.000853,0.237134,,Cell-line,human,Keratinocyte,skin,48.0,NR,107.8682,1.930000,1.086667,"Shirokii, 2023"
4,Ag,28.41,-33.33,HaCat,0.000853,0.205212,,Cell-line,human,Keratinocyte,skin,72.0,NR,107.8682,1.930000,1.086667,"Shirokii, 2023"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3082,ZrO2,10.00,-45.50,HCT-116,150.000000,0.526200,,Cell-line,human,Epithelial,colon,24.0,Live/Dead,123.2228,2.736667,0.860000,"Shirokii, 2023"
3083,ZrO2,10.00,-45.50,HCT-116,100.000000,0.585200,,Cell-line,human,Epithelial,colon,24.0,Live/Dead,123.2228,2.736667,0.860000,"Shirokii, 2023"
3084,ZrO2,10.00,-45.50,VERO,100.000000,0.767400,,Cell-line,monkey,Epithelial,kidney,24.0,MTT,123.2228,2.736667,0.860000,"Shirokii, 2023"
3085,ZrO2,10.00,-45.50,HCT-116,50.000000,0.727400,,Cell-line,human,Epithelial,colon,24.0,Live/Dead,123.2228,2.736667,0.860000,"Shirokii, 2023"


In [208]:
# Subramanian, 2021
"""
Np Type *
Core size
Hydrodynamic size
Surface charge	SurfCharge
Surface area
Conduction band energy
Valence band energy
Standard enthalpy of formation
Mulliken electronegativity
Enthalpy of formation of cation
Polarization ratio

Pauling electronegativity
Summation of electronegativity
Molecular weight
Number of oxygen atoms
Number of metal atoms
Ratio of esum to Noxygen
Oxidation state

Exposure time
Dosage
"""

subramanian_raw = pd.read_csv('raw_datasets/subramanian_2021.txt', sep='\t')
subramanian_processed = subramanian_raw.rename(columns={
    'NPs': 'NP Type',
    'coresize': 'Diameter',
    'hydrosize': 'Hydrodynamic size',
    'surfcharge': 'Surface charge',
    'surfarea': 'Surface area',
    'e': 'Electronegativity',
    'Cellline': 'Cell name',
    'Expotime': 'Exposure time',
    'dosage': 'Dose',
    'viability': 'Target'
})[[
    'NP Type', 'Diameter', 'Hydrodynamic size', 'Surface charge', 'Surface area',
    'Electronegativity', 'Cell name', 'Exposure time', 'Dose', 'Target',
]]
subramanian_processed['Surface charge (categorical)'] = \
    subramanian_processed['Surface charge'].map(lambda x: 'Positive' if x > 0.0 else 'Negative')
subramanian_processed['Target'] /= 100
subramanian_processed['Source'] = 'Subramanian, 2021'
subramanian_processed

Unnamed: 0,NP Type,Diameter,Hydrodynamic size,Surface charge,Surface area,Electronegativity,Cell name,Exposure time,Dose,Target,Surface charge (categorical),Source
0,Al2O3,39.7,267.0,36.3,64.7,1.61,HCMEC,24,0.001,0.925258,Positive,"Subramanian, 2021"
1,Al2O3,39.7,267.0,36.3,64.7,1.61,HCMEC,24,0.010,0.961340,Positive,"Subramanian, 2021"
2,Al2O3,39.7,267.0,36.3,64.7,1.61,HCMEC,24,0.100,0.935567,Positive,"Subramanian, 2021"
3,Al2O3,39.7,267.0,36.3,64.7,1.61,HCMEC,24,1.000,0.976804,Positive,"Subramanian, 2021"
4,Al2O3,39.7,267.0,36.3,64.7,1.61,HCMEC,24,5.000,0.948454,Positive,"Subramanian, 2021"
...,...,...,...,...,...,...,...,...,...,...,...,...
478,ZnO,35.6,236.0,-41.6,27.9,1.65,Caco2,48,1.000,1.274363,Negative,"Subramanian, 2021"
479,ZnO,35.6,236.0,-41.6,27.9,1.65,Caco2,48,10.000,1.163751,Negative,"Subramanian, 2021"
480,ZnO,35.6,236.0,-41.6,27.9,1.65,Caco2,48,100.000,0.408796,Negative,"Subramanian, 2021"
481,ZnO,35.6,236.0,-41.6,27.9,1.65,Caco2,24,0.100,0.868566,Negative,"Subramanian, 2021"


## Combination
Now, we concatenate our dataframes into a single dataset, and re-order the columns for visibility.

In [209]:
processed_dfs = [
    furxhi_processed,
    marvin_processed,
    shirokii_processed,
    subramanian_processed,
    bilal_processed,
]
final_df = pd.concat(processed_dfs).rename(columns={'Diameter': 'Size'})
final_df.columns.values

array(['NP Type', 'Dose', 'Exposure time', 'Zeta potential',
       'Hydrodynamic size', 'Size', 'Surface area', 'Shape',
       'Cell origin (species)', 'Cell origin (anatomical)', 'Cell name',
       'Assay', 'Target', 'Source', 'Surface charge', 'Surface coating',
       'Surface reactivity', 'Aggregation',
       'Surface charge (categorical)',
       'Cell origin (primary or cell-line)', 'Cell origin (organ)',
       'Molecular weight', 'Electronegativity', 'Ionic radius',
       'QD Concentration', 'QD Core', 'QD Shell',
       'QD Emission wavelength', 'QD Surface ligand'], dtype=object)

In [210]:
final_df = final_df[[
    'Source',
    'NP Type',
    'Size',
    'Shape',
    'Surface area',
    'Zeta potential',
    'Hydrodynamic size',
    'Molecular weight',

    'Surface charge',
    'Surface charge (categorical)',
    'Surface coating',
    'Surface reactivity',

    'Aggregation',
    'Electronegativity',
    'Ionic radius',

    'QD Concentration',
    'QD Core',
    'QD Shell',
    'QD Emission wavelength',
    'QD Surface ligand',

    'Cell origin (species)',
    'Cell origin (organ)',
    'Cell origin (anatomical)',
    'Cell origin (primary or cell-line)',
    'Cell name',

    'Dose',
    'Exposure time',
    'Assay',

    'Target',
]]
final_df = final_df.copy()
final_df

Unnamed: 0,Source,NP Type,Size,Shape,Surface area,Zeta potential,Hydrodynamic size,Molecular weight,Surface charge,Surface charge (categorical),...,QD Surface ligand,Cell origin (species),Cell origin (organ),Cell origin (anatomical),Cell origin (primary or cell-line),Cell name,Dose,Exposure time,Assay,Target
0,"Furxhi, 2020",CuO,40.0,,,-47.6,,,,,...,,Rat,,Endothelial,,BMEC,10.00,4.0,XTT,0.000
1,"Furxhi, 2020",CuO,60.0,,,-36.6,,,,,...,,Rat,,Endothelial,,BMEC,10.00,4.0,XTT,0.000
2,"Furxhi, 2020",CuO,40.0,,,-47.6,,,,,...,,Rat,,Endothelial,,BMEC,1.56,24.0,XTT,0.000
3,"Furxhi, 2020",CuO,60.0,,,-36.6,,,,,...,,Rat,,Endothelial,,BMEC,1.56,24.0,XTT,0.000
4,"Furxhi, 2020",CuO,40.0,,,-47.6,,,,,...,,Rat,,Endothelial,,BMEC,3.13,24.0,XTT,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3023,"Bilal, 2019",QD,4.9,,,,,,,Zwitterion,...,Aminoacid,Human,Breast,Epithelial,Cell-line,KPL-4,,48.0,MTT,0.426
3024,"Bilal, 2019",QD,5.5,,,,,,,Neutral,...,Lipid,Human,Cervix,Epithelial,Cell-line,HeLa,,24.0,MTT,0.720
3025,"Bilal, 2019",QD,5.5,,,,,,,Neutral,...,Lipid,Human,Cervix,Epithelial,Cell-line,HeLa,,48.0,MTT,0.680
3026,"Bilal, 2019",QD,5.5,,,,,,,Neutral,...,Lipid,Human,Cervix,Epithelial,Cell-line,HeLa,,24.0,MTT,0.730


## Post-processing
We clean up formatting differences and errors in the data for each attribute.

In [211]:
final_df['Shape'] = final_df['Shape'].str.capitalize()
final_df['Shape'] = final_df['Shape'].map(lambda x: {
    'Sphere': 'Spherical',
    'Amorph': 'Irregular',
}.get(x, x))

final_df['Surface area'] = final_df['Surface area'].astype(float)
final_df['Zeta potential'] = final_df['Zeta potential'].astype(float)
final_df['Hydrodynamic size'] = final_df['Hydrodynamic size'].astype(float)

final_df['Surface coating'] = final_df['Surface coating'].map(lambda x: {
    'None': np.nan,
    'PEG to the PEI': 'PEG',
    'folic acid with intermediate inorganic (silica) coating': 'Folic acid',
    'folic acid with intermediate organic (PEG) coating': 'Folic acid'
}.get(x, x))


final_df['Cell origin (species)'] = final_df['Cell origin (species)'].str.capitalize().str.strip()
final_df['Cell origin (species)'] = final_df['Cell origin (species)'].map(lambda x: {
    '0': np.nan,
    'Fusarium_oxysporum': 'Fusarium oxysporum',
    'Mouse-rat': 'Mouse',
}.get(x, x))

final_df['Cell origin (organ)'] = final_df['Cell origin (organ)'].str.capitalize().str.replace('-', ' ').str.strip()
final_df['Cell origin (organ)'] = final_df['Cell origin (organ)'].map(lambda x: {
    'Adipose tissue': 'Connective tissue',
    'Areolar tissue': 'Connective tissue',
    'Alveolar': 'Lung',
    'Aorta': 'Heart',
    'Amniotic membrane': 'Amnion',
    'Cornea': 'Eye',
    'Hypothalmus': 'Brain',
    'Respiratorytract': 'Respiratory tract',
    'Trachea': 'Respiratory tract',
    'Bronchial tracheal': 'Respiratory tract',
    'Lymphocyte': 'Immune system',
    'Nasopharyngeal': 'Pharynx',
    'Nasopharynx': 'Pharynx',
    'Lymph node': 'Lymph',
}.get(x, x))

final_df['Cell origin (anatomical)'] = final_df['Cell origin (anatomical)']\
    .str.capitalize().str.replace('-', ' ').str.replace('_', ' ')\
    .str.strip()
final_df['Cell origin (anatomical)'] = final_df['Cell origin (anatomical)'].map(lambda x: {
    'Monocyte/macrophage': 'Monocyte',
    'Macrophage': 'Monocyte',
    'Keratinocytes': 'Keratinocyte',
    'Astrocytes': 'Astrocyte',
    'Chondrocytes': 'Chondrocyte',
    'Myocardiocytes': 'Myocardiocyte'
}.get(x, x))

final_df['Cell name'] = final_df['Cell name'].map(lambda x: {
    'differntiated-NG108-15': 'NG108-15',
    'undifferntiated-NG108-15': 'NG108-15',
    'T cells (all types)': 'T-cell',
    'Naive T-cell': 'T-cell',
    'Memory T-cell': 'T-cell',
    'HaCat': 'HaCaT',
    'PMA activated THP-1': 'THP-1',
    'bovine-skin-fibroblasts': 'Fibroblast',
    'bovine-corneal-fibroblasts': 'Fibroblast',
    'Fibroblast': 'Fibroblast',
    'Human-Dermal-Fibroblast': 'Fibroblast',
    'Fibroblasts': 'Fibroblast',
    'L-02': 'L02',
    'Caco-2': 'Caco2',
    'Hepatocytes': 'Hepatocyte',
    'SH-SY5Y': 'SHSY5Y',
}.get(x, x))

final_df['Assay'] = final_df['Assay'].str.strip()
final_df['Assay'] = final_df['Assay'].map(lambda x: {
    'Others': np.nan,
    '0': np.nan,
    'ATPLite': 'ATP',
    'Caspase_3/7': 'Caspase',
    'Colonigenic-assay': 'Colonigenic',
    'Trypan-Blue': 'Trypan Blue',
    'WST-1': 'WST',
    'WST-8': 'WST',
    'Live/dead-staining': 'Live/Dead',
    'induction-of-autophagy': 'Induction of Autophagy',
}.get(x, x))

## Save to File

In [212]:
final_df.to_csv('compiled_datasets/complete_nanotox_dataset.csv', index=False)

In [222]:
final_df.groupby('Cell name').count().sort_values('Source', ascending=False)

Unnamed: 0_level_0,Source,NP Type,Size,Shape,Surface area,Zeta potential,Hydrodynamic size,Molecular weight,Surface charge,Surface charge (categorical),...,QD Emission wavelength,QD Surface ligand,Cell origin (species),Cell origin (organ),Cell origin (anatomical),Cell origin (primary or cell-line),Dose,Exposure time,Assay,Target
Cell name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A549,678,678,678,0,132,395,132,395,132,283,...,118,151,546,546,546,546,527,678,543,678
SHSY5Y,658,658,658,96,393,449,403,213,167,191,...,22,24,491,237,491,237,634,658,491,658
HeLa,473,473,473,0,0,121,0,121,0,352,...,330,352,473,473,473,473,121,473,471,465
HepG2,360,360,360,0,0,123,0,123,0,237,...,204,237,360,360,360,360,123,360,360,354
HCMEC,354,354,354,0,72,282,72,282,72,72,...,0,0,282,282,282,282,354,354,281,354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CFSC-2G,1,1,1,0,0,0,0,0,0,1,...,1,1,1,1,1,1,0,1,1,1
Rat-Hippocampal-Neurons,1,1,1,0,0,0,0,0,0,1,...,0,1,1,1,1,1,0,1,1,1
S-180,1,1,1,0,0,0,0,0,0,1,...,1,1,1,1,1,1,0,1,1,1
BcaCD885,1,1,1,0,0,0,0,0,0,1,...,1,1,1,1,1,1,0,1,1,1
