In [55]:
import os

import pandas as pd

from Configurations.config import Config, config
from Utils.verbose_logger import v_logger

from Utils.dataframe_funcs import MedianDedupedDF

# Label this notebook's log lines "PubChem_toxicity" and pass the colour spec to the logger.
v_logger.UpdateFormat("PubChem_toxicity", "fg #FAC11C")

# The "PubChem_download_toxicity" section of the project configuration.
toxicity_config: Config = config["PubChem_download_toxicity"]

# Everything this notebook writes goes to a "filtering" sub-folder of the results folder.
filtering_folder_name: str = "{}/filtering".format(toxicity_config["results_folder_name"])
os.makedirs(filtering_folder_name, exist_ok=True)

# KG

In [56]:
# Read the combined "_kg" table from the results folder using the project-wide separator.
kg_df = pd.read_csv(
    "{}/{}_kg.csv".format(
        toxicity_config["results_folder_name"],
        toxicity_config["combined_file_name"],
    ),
    sep=config["csv_separator"],
    low_memory=False,
)

# Distinct values of the "organism" column (unique() keeps NaN if any is present).
unique_organisms = kg_df["organism"].unique()
v_logger.info(f"Unique organism: {unique_organisms}")

# Distinct values of the "route" column.
unique_routes = kg_df["route"].unique()
v_logger.info(f"Unique route: {unique_routes}")

# Last expression of the cell: rendered by the notebook as the cell output.
kg_df

[13.03.2025 01:56:34] [38;2;250;193;28mPubChem_toxicity:[0m Unique organism: ['mouse' 'rat']                                              [[1mINFO[0m]
[13.03.2025 01:56:34] [38;2;250;193;28mPubChem_toxicity:[0m Unique route: ['subcutaneous' 'intraperitoneal' 'oral' 'intravenous']         [[1mINFO[0m]


Unnamed: 0,cid,sid,sourceid,organism,testtype,route,reference,mw,dose,dose_units,pLD50,effect,time_period
0,19081,134984244,3548105,mouse,LD50,subcutaneous,"British Journal of Experimental Pathology., 28...",208.26,130.0,mg/kg,6.204663,,
1,19083,134984246,3553808,mouse,LD50,subcutaneous,Australian Journal of Experimental Biology and...,145.20,680.0,mg/kg,5.329458,BEHAVIORAL: SOMNOLENCE (GENERAL DEPRESSED ACTI...,
2,19239,134984251,3619178,mouse,LD50,intraperitoneal,Journal of Medicinal and Pharmaceutical Chemis...,102.14,767.0,mg/kg,5.124400,,
3,19240,134984252,3622762,rat,LD50,oral,"Toxicology and Applied Pharmacology., 28(313),...",115.17,180.0,mg/kg,5.806067,,
4,19242,134984253,3623050,mouse,LDLo,subcutaneous,Archiv fuer Experimentelle Pathologie und Phar...,127.18,230.0,mg/kg,5.742691,BEHAVIORAL: CONVULSIONS OR EFFECT ON SEIZURE T...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
108623,3078298,135225175,120602280,mouse,LD50,intraperitoneal,Acta Poloniae Pharmaceutica. For English tran...,214.25,375.0,mg/kg,5.756890,,
108624,16684679,135225176,120626860,mouse,LD50,intraperitoneal,European Journal of Medicinal Chemistry--Chimi...,332.10,1000.0,mg/kg,5.521269,,
108625,16684680,135225177,120626871,mouse,LD50,intraperitoneal,European Journal of Medicinal Chemistry--Chimi...,281.94,800.0,mg/kg,5.547067,,
108626,3078453,135225178,121221054,mouse,LD50,intraperitoneal,"Pharmaceutical Chemistry Journal, 22(353), 1988",444.50,103.0,mg/kg,6.635035,,


In [57]:
# Nested lookup: str(organism) -> str(route) -> independent copy of the matching rows.
# Only (organism, route) pairs that actually occur in kg_df get an entry.
unique_separated: dict[str, dict[str, pd.DataFrame]] = {}

for u_organism, organism_df in kg_df.groupby("organism"):
    routes_of_organism = unique_separated.setdefault(str(u_organism), {})
    routes_of_organism.update(
        {
            str(route_key): route_rows.copy()
            for route_key, route_rows in organism_df.groupby("route")
        }
    )

In [58]:
# Distinct values of the "testtype" column of the KG table.
unique_test_types = kg_df["testtype"].unique()
v_logger.info(unique_test_types)

# A de-duplicated subset must have at least this many rows to be kept and written to disk.
MIN_ROWS_PER_SUBSET = 100

# organism -> route -> test type -> de-duplicated subset that passed the size threshold.
unique_filtered: dict[str, dict[str, dict[str, pd.DataFrame]]] = {}

# Loop-invariant values, resolved once instead of once per saved subset.
# The output folder is now created up front, even if no subset ends up qualifying.
kg_filtering_folder = f"{filtering_folder_name}/kg"
os.makedirs(kg_filtering_folder, exist_ok=True)
results_file_name = toxicity_config["results_file_name"]

for u_organism in unique_organisms:
  v_logger.info()
  v_logger.info(f"organism: {u_organism}")

  unique_filtered.setdefault(u_organism, {})

  for u_route in unique_routes:
    v_logger.info(f"    route: {u_route}")

    unique_filtered[u_organism].setdefault(u_route, {})

    # unique_separated was built with groupby, so it only has the (organism, route)
    # pairs that really occur (and groupby drops NaN keys, while unique() keeps them).
    # Its keys were stored via str(), so look them up the same way and skip missing
    # pairs instead of failing with a KeyError.
    df = unique_separated.get(str(u_organism), {}).get(str(u_route))
    if df is None:
      continue

    for u_test_type in unique_test_types:
      filtered_df = df[df["testtype"] == u_test_type].copy()
      # Project helper, called with the "sid" and "pLD50" column names;
      # presumably collapses rows sharing a "sid" to their median "pLD50" -- TODO confirm.
      filtered_df = MedianDedupedDF(filtered_df, "sid", "pLD50")

      if len(filtered_df) < MIN_ROWS_PER_SUBSET:
        continue

      unique_filtered[u_organism][u_route][u_test_type] = filtered_df

      # NOTE(review): the input CSV is read with sep=config["csv_separator"], but this
      # write uses pandas' default separator and also writes the index -- confirm that
      # downstream readers expect exactly this format before changing it.
      filtered_df.to_csv(f"{kg_filtering_folder}/{results_file_name}_{u_organism}_{u_route}_{u_test_type}.csv")

      v_logger.info(f"      test_type: {u_test_type}, len: {len(filtered_df)}")


[13.03.2025 01:56:34] [38;2;250;193;28mPubChem_toxicity:[0m ['LD50' 'LDLo' 'LD20' 'LD10' 'LD70' 'LD80' 'LD40' 'LD60' 'LD95' 'LD16'
 'LD05' 'LD25' 'LD75' 'LD17' 'TDLo' 'LD90' 'LD30' 'LD55' 'LD12' 'LD54'] [[1mINFO[0m]
[13.03.2025 01:56:34] [38;2;250;193;28mPubChem_toxicity:[0m ----------------------------------------------------------------------------- [[1mINFO[0m]
[13.03.2025 01:56:34] [38;2;250;193;28mPubChem_toxicity:[0m organism: mouse                                                               [[1mINFO[0m]
[13.03.2025 01:56:34] [38;2;250;193;28mPubChem_toxicity:[0m     route: subcutaneous                                                       [[1mINFO[0m]
[13.03.2025 01:56:37] [38;2;250;193;28mPubChem_toxicity:[0m       test_type: LD50, len: 7039                                              [[1mINFO[0m]
[13.03.2025 01:56:38] [38;2;250;193;28mPubChem_toxicity:[0m       test_type: LDLo, len: 983                                               [[1mINFO[0m]
[13.0

# M3

In [59]:
# Read the combined "_m3" table from the results folder using the project-wide separator.
m3_df = pd.read_csv(
    "{}/{}_m3.csv".format(
        toxicity_config["results_folder_name"],
        toxicity_config["combined_file_name"],
    ),
    sep=config["csv_separator"],
    low_memory=False,
)