In [2]:
"""Code for Safety Module."""

import glob
from typing import List, Optional

import pandas as pd
import numpy as np
from loguru import logger

from tde import target
from tde.target.utils import read_parquet

# Define directories in use
RAW_DATA_DIR = "/home/owkin/project/target/raw_data"
PROCESS_DATA_DIR = "/home/owkin/project/target/processed_data"


def _reformat_ot_safety(df: pd.DataFrame, col_explode: str, cols_subset_ls: List[str]) -> pd.DataFrame:
    """Create a dataframe by exploding safetyLiabilities column."""
    # Create temporary dataframes to explode lists of dictionaries and flatten out values of interest into columns
    df_temp1 = df.explode(col_explode)
    # Drop rows with missing values in safetyLiabilities
    df_temp1.dropna(subset=[col_explode], inplace=True)
    df_temp1.reset_index(inplace=True)

    df_temp2 = df_temp1[col_explode].apply(pd.Series, dtype="object")
    df_temp3 = df_temp2.explode("biosamples")
    df_temp4 = df_temp3["biosamples"].apply(pd.Series, dtype="object")[["cellLabel", "tissueLabel"]]


    # Concatenate all temporary dataframes into one
    df_concat = pd.concat([df_temp1, df_temp2, df_temp4], axis=1)

    # Subset for columns of interest
    df_subset = df_concat[cols_subset_ls]

    return df_subset



In [3]:
# Working directory
!pwd

/home/owkin/target_discovery_engine/tde/target


In [4]:

col_explode = "safetyLiabilities"
cols_subset_ls = ["gene_id", "cellLabel", "tissueLabel", "event", "datasource"]

files_ls = glob.glob(f"{RAW_DATA_DIR}/targets/*.parquet")

# Read in all parquet files into single dataframe
parquet_df_ls = [
    read_parquet(file_path=file, cols_subset_ls=["gene_id", "tractability", "safetyLiabilities"])
    for file in files_ls
]

df = pd.concat(parquet_df_ls, ignore_index=True)


In [5]:
df[df["safetyLiabilities"].notna()]

Unnamed: 0,gene_id,tractability,safetyLiabilities
13,ENSG00000105641,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'Cognitive Function, Decreased', 'e..."
31,ENSG00000138823,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'Increased, Liver Steatosis', 'even..."
45,ENSG00000163586,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...",[{'event': 'regulation of transcription factor...
55,ENSG00000169410,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'regulation of catalytic activity',..."
138,ENSG00000232810,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'regulation of gene expression', 'e..."
...,...,...,...
62070,ENSG00000163285,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'increased eating', 'eventId': 'HP_..."
62354,ENSG00000050748,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'regulation of catalytic activity',..."
62367,ENSG00000111087,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...",[{'event': 'regulation of transcription factor...
62432,ENSG00000181072,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'peripheral edema', 'eventId': None..."


In [7]:
# Reformat dataframe using _reformat_ot_safety function
safety_df = _reformat_ot_safety(df, col_explode ,cols_subset_ls)
safety_df

Unnamed: 0,gene_id,cellLabel,tissueLabel,event,datasource
0,ENSG00000105641,,,"Cognitive Function, Decreased",AOP-Wiki
1,ENSG00000105641,,,"Altered, Amphibian metamorphosis",AOP-Wiki
2,ENSG00000105641,,,"Increase, Adenomas/carcinomas (follicular cell)",AOP-Wiki
3,ENSG00000105641,HEK293T,,regulation of transporter activity,ToxCast
4,ENSG00000105641,,,"Cognitive Function, Decreased",AOP-Wiki
...,...,...,...,...,...
2584,ENSG00000181072,,cardiovascular,increased/decreased blood pressure,Lynch et al. (2017)
2585,ENSG00000181072,,nervous system,irritability,Lynch et al. (2017)
2585,ENSG00000181072,,respiratory,irritability,Lynch et al. (2017)
2585,ENSG00000181072,,cardiovascular,irritability,Lynch et al. (2017)


In [8]:
#genes with not empty safety events
safety_genes=df["gene_id"][df["safetyLiabilities"].notna()]

In [10]:
# Choose a gene that has multiple entries
gene = "ENSG00000105641"
safety_df[safety_df["gene_id"]==gene]

Unnamed: 0,gene_id,cellLabel,tissueLabel,event,datasource
0,ENSG00000105641,,,"Cognitive Function, Decreased",AOP-Wiki
1,ENSG00000105641,,,"Altered, Amphibian metamorphosis",AOP-Wiki
2,ENSG00000105641,,,"Increase, Adenomas/carcinomas (follicular cell)",AOP-Wiki
3,ENSG00000105641,HEK293T,,regulation of transporter activity,ToxCast
4,ENSG00000105641,,,"Cognitive Function, Decreased",AOP-Wiki
5,ENSG00000105641,,,"Impairment, Learning and memory",AOP-Wiki


In [11]:
from pprint import pprint
from numpy import array

In [13]:
# Gene entry in input dataframe (merge of parquet files)
df["safetyLiabilities"][df["gene_id"]==gene]

13    [{'event': 'Cognitive Function, Decreased', 'e...
Name: safetyLiabilities, dtype: object

In [14]:
# Expanded information inside safetyLiabilities column
pprint(df["safetyLiabilities"][df["gene_id"]==gene].tolist())

[array([{'event': 'Cognitive Function, Decreased', 'eventId': 'HP_0100543', 'effects': array([{'direction': 'inhibition', 'dosing': None}], dtype=object), 'biosamples': None, 'isHumanApplicable': None, 'datasource': 'AOP-Wiki', 'literature': None, 'url': 'https://aopwiki.org/aops/134', 'studies': None},
       {'event': 'Altered, Amphibian metamorphosis', 'eventId': None, 'effects': array([{'direction': 'inhibition', 'dosing': None}], dtype=object), 'biosamples': None, 'isHumanApplicable': False, 'datasource': 'AOP-Wiki', 'literature': None, 'url': 'https://aopwiki.org/aops/176', 'studies': None},
       {'event': 'Increase, Adenomas/carcinomas (follicular cell)', 'eventId': 'EFO_0000616', 'effects': array([{'direction': 'inhibition', 'dosing': None}], dtype=object), 'biosamples': None, 'isHumanApplicable': False, 'datasource': 'AOP-Wiki', 'literature': None, 'url': 'https://aopwiki.org/aops/110', 'studies': None},
       {'event': 'regulation of transporter activity', 'eventId': None,

In [15]:
# Keep only two entries to build test dataframe
varList=df["safetyLiabilities"][df["gene_id"]==gene].tolist()[0].tolist()[2:4]
varList

[{'event': 'Increase, Adenomas/carcinomas (follicular cell)',
  'eventId': 'EFO_0000616',
  'effects': array([{'direction': 'inhibition', 'dosing': None}], dtype=object),
  'biosamples': None,
  'isHumanApplicable': False,
  'datasource': 'AOP-Wiki',
  'literature': None,
  'url': 'https://aopwiki.org/aops/110',
  'studies': None},
 {'event': 'regulation of transporter activity',
  'eventId': None,
  'effects': None,
  'biosamples': array([{'cellFormat': 'cell line', 'cellLabel': 'HEK293T', 'tissueId': None, 'tissueLabel': None}],
        dtype=object),
  'isHumanApplicable': None,
  'datasource': 'ToxCast',
  'literature': None,
  'url': 'https://www.epa.gov/chemical-research/exploring-toxcast-data-downloadable-data',
  'studies': array([{'description': 'NIS_RAIU, is one of 2 assay component(s) measured or calculated from the NIS_Inhibition assay. It is designed to make measurements of enzyme activity, a form of enzyme reporter, as detected with 125-i signals by MicroBeta radioactivit

In [20]:
# Manual construction of varlist
varList= [{'event': 'Increase, Adenomas/carcinomas (follicular cell)',
  'eventId': 'EFO_0000616',
  'effects': array([{'direction': 'inhibition', 'dosing': None}], dtype=object),
  'biosamples': None,
  'isHumanApplicable': False,
  'datasource': 'AOP-Wiki',
  'literature': None,
  'url': 'https://aopwiki.org/aops/110',
  'studies': None},
 {'event': 'regulation of transporter activity',
  'eventId': None,
  'effects': None,
  'biosamples': array([{'cellFormat': 'cell line', 'cellLabel': 'HEK293T', 'tissueId': None, 'tissueLabel': None}],
        dtype=object),
  'isHumanApplicable': None,
  'datasource': 'ToxCast',
  'literature': None,
  'url': 'https://www.epa.gov/chemical-research/exploring-toxcast-data-downloadable-data',
  'studies': array([{'description': 'NIS_RAIU, is one of 2 assay component(s) measured or calculated from the NIS_Inhibition assay. It is designed to make measurements of enzyme activity, a form of enzyme reporter, as detected with 125-i signals by MicroBeta radioactivity plate reader technology.', 'name': 'NIS_RAIU_inhibition', 'type': None}],
        dtype=object)}]

In [22]:
test = pd.DataFrame({"gene_id": gene, "tractability": [None], 
"safetyLiabilities":[varList]})
test

Unnamed: 0,gene_id,tractability,safetyLiabilities
0,ENSG00000105641,,"[{'event': 'Increase, Adenomas/carcinomas (fol..."


In [23]:
# Original input dataframe
df[df.gene_id==gene]

Unnamed: 0,gene_id,tractability,safetyLiabilities
13,ENSG00000105641,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'Cognitive Function, Decreased', 'e..."


In [24]:
expected = pd.DataFrame({"gene_id": ["ENSG00000163586", "ENSG00000163586"], 
        "cellLabel": [np.nan, "HepaRG"],
        "tissueLabel": [np.nan, np.nan],
        "event": ["Increase, Adenomas/carcinomas (follicular cell)", "regulation of transcription factor activity"],
        "datasource": ["AOP-Wiki", "ToxCast"]})
expected


Unnamed: 0,gene_id,cellLabel,tissueLabel,event,datasource
0,ENSG00000163586,,,"Increase, Adenomas/carcinomas (follicular cell)",AOP-Wiki
1,ENSG00000163586,HepaRG,,regulation of transcription factor activity,ToxCast


In [25]:
df_temp1=test.explode("safetyLiabilities")
df_temp1.dropna(subset=[col_explode], inplace=True)
df_temp1.reset_index(inplace=True)
df_temp1.head()


Unnamed: 0,index,gene_id,tractability,safetyLiabilities
0,0,ENSG00000105641,,"{'event': 'Increase, Adenomas/carcinomas (foll..."
1,0,ENSG00000105641,,{'event': 'regulation of transporter activity'...


In [26]:
test2=df[df["gene_id"]==gene]
test2.explode(col_explode)
test2.reset_index(inplace=True)
test2

Unnamed: 0,index,gene_id,tractability,safetyLiabilities
0,13,ENSG00000105641,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'Cognitive Function, Decreased', 'e..."


In [27]:
test2[col_explode].apply(pd.Series, dtype="object")

Unnamed: 0,0,1,2,3,4,5
0,"{'event': 'Cognitive Function, Decreased', 'ev...","{'event': 'Altered, Amphibian metamorphosis', ...","{'event': 'Increase, Adenomas/carcinomas (foll...",{'event': 'regulation of transporter activity'...,"{'event': 'Cognitive Function, Decreased', 'ev...","{'event': 'Impairment, Learning and memory', '..."


In [28]:
df_temp2 = df_temp1[col_explode].apply(pd.Series, dtype="object")
df_temp2.head()

Unnamed: 0,event,eventId,effects,biosamples,isHumanApplicable,datasource,literature,url,studies
0,"Increase, Adenomas/carcinomas (follicular cell)",EFO_0000616,"[{'direction': 'inhibition', 'dosing': None}]",,False,AOP-Wiki,,https://aopwiki.org/aops/110,
1,regulation of transporter activity,,,"[{'cellFormat': 'cell line', 'cellLabel': 'HEK...",,ToxCast,,https://www.epa.gov/chemical-research/explorin...,"[{'description': 'NIS_RAIU, is one of 2 assay ..."


In [30]:
df_temp3 = df_temp2.explode("biosamples")
df_temp3

Unnamed: 0,event,eventId,effects,biosamples,isHumanApplicable,datasource,literature,url,studies
0,"Increase, Adenomas/carcinomas (follicular cell)",EFO_0000616,"[{'direction': 'inhibition', 'dosing': None}]",,False,AOP-Wiki,,https://aopwiki.org/aops/110,
1,regulation of transporter activity,,,"{'cellFormat': 'cell line', 'cellLabel': 'HEK2...",,ToxCast,,https://www.epa.gov/chemical-research/explorin...,"[{'description': 'NIS_RAIU, is one of 2 assay ..."


In [31]:
df_temp3["biosamples"].apply(pd.Series, dtype="object")

Unnamed: 0,cellFormat,cellLabel,tissueId,tissueLabel
0,,,,
1,cell line,HEK293T,,


In [32]:
# Drop rows with missing values in safetyLiabilities

df_temp2 = df_temp1[col_explode].apply(pd.Series, dtype="object")
df_temp3 = df_temp2.explode("biosamples")
df_temp4 = df_temp3["biosamples"].apply(pd.Series, dtype="object")[["cellLabel", "tissueLabel"]]


# Concatenate all temporary dataframes into one
df_concat = pd.concat([df_temp1, df_temp2, df_temp4], axis=1)

# Subset for columns of interest
df_subset = df_concat[cols_subset_ls]
df_subset


Unnamed: 0,gene_id,cellLabel,tissueLabel,event,datasource
0,ENSG00000105641,,,"Increase, Adenomas/carcinomas (follicular cell)",AOP-Wiki
1,ENSG00000105641,HEK293T,,regulation of transporter activity,ToxCast


In [33]:
# Compare to expected df
expected

Unnamed: 0,gene_id,cellLabel,tissueLabel,event,datasource
0,ENSG00000163586,,,"Increase, Adenomas/carcinomas (follicular cell)",AOP-Wiki
1,ENSG00000163586,HepaRG,,regulation of transcription factor activity,ToxCast
