In [1]:
"""Code for Safety Module."""

import glob
from typing import List, Optional

import pandas as pd
from loguru import logger

from tde import target
from tde.target.utils import read_parquet

# Define directories in use
RAW_DATA_DIR = "/home/owkin/project/target/raw_data"
PROCESS_DATA_DIR = "/home/owkin/project/target/processed_data"


def _reformat_ot_safety(df: pd.DataFrame, col_explode: str, cols_subset_ls: List[str]) -> pd.DataFrame:
    """Create a dataframe by exploding safetyLiabilities column."""
    # Create temporary dataframes to explode lists of dictionaries and flatten out values of interest into columns
    df_temp1 = df.explode(col_explode)
    # Drop rows with missing values in safetyLiabilities
    df_temp1.dropna(subset=[col_explode], inplace=True)
    df_temp1.reset_index(inplace=True)

    df_temp2 = df_temp1[col_explode].apply(pd.Series, dtype="object")
    df_temp3 = df_temp2.explode("biosamples")
    df_temp4 = df_temp3["biosamples"].apply(pd.Series, dtype="object")[["cellLabel", "tissueLabel"]]

    # Concatenate all temporary dataframes into one
    df_concat = pd.concat([df_temp1, df_temp2, df_temp4], axis=1)

    # Subset for columns of interest
    df_subset = df_concat[cols_subset_ls]

    return df_subset



In [5]:

col_explode = "safetyLiabilities"
cols_subset_ls = ["gene_id", "cellLabel", "tissueLabel", "event", "datasource"]

files_ls = glob.glob(f"{RAW_DATA_DIR}/targets/*.parquet")

# Read in all parquet files into single dataframe
parquet_df_ls = [
    read_parquet(file_path=file, cols_subset_ls=["gene_id", "tractability", "safetyLiabilities"])
    for file in files_ls
]

df = pd.concat(parquet_df_ls, ignore_index=True)


In [74]:
df[df["safetyLiabilities"].notna()]

Unnamed: 0,gene_id,tractability,safetyLiabilities
13,ENSG00000105641,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'Cognitive Function, Decreased', 'e..."
31,ENSG00000138823,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'Increased, Liver Steatosis', 'even..."
45,ENSG00000163586,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...",[{'event': 'regulation of transcription factor...
55,ENSG00000169410,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'regulation of catalytic activity',..."
138,ENSG00000232810,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'regulation of gene expression', 'e..."
...,...,...,...
62070,ENSG00000163285,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'increased eating', 'eventId': 'HP_..."
62354,ENSG00000050748,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'regulation of catalytic activity',..."
62367,ENSG00000111087,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...",[{'event': 'regulation of transcription factor...
62432,ENSG00000181072,"[{'modality': 'SM', 'id': 'Approved Drug', 'va...","[{'event': 'peripheral edema', 'eventId': None..."


In [6]:
safety_df = _reformat_ot_safety(df, col_explode ,cols_subset_ls)



In [17]:
#genes with not empty safety events
safety_genes=df["gene_id"][df["safetyLiabilities"].notna()]

In [34]:
gene=safety_genes.values[1]

In [None]:
safety_df[safety_df["gene_id"]==gene]

In [57]:
from pprint import pprint
from numpy import array

In [55]:
pprint(df["safetyLiabilities"][df["gene_id"]==gene].tolist())

[array([{'event': 'Increased, Liver Steatosis', 'eventId': 'HP_0001397', 'effects': array([{'direction': 'activation', 'dosing': None}], dtype=object), 'biosamples': None, 'isHumanApplicable': False, 'datasource': 'AOP-Wiki', 'literature': None, 'url': 'https://aopwiki.org/aops/61', 'studies': None}],
      dtype=object)]


In [82]:
varList=[array([{'event': 'Increased, Liver Steatosis', 'eventId': 'HP_0001397', 'effects': array([{'direction': 'activation', 'dosing': None}], dtype=object), 'biosamples': None, 'isHumanApplicable': False, 'datasource': 'AOP-Wiki', 'literature': None, 'url': 'https://aopwiki.org/aops/61', 'studies': None}],
      dtype=object)]

In [93]:
test = pd.DataFrame({"gene_id": gene, "tractability": [None], "safetyLiabilities": [varList]})
  

In [72]:
test

Unnamed: 0,gene_id,tractability,safetyLiabilities
0,ENSG00000138823,,"[{'event': 'Increased, Liver Steatosis', 'even..."


In [70]:
expected = pd.DataFrame({"gene_id": ["ENSG00000138823"], 
        "cellLabel": None,
        "tissueLabel": None,
        "event": "Increased, Liver Steatosis",
        "datasource": "AOP-Wiki"})



In [71]:
expected

Unnamed: 0,gene_id,cellLabel,tissueLabel,event,datasource
0,ENSG00000138823,,,"Increased, Liver Steatosis",AOP-Wiki


In [95]:
df_temp1=test.explode("safetyLiabilities")
df_temp1.dropna(subset=[col_explode], inplace=True)
df_temp1.reset_index(inplace=True)


In [111]:
test2=df[df["gene_id"]==gene]
test2.explode(col_explode)
test2.reset_index(inplace=True)

In [112]:
test2[col_explode].apply(pd.Series, dtype="object")

Unnamed: 0,0
0,"{'event': 'Increased, Liver Steatosis', 'event..."


In [98]:
df_temp2

Unnamed: 0,0
0,"{'event': 'Increased, Liver Steatosis', 'event..."


In [97]:
df_temp3 = df_temp2.explode("biosamples")

KeyError: 'biosamples'

In [92]:
# Drop rows with missing values in safetyLiabilities


df_temp2 = df_temp1[col_explode].apply(pd.Series, dtype="object")
df_temp3 = df_temp2.explode("biosamples")
df_temp4 = df_temp3["biosamples"].apply(pd.Series, dtype="object")[["cellLabel", "tissueLabel"]]

# Concatenate all temporary dataframes into one
df_concat = pd.concat([df_temp1, df_temp2, df_temp4], axis=1)

# Subset for columns of interest
df_subset = df_concat[cols_subset_ls]


KeyError: "None of [Index(['cellLabel', 'tissueLabel'], dtype='object')] are in the [columns]"