In [121]:
import os
import pandas as pd
import re
import numpy as np
# Load and prepare data
# Every column is read as text so identifiers are never coerced to numbers.
collecTRI_raw = pd.read_csv("CollecTRI_source.tsv", sep="\t", dtype="str")

# Unify column names across resources
collecTRI_raw.columns = (
    collecTRI_raw.columns.str.replace(" ", "_", regex=False)
    # A lone "[" or "(" is not a valid regular expression, and pandas 2.x no
    # longer treats single-character patterns as literals, so all four bracket
    # characters are stripped with one well-formed character class instead.
    .str.replace(r"[\[\]()]", "", regex=True)
    # The remaining patterns are plain text; regex=False makes that explicit
    # and independent of the pandas version's default.
    .str.replace("SIGNOR_Effect", "SIGNOR_effectType", regex=False)
    .str.replace("Effect", "Regulation", regex=False)
    .str.replace("Sign", "Regulation", regex=False)
    .str.replace("Activation/Repression", "Regulation", regex=False)
    .str.replace("Mode_of_action", "Regulation", regex=False)
    .str.replace(":", "_", regex=False)
)

collecTRI_raw = collecTRI_raw.set_index("TF_TG")

# Filter for dataframe containing only PMIDs and associated modes of regulation
df_merge = pd.merge(
    collecTRI_raw.filter(regex="PMID"),
    collecTRI_raw.filter(regex="Regulation"),
    left_index=True,
    right_index=True,
)
df_merge = df_merge.reset_index()

# Homogenize regulation modes
# The substitutions are substring-based, so they are applied to the Regulation
# columns only: run over the whole frame they would also rewrite "-", "+",
# "UP" or "DOWN" wherever those occur in TF_TG names or PMID cells.
regulation_cols = df_merge.columns[df_merge.columns.str.contains("Regulation")]
df_merge[regulation_cols] = (
    df_merge[regulation_cols]
    # "unknown" goes first so that "+_-" is still intact when its pattern is
    # tried; once "+" and "-" have been rewritten it can no longer match.
    .replace([r"", r"\?", r"\+_-", r"not_applicable", r"Unknown"], "unknown", regex=True)
    .replace([r'\+', r'activation', r"Activation", r"positive", r"Stimulate", r"UP"], 'activation', regex=True)
    .replace([r"\-", r"DOWN", r"Inhibit", r"negative", r"Repression"], r"repression", regex=True)
)
# NOTE(review): the original author flagged that this homogenization "does not
# work everywhere"; confirm the vocabulary above covers every resource.

# NOTE(review): only the first three rows are kept from here on. This looks like
# a development subset; remove it to process the full table.
df_merge = df_merge.head(3)
 

In [122]:
# Collect the names of the PMID columns and of the Regulation columns
## NOTE: the same selection could already be used earlier, when the PMID and Regulation frames are merged
pmid_cols_ls = [col for col in df_merge.columns if "PMID" in col]
reg_cols_ls = [col for col in df_merge.columns if "Regulation" in col]

# Reshape each column group to long format on its own, then combine them.

# PMID: one row per (TF_TG, resource)
df_pmid = df_merge.melt(
    id_vars="TF_TG",
    value_vars=pmid_cols_ls,
    var_name="resource",
    value_name="PMID",
)
# The resource name is the column name without its "_PMID" suffix
df_pmid["resource"] = df_pmid["resource"].str.replace("_PMID", "", regex=False)
# str() is applied to every cell, so a missing value becomes the text "nan"
df_pmid["PMID"] = df_pmid["PMID"].map(lambda entry: str(entry).replace(";", ","))

# Regulation: one row per (TF_TG, resource)
df_reg = df_merge.melt(
    id_vars=["TF_TG"],
    value_vars=reg_cols_ls,
    var_name="resource",
    value_name="regulation",
)
df_reg["resource"] = df_reg["resource"].str.replace("_Regulation", "", regex=False)

# Left join: every PMID row is kept and gains the regulation reported by the same resource, if any
TF_TG_df = df_pmid.merge(df_reg, how="left", on=["TF_TG", "resource"])

In [123]:

# Use one placeholder for missing data: both the text "nan" and real NaN become "NA"
TF_TG_df = TF_TG_df.replace("nan", np.nan).fillna("NA")

In [124]:
# Show the rows of every resource for the MYC:TERT pair
TF_TG_df.loc[TF_TG_df["TF_TG"] == "MYC:TERT"]

Unnamed: 0,TF_TG,resource,PMID,regulation
0,MYC:TERT,ExTRI,"10022128,10022128,10022128,10491298,10491298,1...",
3,MYC:TERT,HTRI,11274400|12695333,
6,MYC:TERT,TRRUST,"10022128,15595642,18226852,18754863,21132266,2...",activation|unknown
9,MYC:TERT,TFactS,14519204,activation
12,MYC:TERT,GOA,,activation
15,MYC:TERT,IntAct,,
18,MYC:TERT,SIGNOR,,
21,MYC:TERT,CytReg,,
24,MYC:TERT,GEREDB,,
27,MYC:TERT,NTNU_Curated,11606399|11916966|||,activation|repression|||


In [125]:
def _split_alternatives(column):
    """Split each '|'-separated entry of `column` into one piece per column.

    Commas inside a piece are turned into ';' so that ',' stays free to act as
    the separator when the pieces are joined again.
    """
    pieces = column.str.split("|", expand=True)
    return pieces.replace([r"\,"], ";", regex=True)


def _pad_and_join(pieces, width, name):
    """Pad `pieces` to `width` columns with 'NA' and join each row with ','.

    Returns a two-column frame: the row labels of `pieces` (moved into a
    column by ``reset_index``) and the joined strings under `name`.
    """
    padded = pieces.reindex(columns=range(width)).fillna("NA")
    # Reuse the row labels of the input so the joined strings stay attached to
    # the right rows even when the frame does not carry a 0..n-1 index.
    joined = pd.Series(
        padded.values.tolist(), index=pieces.index, name=name, dtype="object"
    ).str.join(",")
    return joined.to_frame().reset_index()


pmid_pieces = _split_alternatives(TF_TG_df["PMID"])
reg_pieces = _split_alternatives(TF_TG_df["regulation"])

# Both columns are padded to the same number of slots. A later multi-column
# explode needs the PMID list and the regulation list of a row to have the
# same length, which is not guaranteed if each column is padded to its own width.
n_slots = max(pmid_pieces.shape[1], reg_pieces.shape[1])

#PMID
pmid_fill = _pad_and_join(pmid_pieces, n_slots, "PMID")
#regulations
reg_fill = _pad_and_join(reg_pieces, n_slots, "regulation")

In [128]:
# Combine the joined PMID and regulation strings on the columns the two frames have in common
merge = pmid_fill.merge(reg_fill)

# Move the row labels into an 'index' column and rename the original value columns to '*_old'
TF_TG_df = TF_TG_df.reset_index().set_axis(
    ["index", "TF_TG", "resource", "PMID_old", "regulation_old"], axis=1
)

# Attach the reformatted PMID / regulation columns to the original rows
new = TF_TG_df.merge(merge)
new

Unnamed: 0,index,TF_TG,resource,PMID_old,regulation_old,PMID,regulation
0,0,MYC:TERT,ExTRI,"10022128,10022128,10022128,10491298,10491298,1...",,10022128;10022128;10022128;10491298;10491298;1...,"NA,NA,NA,NA,NA"
1,1,AP1:CAT,ExTRI,"10022519,10329043,12036993,12538496,17935786,7...",,10022519;10329043;12036993;12538496;17935786;7...,"NA,NA,NA,NA,NA"
2,2,SPI1:BGLAP,ExTRI,10022617,,"10022617,NA,NA,NA,NA","NA,NA,NA,NA,NA"
3,3,MYC:TERT,HTRI,11274400|12695333,,"11274400,12695333,NA,NA,NA","NA,NA,NA,NA,NA"
4,4,AP1:CAT,HTRI,,,"NA,NA,NA,NA,NA","NA,NA,NA,NA,NA"
5,5,SPI1:BGLAP,HTRI,,,"NA,NA,NA,NA,NA","NA,NA,NA,NA,NA"
6,6,MYC:TERT,TRRUST,"10022128,15595642,18226852,18754863,21132266,2...",activation|unknown,10022128;15595642;18226852;18754863;21132266;2...,"activation,unknown,NA,NA,NA"
7,7,AP1:CAT,TRRUST,,,"NA,NA,NA,NA,NA","NA,NA,NA,NA,NA"
8,8,SPI1:BGLAP,TRRUST,,,"NA,NA,NA,NA,NA","NA,NA,NA,NA,NA"
9,9,MYC:TERT,TFactS,14519204,activation,"14519204,NA,NA,NA,NA","activation,NA,NA,NA,NA"


In [129]:
# Turn the comma-joined strings into lists, then unpack both lists in parallel
# so that each output row holds one PMID entry with its matching regulation.
new = (
    new
    .assign(
        PMID=lambda frame: frame["PMID"].str.split(","),
        regulation=lambda frame: frame["regulation"].str.split(","),
    )
    .explode(["PMID", "regulation"])
)


In [135]:
# Keep a row only when neither its PMID nor its regulation is the 'NA' placeholder
has_pmid = new["PMID"].ne("NA")
has_regulation = new["regulation"].ne("NA")
new = new.loc[has_pmid & has_regulation]

In [138]:
# Keep a row only when neither its PMID nor its regulation is an empty string
pmid_not_empty = new["PMID"].ne("")
regulation_not_empty = new["regulation"].ne("")
new = new.loc[pmid_not_empty & regulation_not_empty]


In [139]:
# Show the remaining PMID / regulation pairs for MYC:TERT
new.loc[new["TF_TG"] == "MYC:TERT"]

Unnamed: 0,index,TF_TG,resource,PMID_old,regulation_old,PMID,regulation
6,6,MYC:TERT,TRRUST,"10022128,15595642,18226852,18754863,21132266,2...",activation|unknown,10022128;15595642;18226852;18754863;21132266;2...,activation
6,6,MYC:TERT,TRRUST,"10022128,15595642,18226852,18754863,21132266,2...",activation|unknown,10637317;12941894;14611815;15958520;17706770;1...,unknown
9,9,MYC:TERT,TFactS,14519204,activation,14519204,activation
27,27,MYC:TERT,NTNU_Curated,11606399|11916966|||,activation|repression|||,11606399,activation
27,27,MYC:TERT,NTNU_Curated,11606399|11916966|||,activation|repression|||,11916966,repression
30,30,MYC:TERT,Pavlidis2021,21627565|10022128|16880523|10022128,unknown|activation|activation|activation,21627565,unknown
30,30,MYC:TERT,Pavlidis2021,21627565|10022128|16880523|10022128,unknown|activation|activation|activation,10022128,activation
30,30,MYC:TERT,Pavlidis2021,21627565|10022128|16880523|10022128,unknown|activation|activation|activation,16880523,activation
30,30,MYC:TERT,Pavlidis2021,21627565|10022128|16880523|10022128,unknown|activation|activation|activation,10022128,activation
33,33,MYC:TERT,DoRothEA_A,1127440012695333,activation,11274400;12695333,activation
