# Get List of TF NCBI IDs 
Get the list of IDs that will be considered TF, both from GO terms and from TFCheckpoint.

The GO terms and columns from TFCheckpoint used for each of the TF types is detailed in the cell below. The specific procedure used is explained in their respective sections:  [Get GO terms](#get-go-terms) and [GET TFCheckpoint terms](#get-tfcheckpoint-terms)

In [20]:
__import__('sys').path.append('../common/'); __import__('notebook_utils').table_of_contents('get_NCBI_TF_IDs.ipynb')

<h3>Table of contents</h3>


[Get List of TF NCBI IDs](#Get-List-of-TF-NCBI-IDs)
- [Setup](#Setup)
- [Get GO terms](#Get-GO-terms)
- [Get TFCheckpoint terms](#Get-TFCheckpoint-terms)
- [Get final TF set for the pipeline](#Get-final-TF-set-for-the-pipeline)
- [Create final TF table](#Create-final-TF-table)

## Setup

In [21]:
# IMPORTS
import pandas as pd
from IPython.display import display, HTML
import requests

from Bio import Entrez
# *Always* tell NCBI who you are
Entrez.email = "example24@gmail.com"

import sys
sys.path.append('../common')
from notebook_utils import h3, h4, h5, md

In [22]:
# GO TERM & TFCHECKPOINT VARIABLES

# GO terms used:
GO_dbTF = ["GO:0003700"]
GO_coTF = ["GO:0003712", "GO:0001098", "GO:0002039" , "GO:0008134" , "GO:0042393", "GO:0046332", "GO:0006325", "GO:0140993"]

# Columns from TFCheckpoint used:
TFCheckpoint_cols = {
    'dbTF': ['TFclass.present.merged', 'lambert_2018.present', 'Lovering_2021.present'],
    'coTF': ['animal_tfdb_Homo_sapiens_cofactors.present', 'animal_tfdb_Mus_musculus_cofactors.present', 'animal_tfdb_Rattus_norvegicus_cofactors.present', 
             'tcof_cotf_human.present', 'tcof_cotf_mouse.present']
}

In [23]:
# PATHS
in_data_path = '../../data/external/TF_id/'
postprocessing_path = '../../data/postprocessing/'

less_likely_coTFs_path = postprocessing_path + 'all_coTFs_likely_checked_updated_AL.txt'

QuickGO_dbTF_path = in_data_path + "QuickGO-annotations-dbTF.tsv"
QuickGO_coTF_path = in_data_path + "QuickGO-annotations-coTF.tsv"
TFCheckpoint_path = in_data_path + "TFCheckpoint.tsv"

# Define a function to construct the TF types path (ll_coTFs are introduced later)
TF_types = ["dbTF", "coTF"]
def get_TF_ids_path(TF_type, out_data_path):
    return f"{out_data_path}{TF_type}_entrez_code.list"

In [24]:
# LOAD LIKELY & LESS LIKELY COTFs
ll_coTFs_db = pd.read_csv(less_likely_coTFs_path, sep="\t", dtype='str')
m = ll_coTFs_db['likely'] == 'likely'
ll_coTF = set(ll_coTFs_db[~m]['NCBI ID'])

## Get GO terms

Obtained the GO terms from [QuickGO](https://www.ebi.ac.uk/QuickGO/annotations?taxonId=10116,9606,10090&taxonUsage=exact&goId=GO:0140993,GO:0003712,GO:0003700,GO:0140223,GO:0001098,GO:0002039,GO:0008134,GO:0042393,GO:0046332,GO:0006325&goUsageRelationships=is_a,part_of,occurs_in&goUsage=descendants&geneProductSubset=Swiss-Prot&geneProductType=protein), using the terms shown below. Used as filters:

* **Taxon:** 10116, 9606, 10090, Exact match (do not include descendants)
* **Gene products:** Reviewed (not Unreviewed)
* **GO terms:**.
  * **dbTF:** GO:0003700 (DNA-binding transcription factor activity)
  * **coTF:** GO:0140993 (histone modifying activity), GO:0008134 (transcription factor binding), GO:0003712 (transcription coregulator activity), GO:0001098 (basal transcription machinery binding), GO:0002039 (p53 binding), GO:0042393 (histone binding), GO:0046332 (SMAD binding) and GO:0006325 (chromatin organization)
* **Export as:** tsv

Downloaded separately a QuickGO tsv file for each TF type and renamed it as shown above in the setup section.

As some terms can be identified as pertaining to more than 1 type, we have followed this hierarchy to remove duplicates:
1. dbTF
2. coTF

That implies that if a protein is classified as both dbTF and coTF, the protein's classification will be dbTF.

In [25]:
# VARIABLES
# Species:
organismToTaxID = {
    "hsapiens": "9606",
    "mmusculus": "10090",
    "rnorvegicus": "10116"}

In [26]:
# FUNCTIONS
def fetch_gene_ids_gprofiler(gene_symbols: list, organism: str) -> dict:
    "Get NCBI Gene IDs from GProfiler"
    symboltoID = {}

    # Query the IDs from GProfiler
    result = requests.post(
        url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
        json={
            'organism': organism,
            'target':'ENTREZGENE_ACC',
            'query': gene_symbols,
        }
        )
    
    # Create a list of extracted IDs per symbol
    for r in result.json()['result']:
        incoming = r['incoming']
        converted = r['converted']

        if incoming not in symboltoID:
            symboltoID[incoming] = []
        if converted != 'None':
            symboltoID[incoming].append(converted)

    return symboltoID

def retrieve_annotations_entrez(id_list):
    """Annotates Entrez Gene IDs using Bio.Entrez, in particular epost (to
    submit the data to NCBI) and esummary to retrieve the information.
    Returns a list of dictionaries with the annotations."""

    request = Entrez.epost("gene", id=",".join(id_list))
    result = Entrez.read(request)
    webEnv = result["WebEnv"]
    queryKey = result["QueryKey"]
    data = Entrez.esummary(db="gene", webenv=webEnv, query_key=queryKey)
    annotations = Entrez.read(data)
    annotationsSummary = annotations['DocumentSummarySet']['DocumentSummary']

    assert len(id_list) == len(annotationsSummary), f"id_list and annotationsSummary are of different length: {len(id_list)} != {len(annotationsSummary)}"

    return annotationsSummary

In [27]:
# JOIN QUICKGO TSVs
# Create joined DataFrame from the 3 TF types
QuickGO_dbTF = pd.read_csv(QuickGO_dbTF_path, sep='\t', header=0, keep_default_na=False, dtype='str')
QuickGO_dbTF['TF type'] = 'dbTF'
QuickGO_coTF = pd.read_csv(QuickGO_coTF_path, sep='\t', header=0, keep_default_na=False, dtype='str')
QuickGO_coTF['TF type'] = 'coTF' 

QuickGO = pd.concat([QuickGO_dbTF, QuickGO_coTF], axis=0)

print(f"{len(QuickGO['SYMBOL'])} rows were retrieved.")

# Only keep relevant columns
QuickGO = QuickGO[['SYMBOL', 'TAXON ID', 'TF type', 'GO TERM']]

# Drop repeated cells. Use the following priority if duplicates of different TF type
priority = {'dbTF': 0, 'coTF': 1}
QuickGO['priority'] = QuickGO['TF type'].map(priority)
QuickGO = QuickGO.sort_values(by=['SYMBOL', 'TAXON ID', 'priority'])
QuickGO = QuickGO.drop_duplicates(subset=['SYMBOL', 'TAXON ID', 'GO TERM'], keep='first')
QuickGO = (
    QuickGO
    .groupby(["SYMBOL", "TAXON ID"], as_index=False)
    .agg({
        'TF type': lambda x: ";".join(sorted(set(x.astype(str)))),
        'GO TERM': 'first'   # take top-priority GO TERM
    })
)

# Show results
print(f"Removing duplicates, we retrieve {len(QuickGO['SYMBOL'])} symbols:")
for TF_type in ('dbTF', 'coTF'):
    print(f"\t{len(QuickGO[QuickGO['TF type'] == TF_type]['SYMBOL'])} {TF_type}s")

25053 rows were retrieved.
Removing duplicates, we retrieve 6005 symbols:
	2184 dbTFs
	3156 coTFs


In [28]:
# GET GENE IDs FROM GPROFILER
for organism in ['hsapiens', 'mmusculus', 'rnorvegicus']:
    # Get IDs from GProfiler
    symbols = list(QuickGO[QuickGO['TAXON ID'] == organismToTaxID[organism]]['SYMBOL'].unique())
    symboltoID = fetch_gene_ids_gprofiler(symbols, organism)

    # Map them to QuickGO db
    m = QuickGO['TAXON ID'] == organismToTaxID[organism]
    QuickGO.loc[m, "TF ID"] = QuickGO[m]['SYMBOL'].apply(lambda symbol: symboltoID[symbol])

m = ~(QuickGO['TF ID'].str.len() == 0)
print(f'GProfiler retrieved {m.sum() / len(QuickGO):.1%} NCBI Gene IDs from the QuickGO symbols')
print(f"It couldn't retrieve {(~m).sum()} of them")

GProfiler retrieved 98.2% NCBI Gene IDs from the QuickGO symbols
It couldn't retrieve 108 of them


We retrieve the rest through a query to Entrez.

In [29]:
# QUERY THE REST FROM ENTREZ
# Entrez often gets stuck, so it's best to give some time between queries
import time
ids = []
for TaxID in ['9606', '10090', '10116']:

    # Get the symbols with missing ID
    m = (QuickGO['TF ID'].str.len() == 0) & (QuickGO['TAXON ID'] == TaxID)
    missing_symbols = list(QuickGO[m]['SYMBOL'].unique())

    nSymbols = 15 # Symbols per query
    for i in range(0, len(missing_symbols), nSymbols):
        symbols = missing_symbols[i:i+nSymbols]
        # Query them from Entrez
        symbolsQuery = sorted([s+'[Preferred Symbol]' for s in symbols])
        query = f'({" OR ".join(symbolsQuery)}) AND txid{TaxID}[Organism]'
        handle = Entrez.esearch(db="gene", term=query, retmode="xml")
        record = Entrez.read(handle)
        ids.append(record.get("IdList", []))
        print(TaxID)
        # Sleep between queries to not get blocked
        time.sleep(5)

9606
9606
9606
10090
10090
10116
10116
10116
10116


In [30]:
# Map IDs back to its symbol & organism
all_ids = [j for i in ids for j in i]
remaining_annotations = retrieve_annotations_entrez(all_ids)

# Make a map from symbol/TaxID to Gene ID
symboltoID_entrez = {'9606': {}, '10116': {}, '10090': {}}
for id, ann in zip(all_ids, remaining_annotations):
    symbol = ann['Name']
    TaxID = ann['Organism']['TaxID']

    if symbol not in symboltoID_entrez[TaxID]:
        symboltoID_entrez[TaxID][symbol] = [id]
    else:
        symboltoID_entrez[TaxID][symbol].append(id)

# Check how many we retrieved from Entrez
m = (QuickGO['TF ID'].str.len() == 0)
print(f"{len(all_ids)} out of the remaining {m.sum()} missing have been retrieved through Entrez.")

# Map the retrieved ones to the QuickGO db
for TaxID in symboltoID_entrez.keys():
    m = (QuickGO['TF ID'].str.len() == 0) & (QuickGO['TAXON ID'] == TaxID)
    QuickGO.loc[m, "TF ID"] = QuickGO[m]['SYMBOL'].apply(lambda symbol: symboltoID_entrez[TaxID].get(symbol, []))

m = ~(QuickGO['TF ID'].str.len() == 0)
print(f'Combined with Entrez, we retrieved {m.sum() / len(QuickGO):.1%} NCBI Gene IDs from the QuickGO symbols')
print(f"There's {(~m).sum()} NCBI Gene IDs that couldn't be retrieved")

79 out of the remaining 108 missing have been retrieved through Entrez.
Combined with Entrez, we retrieved 99.4% NCBI Gene IDs from the QuickGO symbols
There's 35 NCBI Gene IDs that couldn't be retrieved


## Get TFCheckpoint terms

In [31]:
## LOAD & PREPROCESS TFCHECKPOINT TSV
# Load TFCheckpoint dataset
TFCheckpoint_df = pd.read_csv(TFCheckpoint_path, sep='\t', header=0)
str_cols = ['Associated.Gene.Name', 'Synonyms', 'Official name', 'Entrez.Taxa.ID', 'Entrez.Gene.ID', 'UniProt.SwissProt.Accession', 'Ensembl.Gene.ID']
TFCheckpoint_df[str_cols] = TFCheckpoint_df[str_cols].astype(str)


# Split Entrez, Taxa & UniProt into individual IDs
TFCheckpoint_df['EntrezIDs'] = TFCheckpoint_df['Entrez.Gene.ID'].str.split('|')
TFCheckpoint_df['TaxaIDs'] = TFCheckpoint_df['Entrez.Taxa.ID'].str.split('|')
TFCheckpoint_df['UniProt'] = TFCheckpoint_df['UniProt.SwissProt.Accession'].str.split('|')
TFCheckpoint_df['Ensembl'] = TFCheckpoint_df['Ensembl.Gene.ID'].str.split('|')

# Explode the TF
TFCheckpoint_exploded = TFCheckpoint_df.explode(['EntrezIDs', 'TaxaIDs', 'UniProt', 'Ensembl'])
TFCheckpoint_exploded = TFCheckpoint_exploded[TFCheckpoint_exploded["EntrezIDs"] != ''] # Drop empty rows (Appeared when | was present at the end, e.g. "9454|3425|")
TFCheckpoint_exploded = TFCheckpoint_exploded[TFCheckpoint_exploded["UniProt"] != '']
TFCheckpoint_exploded = TFCheckpoint_exploded[TFCheckpoint_exploded["Ensembl"] != '']

# Check whether each EntrezID only matches to 1 TaxaID:
gene_taxa_unique = TFCheckpoint_exploded.drop_duplicates(subset=["EntrezIDs", "TaxaIDs"], keep='first')
gene_taxa_mismatch = gene_taxa_unique[gene_taxa_unique.duplicated(subset=["EntrezIDs"], keep=False)]
h4("EntrezIDs mapped to 2 species")
md(f"There are {len(gene_taxa_mismatch['EntrezIDs'].unique())} Entrez IDs that are mapped to both Rat and Mouse:")
display(HTML(gene_taxa_mismatch[["EntrezIDs", "Associated.Gene.Name", "TaxaIDs"]].sort_values(by=['EntrezIDs']).to_html(index=False)))

# DROP MISMATCHING ROWS
rows_to_drop = [
    ['STAT5A', '20851', '10116'],
    ['STAT5A', '25126', '10090'],
    ['ZFY', '367832', '10090'],
    ['ZFY', '22764', '10116'],
    ['STAT5B', '24918', '10116']
]
for row in rows_to_drop:
    to_drop = (TFCheckpoint_exploded["Associated.Gene.Name"] == row[0]) & (TFCheckpoint_exploded["EntrezIDs"] == row[1])
    assert to_drop.sum() == 1, f"{to_drop.sum()} rows are being dropped instead of 1"
    TFCheckpoint_exploded = TFCheckpoint_exploded[~to_drop]
to_change = (TFCheckpoint_exploded["Associated.Gene.Name"] == "STAT5A") & (TFCheckpoint_exploded["EntrezIDs"] == "24918")
assert to_change.sum() == 1, f"{to_change.sum()} rows are being dropped instead of 1"
TFCheckpoint_exploded.loc[to_change, "TaxaIDs"] = "10116"
md("They have been searched in the NCBI and corrected manually")

# Assert there's no duplicates anymore
gene_taxa_unique = TFCheckpoint_exploded.drop_duplicates(subset=["EntrezIDs", "TaxaIDs"], keep='first')
gene_taxa_mismatch = gene_taxa_unique[gene_taxa_unique.duplicated(subset=["EntrezIDs"], keep=False)]
assert len(gene_taxa_mismatch) == 0, f"There's still {len(gene_taxa_mismatch)} duplicated rows" 

<h4>EntrezIDs mapped to 2 species</h4>

There are 5 Entrez IDs that are mapped to both Rat and Mouse:

EntrezIDs,Associated.Gene.Name,TaxaIDs
20851,STAT5A,10116
20851,STAT5B,10090
22764,ZFX,10090
22764,ZFY,10116
24918,STAT5A,10090
24918,STAT5B,10116
25126,STAT5A,10090
25126,STAT5B,10116
367832,ZFX,10116
367832,ZFY,10090


They have been searched in the NCBI and corrected manually

In [32]:
# GROUP DUPLICATED ROWS & GET FINAL TFCHECKPOINT DATASET
# In some rows, EntrezID, TaxaID & Name are the same -> Only SwissProt changes. We will group those rows

# Remove all useless columns
columns_to_keep = TFCheckpoint_exploded.columns.tolist()
columns_to_remove = ['Entrez.Taxa.ID', 'Entrez.Gene.ID', 'UniProt.SwissProt.Accession', 'Ensembl.Gene.ID', 'UniProt', 'Ensembl']
for column in columns_to_remove:
    columns_to_keep.remove(column)

# Group duplicated rows, with a | in between for UniProt & Ensembl.
TFCheckpoint_exploded = TFCheckpoint_exploded.groupby(columns_to_keep, dropna=False).agg({
    "UniProt": lambda x: "|".join(x),
    "Ensembl": lambda x: "|".join(x)
}).reset_index()

# Display one example
mask = TFCheckpoint_exploded["UniProt"].str.contains("\|")
md(f"In {mask.sum()} TFs, one EntrezID is mapped to 2 different SwissProt Accession IDs. They have been joined by |. Example:")
display(HTML(TFCheckpoint_exploded[mask][:2][["Associated.Gene.Name", "Official name", "EntrezIDs", "TaxaIDs", "UniProt", "Ensembl"]].to_html(index=False)))

In 16 TFs, one EntrezID is mapped to 2 different SwissProt Accession IDs. They have been joined by |. Example:

Associated.Gene.Name,Official name,EntrezIDs,TaxaIDs,UniProt,Ensembl
ABL1,Tyrosine-protein kinase ABL1,100909750,10116,E9PT20|F1M0A6,ENSRNOG00000009371|ENSRNOG00000009371
CHCHD2,Coiled-coil-helix-coiled-coil-helix domain-containing protein 2,316643,10116,Q5BJB3|M0R785,ENSRNOG00000051180|ENSRNOG00000051180


In [33]:
TFCheckpoint_sets = {}
for TF_type in TF_types:
    mask = TFCheckpoint_exploded[TFCheckpoint_cols[TF_type]].notna().any(axis=1)  # Checks across the specified columns
    TFCheckpoint_sets[TF_type] = set(TFCheckpoint_exploded[mask]['EntrezIDs'])
    print(f"# {TF_type} NCBI IDs in TFCheckpoint: {len(TFCheckpoint_sets[TF_type]):>5}")

# dbTF NCBI IDs in TFCheckpoint:  4390
# coTF NCBI IDs in TFCheckpoint:  3598


## Get final TF set for the pipeline

In [34]:
# Get TF sets for each TF type
TF_IDs_dict = {}
for TF_type in TF_types:
    TF_IDs = set()

    # Get QuickGO TFs
    QuickGO_subset = QuickGO[QuickGO['TF type'] == TF_type]
    QuickGO_IDs = [j for i in list(QuickGO_subset['TF ID']) for j in i]
    TF_IDs.update(set(QuickGO_IDs))
        
    # Get TFCheckpoint TFs
    TF_IDs.update(TFCheckpoint_sets[TF_type])

    # Save into dictionary
    TF_IDs_dict[TF_type] = TF_IDs


# coTFs must not contain dbTFs.
TF_IDs_dict['coTF'].difference_update(TF_IDs_dict['dbTF'])

# Add the less likely coTFs as a subset of the coTFs
TF_IDs_dict['ll_coTF'] = ll_coTF.intersection(TF_IDs_dict['coTF'])

# Save each of them as a list
for TF_type in TF_types + ['ll_coTF']:
    path = get_TF_ids_path(TF_type, postprocessing_path)
    with open(path, 'w') as f:
        for TF in TF_IDs_dict[TF_type]:
            f.write(TF + "\n")

# Combine all TFs & save them as a list
all_TF_ids = TF_IDs_dict['dbTF'].union(TF_IDs_dict['coTF'])
print(f"We consider {len(all_TF_ids)} NCBI Gene IDs to be TFs")

with open(get_TF_ids_path('tf', postprocessing_path), 'w') as f:
    for TF in all_TF_ids:
        f.write(TF + "\n")

We consider 9195 NCBI Gene IDs to be TFs


## Create final TF table
Create TF table to use in the paper

In [35]:
# FUNCTIONS
import os
from Bio import Entrez
import difflib
Entrez.email = "example24@gmail.com"

def retrieve_annotations(id_list):
    """Annotates Entrez Gene IDs using Bio.Entrez, in particular epost (to
    submit the data to NCBI) and esummary to retrieve the information.
    Returns a list of dictionaries with the annotations."""

    request = Entrez.epost("gene", id=",".join(id_list))
    result = Entrez.read(request)
    webEnv = result["WebEnv"] # type: ignore
    queryKey = result["QueryKey"] # type: ignore
    data = Entrez.esummary(db="gene", webenv=webEnv, query_key=queryKey)
    annotations = Entrez.read(data)
    annotationsSummary = annotations['DocumentSummarySet']['DocumentSummary'] # type: ignore

    assert len(id_list) == len(annotationsSummary), f"id_list and annotationsSummary are of different length: {len(id_list)} != {len(annotationsSummary)}"

    return annotationsSummary

In [None]:
# Join all TF IDs into one dataframe
TFs_list = []
for TF in ['dbTF', 'coTF', 'll_coTF']:
    path = get_TF_ids_path(TF, postprocessing_path) 
    with open(path, 'r') as f:
        all_gene_IDs = f.read().splitlines()
        TFs_list.extend([(gene_id, TF) for gene_id in all_gene_IDs])
TFs_df = pd.DataFrame(TFs_list, columns=["Gene ID", "TF type"])

# Drop coTFs that are also ll_coTFs
coTFs_set = set(TFs_df[TFs_df['TF type'] == 'coTF']['Gene ID'])
TFs_df = TFs_df[~((TFs_df['TF type'] == 'coTF') & (TFs_df['Gene ID'].isin(ll_coTF)))]
assert len(TFs_df) == len(set(TFs_df['Gene ID'])), "There are duplicated Gene IDs in the TFs_df"

# Use eutils to map each gene ID to the gene symbol & TF type
annotationsSummary = retrieve_annotations(TFs_df['Gene ID'].tolist())
EntrezIDtoSymbol = {ID : {'Name': annotation["Name"], 'TaxID': annotation['Organism']['TaxID']} for ID, annotation in zip(TFs_df['Gene ID'].tolist(), annotationsSummary)}
TFs_df['Symbol'] = TFs_df['Gene ID'].map(lambda ID: EntrezIDtoSymbol[ID]['Name'])
TFs_df['TaxID'] = TFs_df['Gene ID'].map(lambda ID: EntrezIDtoSymbol[ID]['TaxID'])


# --- JOIN WITH QUICKGO ---
for GO_term in GO_dbTF + GO_coTF + ["GO:0006355"]:
    # Load the GO term table
    GO_table = pd.read_csv(in_data_path + f"QuickGO-{GO_term.replace(':', '')}.tsv", sep="\t", header=0, dtype='str')

    # Process the GO term table
    GO_table = (
        GO_table[['SYMBOL', 'TAXON ID', 'GO TERM']].drop_duplicates()
        .groupby(["SYMBOL", "TAXON ID"], as_index=False)
        .agg({"GO TERM": lambda x: ";".join(sorted(set(x.dropna().astype(str))))})
        .rename(columns={"GO TERM": GO_term})
    )

    # Merge with TFs_df
    TFs_df = TFs_df.merge(GO_table, left_on=['Symbol', 'TaxID'], right_on=['SYMBOL', 'TAXON ID'], how='left').drop(columns=["TAXON ID", "SYMBOL"])


# --- JOIN WITH TFCHECKPOINT ---
# Join together duplicated rows in TFCheckpoint
cols = [col for cols in TFCheckpoint_cols.values() for col in cols if (("GO:" not in col) & (col != 'TFclass.present.merged'))] + ['TFclass_human', 'TFclass_mouse', 'TFclass_rat'] # Only include relevant columns
print("COLS:", cols)
TFCheckpoint_agg = (
    TFCheckpoint_exploded[['EntrezIDs'] + cols]
    .groupby('EntrezIDs', as_index=False)
    .agg({
        c: (lambda x: ";".join(sorted(set(x.dropna().astype(str)))))
        for c in cols
    })
)

# Merge with TFCheckpoint
TFs_df = TFs_df.merge(TFCheckpoint_agg[cols + ['EntrezIDs']], left_on=['Gene ID'], right_on=['EntrezIDs'], how='left').drop(columns=["EntrezIDs"])

# Ensure sources only have values for the correct species
import numpy as np
TFs_df.loc[TFs_df['TaxID'] != '9606',  ['TFclass_human', 'animal_tfdb_Homo_sapiens_cofactors.present', 'tcof_cotf_human.present', 'lambert_2018.present', 'Lovering_2021.present']] = np.nan
TFs_df.loc[TFs_df['TaxID'] != '10090', ['TFclass_mouse', 'animal_tfdb_Mus_musculus_cofactors.present', 'tcof_cotf_mouse.present']] = np.nan
TFs_df.loc[TFs_df['TaxID'] != '10116', ['TFclass_rat',   'animal_tfdb_Rattus_norvegicus_cofactors.present']] = np.nan

# --- CREATE NEW CATEGORISATION
new_categorisation = {
    'dbTF': GO_dbTF + [col for col in TFCheckpoint_cols['dbTF'] if col not in ['TFclass.present.merged']] + ['TFclass_human', 'TFclass_mouse', 'TFclass_rat'],
    'coTF candidate': GO_coTF + TFCheckpoint_cols['coTF'] + ["GO:0006355"],
    'coTF': ["GO:0003712"]
}

TFs_df = TFs_df.replace("", np.nan) # Replace empty strings with NaN
TFs_df["updated TF type"] = '' # Default
# dbTF will overwrite coTF, which will overwrite coTF candidate
for tf_type in ['coTF candidate', 'coTF', 'dbTF']:
    cols = new_categorisation[tf_type]
    TFs_df.loc[TFs_df[cols].notna().any(axis=1), 'updated TF type'] = tf_type

# --- CLEAN UP & SAVE ---

# Create column if in ExTRI2 dataset
ExTRI2_df = pd.read_csv("../../results/ExTRI2_final_resource.tsv", sep="\t", dtype='str')
TFs_df["In ExTRI"] = TFs_df['Gene ID'].isin(ExTRI2_df['TF Id'])

# Add orthologs information
all_orthologs_df = pd.read_csv("../../data/postprocessing/tables/orthologs_final.tsv", sep="\t", dtype='str').set_index('Gene_ID')
TFs_df['human_gene_ID'] = TFs_df['Gene ID'].map(all_orthologs_df['unique_human_gene_ID'])
TFs_df['human_symbol']      = TFs_df['Gene ID'].map(all_orthologs_df['unique_human_gene_symbol'])
TFs_df['hgnc_id']           = TFs_df['Gene ID'].map(all_orthologs_df['unique_HGNC_ID'])

m_missing = (TFs_df['In ExTRI'] & ~TFs_df['Gene ID'].isin(all_orthologs_df.index))
print(f"{m_missing.sum()} TFs in ExTRI2 are not present in the orthologs table ({m_missing.sum() / TFs_df['In ExTRI'].sum():.1%})")


# Assertions
assert len(TFs_df) == len(set(TFs_df['Gene ID'])), "There are duplicated Gene IDs in the TFs_df"

# Sort rows
order = ["dbTF", "coTF", "coTF candidate", ""]
TFs_df["updated TF type"] = pd.Categorical(TFs_df["updated TF type"], categories=order, ordered=True)
TFs_df = TFs_df.sort_values(by=['In ExTRI', 'updated TF type', 'human_symbol', 'TaxID'], ascending=[False, True, True, True])

# Save the complete dataset
TFs_df.to_csv("../../analysis/tables/all_TFs.tsv", sep="\t", index=False)

# --- CREATE PAPER TF TABLES ---
# Save the 2 tables we'll include in the paper

md(f'We will discard {TFs_df[TFs_df["updated TF type"] == ""].shape[0]} TFs that are not classified into any TF type')
TFs_df[TFs_df['updated TF type'] != ''][['Gene ID', 'Symbol', 'TaxID']].to_csv("../../data/paper_tables/all_considered_TFs.tsv", sep="\t", index=False)

# Create table with only TFs in ExTRI2
TFs_in_ExTRI2 = (TFs_df
    .drop(columns=['TF type']).rename(columns={'updated TF type': 'TF type'})
    .loc[TFs_df['In ExTRI'] & (TFs_df['updated TF type'] != '')].drop(columns=['In ExTRI'])
)

# Create 'human_TF_type' column, which prioritises the human TF type, and otherwise, it follows the priority order:
priority = {'dbTF': 3, 'coTF': 2, 'coTF candidate': 1} # dbTF > coTF > coTF candidate

# Function to resolve TF type per human_gene_ID
def resolve_human_tf_type(df):
    # If only one unique TF type, keep it
    if df['TF type'].nunique() == 1:
        return df['TF type'].iloc[0]
    
    # Prefer the human ortholog (TaxID == 9606)
    human_rows = df[df['TaxID'].astype(str).str.contains('9606')]
    if not human_rows.empty:
        return human_rows['TF type'].iloc[0]
    
    # Otherwise, choose by priority
    tf_types = sorted(df['TF type'].unique(), key=lambda t: priority.get(t, 0), reverse=True)
    return tf_types[0]

# Create mapping from human_gene_ID → resolved TF type and add it to the main DataFrame
tf_type_map = (
    TFs_in_ExTRI2.groupby(['human_gene_ID'])[['TF type', 'TaxID']]
    .apply(resolve_human_tf_type).to_dict()
)
TFs_in_ExTRI2['human_TF_type'] = TFs_in_ExTRI2['human_gene_ID'].map(tf_type_map)


# Save the table
TFs_in_ExTRI2.to_csv("../../data/paper_tables/TFs_in_ExTRI2.tsv", sep="\t", index=False)

# --- SHOW STATS ---
pd.set_option('display.max_columns', None)
display(TFs_df.head(2))
display(TFs_df['updated TF type'].value_counts(dropna=False))

# Show updated TF types per GO term
summary = (
    TFs_df
    .melt(id_vars="updated TF type", value_vars=[c for c in TFs_df.columns if c.startswith("GO:")],
          var_name="GO term", value_name="present")
    .assign(present=lambda d: d["present"].notna())
    .query("present == True")
    .groupby(["GO term", "updated TF type"], observed=True)
    .size()
    .unstack(fill_value=0)       # columns = updated TF type
)
summary["Total"] = summary.sum(axis=1)
display(summary.sort_values("Total", ascending=False))

print("TFs in ExTRI2 vs human_TF_type:")
display(TFs_in_ExTRI2[['TF type', 'human_TF_type']].value_counts(dropna=False))


COLS: ['lambert_2018.present', 'Lovering_2021.present', 'animal_tfdb_Homo_sapiens_cofactors.present', 'animal_tfdb_Mus_musculus_cofactors.present', 'animal_tfdb_Rattus_norvegicus_cofactors.present', 'tcof_cotf_human.present', 'tcof_cotf_mouse.present', 'TFclass_human', 'TFclass_mouse', 'TFclass_rat']
17 TFs in ExTRI2 are not present in the orthologs table (0.3%)


We will discard 528 TFs that are not classified into any TF type

Unnamed: 0,Gene ID,TF type,Symbol,TaxID,GO:0003700,GO:0003712,GO:0001098,GO:0002039,GO:0008134,GO:0042393,GO:0046332,GO:0006325,GO:0140993,GO:0006355,lambert_2018.present,Lovering_2021.present,animal_tfdb_Homo_sapiens_cofactors.present,animal_tfdb_Mus_musculus_cofactors.present,animal_tfdb_Rattus_norvegicus_cofactors.present,tcof_cotf_human.present,tcof_cotf_mouse.present,TFclass_human,TFclass_mouse,TFclass_rat,updated TF type,In ExTRI,human_gene_ID,human_symbol,hgnc_id
2221,54608,dbTF,Abhd2,10090,GO:0003707,,,,,,,,,GO:0003707,,,,,,,,,,,dbTF,True,11057,ABHD2,HGNC:18717
285,11057,dbTF,ABHD2,9606,GO:0003707,,,,,,,,,GO:0003707,,,,,,,,,,,dbTF,True,11057,ABHD2,HGNC:18717


updated TF type
dbTF              4161
coTF candidate    3573
coTF               933
                   528
Name: count, dtype: int64

updated TF type,dbTF,coTF,coTF candidate,Total
GO term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GO:0006355,3062,914,1657,5633
GO:0003700,2646,0,0,2646
GO:0006325,349,271,1168,1788
GO:0008134,609,327,484,1420
GO:0003712,209,933,0,1142
GO:0140993,88,113,268,469
GO:0042393,42,54,269,365
GO:0046332,74,30,92,196
GO:0002039,20,41,92,153
GO:0001098,28,19,95,142


TFs in ExTRI2 vs human_TF_type:


TF type         human_TF_type 
dbTF            dbTF              2652
coTF candidate  coTF candidate    2190
coTF            coTF               727
coTF candidate  coTF               111
                dbTF                60
                NaN                 39
dbTF            NaN                 27
coTF            dbTF                15
                coTF candidate      10
                NaN                  9
dbTF            coTF                 6
                coTF candidate       5
Name: count, dtype: int64

In [37]:
print("TFs without updated TF type:", (TFs_df['updated TF type'] == '').sum())
print("TFs in ExTRI2 without updated TF type:  ", (TFs_df[TFs_df['In ExTRI']]['updated TF type'] == '').sum())
m = TFs_df['In ExTRI'] & (TFs_df['updated TF type'] == '')
print("ExTRI2 sentences with TFs without updated TF type:", ExTRI2_df['TF Id'].isin(TFs_df[m]['Gene ID']).sum(), (ExTRI2_df['TF Id'].isin(TFs_df[m]['Gene ID'])).sum() / len(ExTRI2_df))

TFs without updated TF type: 528
TFs in ExTRI2 without updated TF type:   149
ExTRI2 sentences with TFs without updated TF type: 3372 0.004035317404280883


In [38]:
# Check how many GO:0006355 we retrieve
GO_term = "GO:0006355"

# Process the GO term table
GO_table = pd.read_csv(in_data_path + f"QuickGO-{GO_term.replace(':', '')}.tsv", sep="\t", header=0, dtype='str')

GO_table = (
    GO_table[['SYMBOL', 'TAXON ID', 'GO TERM']].drop_duplicates()
    .groupby(["SYMBOL", "TAXON ID"], as_index=False)
    .agg({"GO TERM": lambda x: ";".join(sorted(set(x.dropna().astype(str))))})
    .rename(columns={"GO TERM": GO_term})
)

print(f"Unique {GO_term} entries: {GO_table.shape[0]}")
print(f"Number considered: {TFs_df[GO_term].notna().sum()}")

Unique GO:0006355 entries: 7338
Number considered: 5633


TF type         human_TF_type 
dbTF            dbTF              2652
coTF candidate  coTF candidate    2190
coTF            coTF               727
coTF candidate  coTF               111
                dbTF                60
                NaN                 39
dbTF            NaN                 27
coTF            dbTF                15
                coTF candidate      10
                NaN                  9
dbTF            coTF                 6
                coTF candidate       5
Name: count, dtype: int64

In [81]:
m = final_TFs_grouped['TF type'].isin(['coTF', 'dbTF', 'coTF candidate']) | (final_TFs_grouped['human_symbol'] == 'None')
print(f"Rows in ExTRI2 affected: {ExTRI2_df['human_symbol'].isin(final_TFs_grouped[~m]['human_symbol']).sum()} ({ExTRI2_df['human_symbol'].isin(final_TFs_grouped[~m]['human_symbol']).mean():.2%})")
display(final_TFs_grouped[~m][['TF type']].value_counts())
display(final_TFs_grouped[~m])

# TODO - Create in "TFs_in_ExTRI2" an extra column where each TF_human_symbol is mapped to only one TF_type.
# Clashes in a few sentences. We will exclude them from CollecTRI.

KeyError: 'human_symbol'