In [2]:
# IMPORTS
import pandas as pd
import numpy as np
import os
import json
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib_venn import venn3
import matplotlib.colors as mcolors

import upsetplot
import warnings
import re
from analysis import *

from Bio import Entrez
# *Always* tell NCBI who you are
Entrez.email = "example24@gmail.com"

## Custom functions
import sys

sys.path.append('../scripts/common/')
from notebook_utils import h3, h4, h5, md
from analysis import load_config
pd.set_option('display.max_colwidth', 20)

In [6]:
# LOAD DATA
config = load_config()

ExTRI2_df = pd.read_csv(config['paper_ExTRI2_p'], sep="\t", dtype=str, keep_default_na=False)
collectri = pd.read_csv("../data/paper_tables/CollecTRI2.tsv.gz", sep='\t', dtype="str", skiprows=1)
collectri_regulome = pd.read_csv("../data/paper_tables/CollecTRI2-regulome.tsv.gz", sep='\t', dtype="str", skiprows=1)
all_TFs_df = pd.read_csv("tables/all_TFs.tsv", sep='\t', dtype="str")
all_orthologs_df = pd.read_csv('../data/postprocessing/tables/orthologs_final.tsv', sep="\t", dtype='str').set_index('Gene_ID')

In [3]:
# TODO - For the final version ensure ExTRI2 & CollecTRI[ExTRI] have the same content
ExTRI2_TRIs = ExTRI2_df[['TF_human_symbol', 'TG_human_symbol']].drop_duplicates()
ExTRI2_collectri = collectri[collectri['[ExTRI2] present'].notna()]

print(len(ExTRI2_TRIs), len(ExTRI2_collectri))

225927 222387


In [4]:
# TODO - Remove this. It is just to show why the 573 ID is no longer in ExTRI2:
# "updated_TF_type" is NaN, as it has no source, so we remove it
all_TFs_df[all_TFs_df['Gene ID'] == '573']
# TODO - I remember at some point I had 1000 less dbTFs than before. Recheck why. Ensure I didn't lose a lot of TFs in the process. 
# But ExTRI2 did not lose many sentences so... idk

Unnamed: 0,Gene ID,TF type,Symbol,TaxID,GO:0003700,GO:0003712,GO:0001098,GO:0002039,GO:0008134,GO:0042393,...,tcof_cotf_human.present,tcof_cotf_mouse.present,TFclass_human,TFclass_mouse,TFclass_rat,updated TF type,In ExTRI,human_gene_ID,human_symbol,hgnc_id
6177,573,coTF,BAG1,9606,,,,,,,...,,,,,,,True,573,BAG1,HGNC:937


In [None]:
# TODO - Something weird is happening with the AP1/NFKB families in collectri. I did this to check.
# Remove once the problem has been identified :)
weird_TFs = ['NFKB2', 'RELB', 'RELA', 'NFKB1', 'JUN', 'NFKB', 'AP1']

# --- Unique TF–TG pairs in ExTRI2 ---
unique_pairs = ExTRI2_df[['TF_human_symbol', 'TG_human_symbol']].drop_duplicates()

# Count how many unique pairs each TF appears in
extri_counts = (
    unique_pairs['TF_human_symbol']
    .value_counts()
    .reindex(weird_TFs, fill_value=0)
    .rename('TRIs In ExTRI2')
)

# --- Collectri counts (already unique TF–TG pairs) ---
collectri_counts = (
    collectri['Transcription Factor (Associated Gene Name)']
    .value_counts()
    .reindex(weird_TFs, fill_value=0)
    .rename('TRIs in collecTRI')
)

# --- Combine into one summary table ---
weird_summary = pd.concat([extri_counts, collectri_counts], axis=1).astype(int)
weird_summary.index.name = 'TF'
display(weird_summary.reset_index(drop=False).style.hide(axis="index"))

TF,TRIs in ExTRI,TRIs in collecTRI
NFKB2,147,166
RELB,186,194
RELA,1055,1190
NFKB1,385,615
JUN,928,1101
NFKB,2747,0
AP1,1218,0


In [6]:
# TODO - collecTRI's extri TF type is the rodent one, not the human one.
# Is it worth for me to fix it? Is it a useful field, really?
m = collectri['[ExTRI2] Transcription Factor Type'].str.contains(';', na=False)
display(collectri[m].head())
collectri['[ExTRI2] Transcription Factor Type'].value_counts(dropna=False)

Unnamed: 0,#TF:TG,Transcription Factor (Associated Gene Name),Target Gene (Associated Gene Name),[ExTRI2] present,[ExTRI2] PMID,[ExTRI2] Sign,[ExTRI2] Transcription Factor Type,[HTRI] present,[HTRI] Technique,[HTRI] PMID,...,[DoRothEA_A] PMID,[DoRothEA_A] Directed,[DoRothEA_A] Effect,Lambert,Lovering,GO:0003700,GO:0140223,GO:0003712,TFClass,Auto-regulation
127,MAPK3:BGLAP,MAPK3,BGLAP,ExTRI2,25017640|2937262...,UP|UP|UP|UNKNOWN...,coTF candidate;c...,,,,...,,,,,,,,,,
128,MAPK1:BGLAP,MAPK1,BGLAP,ExTRI2,25017640|2937262...,UP|UP|UP|UP|UNKN...,coTF candidate;c...,,,,...,,,,,,,,,,
152,MAPK1:BAX,MAPK1,BAX,ExTRI2,11761456|2965252...,UNKNOWN|UNKNOWN|...,coTF candidate;c...,,,,...,,,,,,,,,,
183,MAPK3:BMP2,MAPK3,BMP2,ExTRI2,25017640|3183177...,UP|UP|UP|UP|UNKN...,coTF candidate;c...,,,,...,,,,,,,,,,
184,MAPK1:BMP2,MAPK1,BMP2,ExTRI2,25017640|3183177...,UP|UP|UP|UP|UP|U...,coTF candidate;c...,,,,...,,,,,,,,,,


[ExTRI2] Transcription Factor Type
dbTF                                   104659
coTF candidate                          80132
coTF                                    28189
NaN                                     11635
coTF candidate;coTF candidate            3436
                                        ...  
coTF candidate;-;-;coTF candidate;-         2
coTF;coTF candidate;dbTF                    2
dbTF;dbTF;coTF candidate                    2
-;coTF candidate;coTF                       1
coTF;dbTF;coTF candidate                    1
Name: count, Length: 89, dtype: int64

In [7]:
# SOLVED - THERE AREHUMAN GENES WITHOUT UPPERCASE
def load_all_human_genes():
    # Load all human TGs
    # downloaded from: https://www.ncbi.nlm.nih.gov/datasets/gene/taxon/9606/
    all_human_genes = pd.read_csv(config['all_human_TGs_p'], sep='\t', header=0, dtype=str)
    all_human_genes = all_human_genes[['NCBI GeneID', 'Symbol', 'Gene Type', 'Taxonomic Name', 'Description']]
    all_human_genes.loc[all_human_genes['Gene Type'].isna(), 'Gene Type'] = 'nan'

    # Add AP1 and NFKB complexes
    all_human_genes = pd.concat([all_human_genes, pd.DataFrame({
        'NCBI GeneID': ['Complex:AP1', 'Complex:NFKB'], 
        'Symbol': ['AP1', 'NFKB'],
        'Description': ['', ''],
        'Taxonomic Name': ['Homo sapiens', 'Homo sapiens'],
        'Gene Type': ['PROTEIN_CODING', 'PROTEIN_CODING'],
        })], ignore_index=True)
    
    # Ensure the data is expected: all human, no duplicate IDs
    assert all_human_genes['Taxonomic Name'].nunique() == 1
    assert all_human_genes['Taxonomic Name'].unique()[0] == 'Homo sapiens'
    assert (all_human_genes['NCBI GeneID'].duplicated()).sum() == 0 

    return all_human_genes
all_human_genes = load_all_human_genes()
tf_col = 'Transcription Factor (Associated Gene Name)'

# Show human genes with non-uppercase symbols
m_lowercase = all_human_genes['Symbol'] != all_human_genes['Symbol'].str.upper()
m_orfs = m_lowercase & all_human_genes['Symbol'].str.contains('orf')

print(f"There are {m_lowercase.sum()} human genes with non-uppercase symbols. "
      f"\nOf these, {m_orfs.sum()} are ORFs (e.g., {all_human_genes[m_orfs]['Symbol'][:3].tolist()})."
      "\nNon-ORFs:")
display(all_human_genes[m_lowercase & ~m_orfs])

# Show if there are others in collecTRI: it is only the orf one!
m = (collectri[tf_col].str.upper() != collectri[tf_col])
print("In CollecTRI, we only have orfs")
display(collectri[m][tf_col].value_counts(dropna=False))

There are 280 human genes with non-uppercase symbols. 
Of these, 277 are ORFs (e.g., ['C9orf72', 'C19orf12', 'C19orf33']).
Non-ORFs:


Unnamed: 0,NCBI GeneID,Symbol,Gene Type,Taxonomic Name,Description
183794,222029,DKFZp434L192,ncRNA,Homo sapiens,uncharacterized ...
184134,259265,bA255A11.4,PSEUDO,Homo sapiens,melanoma antigen...
187485,401282,DKFZp451B082,ncRNA,Homo sapiens,uncharacterized ...


In CollecTRI, we only have orfs


Transcription Factor (Associated Gene Name)
NaN        155
C6orf89      2
Name: count, dtype: int64

In [None]:
# SOLVED - HOW TO ACCUMULATE VALUES FROM A BIG FILE
from collections import Counter

print("ExTRI2 mv")
d = {'Non valid': 0, 'Valid': 0}
for chunk in pd.read_csv('../../ExTRI2_final_resource.mv.tsv.gz', sep='\t', header=1, chunksize=1000000, keep_default_na=False):
    for v in d:
        d[v] += chunk[chunk['Valid'] == v].shape[0]
print(d)

print("Raw ExTRI2 mv")
d = {'Non valid': 0, 'Valid': 0}
for chunk in pd.read_csv('../results/ExTRI2.tsv', sep='\t', header=1, chunksize=1000000, keep_default_na=False):
    for v in d:
        d[v] += chunk[chunk['Valid'] == v].shape[0]
print(d)

ExTRI2 mv
{'Non valid': 21320730, 'Valid': 1149192}
Raw ExTRI2 mv
{'Non valid': 21320730, 'Valid': 1149192}


: 

In [None]:
# COMPARISON OF DIFFERENT VERSIONS OF EXTRI
old_ExTRI2_df = pd.read_csv('../../ExTRI2-06be616/results/ExTRI2_final_resource.tsv', sep="\t", dtype=str, keep_default_na=False)
results_ExTRI2_df = pd.read_csv('../results/ExTRI2.tsv', sep="\t", dtype=str, keep_default_na=False)

m = (
    (ExTRI2_df['TF_human_symbol'] == 'None') | (ExTRI2_df['TG_human_symbol'] == 'None') |
    (ExTRI2_df['TF_human_symbol'] == '') | (ExTRI2_df['TG_human_symbol'] == '') |
    (ExTRI2_df['TF_human_symbol'].str.contains(';')) | (ExTRI2_df['TG_human_symbol'].str.contains(';'))
)
print('paper ExTRI', len(ExTRI2_df), len(ExTRI2_df[~m]), len(ExTRI2_df[~m]['TF_human_symbol'].unique()))

m = (
    (results_ExTRI2_df['TF_human_symbol'] == 'None') | (results_ExTRI2_df['TG_human_symbol'] == 'None') |
    (results_ExTRI2_df['TF_human_symbol'] == '') | (results_ExTRI2_df['TG_human_symbol'] == '') |
    (results_ExTRI2_df['TF_human_symbol'].str.contains(';')) | (results_ExTRI2_df['TG_human_symbol'].str.contains(';'))
)
print('results ExTRI', len(results_ExTRI2_df), len(results_ExTRI2_df[~m]), len(results_ExTRI2_df[~m]['TF_human_symbol'].unique()))

m = (
    (old_ExTRI2_df['TF_human_symbol'] == 'None') | (old_ExTRI2_df['TG_human_symbol'] == 'None') |
    (old_ExTRI2_df['TF_human_symbol'] == '') | (old_ExTRI2_df['TG_human_symbol'] == '') |
    (old_ExTRI2_df['TF_human_symbol'].str.contains(';')) | (old_ExTRI2_df['TG_human_symbol'].str.contains(';'))
)
print('old ExTRI', len(old_ExTRI2_df), len(old_ExTRI2_df[~m]), len(old_ExTRI2_df[~m]['TF_human_symbol'].unique()))


In [None]:
# TODO - Ensure all these things print things that make sense
# TFs checks (they were in get_NCBI_TF_IDs.ipynb)
geneIDs_in_ExTRI2 = {id for col in ['TF Id', 'TG Id'] for ids in ExTRI2_df[col].unique() for id in ids.split(';')}
all_TFs_df["In ExTRI2"] = all_TFs_df['Gene ID'].isin(geneIDs_in_ExTRI2)

print("TFs without updated TF type:", (all_TFs_df['updated TF type'] == '').sum())
print("TFs in ExTRI2 without updated TF type:  ", (all_TFs_df[all_TFs_df['In ExTRI2']]['updated TF type'] == '').sum())
m = all_TFs_df['In ExTRI2'] & (all_TFs_df['updated TF type'] == '')
print("ExTRI2 sentences with TFs without updated TF type:", ExTRI2_df['TF Id'].isin(all_TFs_df[m]['Gene ID']).sum(), (ExTRI2_df['TF Id'].isin(all_TFs_df[m]['Gene ID'])).sum() / len(ExTRI2_df))

m_missing = (all_TFs_df['In ExTRI2'] & ~all_TFs_df['Gene ID'].isin(all_orthologs_df.index))
m_missing_human = m_missing & (all_TFs_df['TaxID'] == '9606')
print(f"{m_missing.sum()} TFs in ExTRI2 are not present in the orthologs table ({m_missing.sum() / all_TFs_df['In ExTRI2'].sum():.1%}), {(m_missing_human).sum()} human ones")


TFs without updated TF type: 0
TFs in ExTRI2 without updated TF type:   0
ExTRI2 sentences with TFs without updated TF type: 0 0.0
18 TFs in ExTRI2 are not present in the orthologs table (0.3%), 4 human ones
