# Tests

## Setup

In [1]:
# IMPORTS
import pandas as pd
import numpy as np
import os
import json
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib_venn import venn3
import matplotlib.colors as mcolors

import upsetplot
import warnings
import re
# NOTE(review): the wildcard import hides which names come from `analysis`, and it
# runs BEFORE '../scripts/common/' is appended to sys.path below — confirm that
# `analysis` is importable from the working directory on a fresh kernel.
from analysis import *

from Bio import Entrez
# *Always* tell NCBI who you are
Entrez.email = "example24@gmail.com"

## Custom functions
import sys

# Extend the module search path so the imports below can be resolved
# (presumably notebook_utils lives in this folder — TODO confirm).
sys.path.append('../scripts/common/')
from notebook_utils import h3, h4, h5, md
# load_config is also imported explicitly; it is called when the data is loaded.
from analysis import load_config
# Limit the displayed width of each DataFrame cell to 20 characters.
pd.set_option('display.max_colwidth', 20)

In [2]:
# LOAD DATA
config = load_config()

# Every table is tab-separated and read with all columns as strings.
tsv_opts = dict(sep='\t', dtype=str)
# The CollecTRI2 files are read the same way, but their first line is skipped.
collectri_opts = dict(tsv_opts, skiprows=1)

all_TFs_df = pd.read_csv("tables/all_TFs.tsv", **tsv_opts)

# Paper data
# TODO - `keep_default_na` is no longer passed to these calls; the reason it was
#        originally set is not remembered. Make sure removing it causes no problems.
tfs_in_extri2 = pd.read_csv(config['TFs_in_ExTRI2_p'], **tsv_opts)
ExTRI2_df = pd.read_csv(config['paper_ExTRI2_p'], **tsv_opts)
orthologs_df = pd.read_csv(config['paper_orthologs_p'], **tsv_opts)
collectri2 = pd.read_csv(config['collectri2_p'], **collectri_opts)
collectri2_regulome = pd.read_csv(config['paper_tables_p'] + 'regulome.tsv.gz', **collectri_opts)


## Assertions

In [76]:
# Ensure that all manually retrieved orthologs are both in orthologs_df & tfs_in_extri2
mr_TF_orthologs  = pd.read_csv(config['manually_retrieved_TF_orthologs_p'], sep='\t', dtype=str)
# Gene IDs of the manually retrieved TFs that were assigned a human ortholog
mr_TF_orthologs_geneIDs = mr_TF_orthologs[~mr_TF_orthologs['human_gene_ID'].isna()]['Gene_ID'].tolist()

# Rows of each table that correspond to the manually retrieved Gene IDs
mr_rows_in_orthologs = orthologs_df[orthologs_df.index.isin(mr_TF_orthologs_geneIDs)]
mr_rows_in_tfs = tfs_in_extri2[tfs_in_extri2['Gene ID'].isin(mr_TF_orthologs_geneIDs)]

# Guard against vacuous passes: `.isna().sum() == 0` is trivially true on an empty
# selection, so a lookup that matches nothing would silently "pass" the checks below.
# NOTE(review): orthologs_df is matched on its *index*; if it was loaded without the
# Gene ID as index, nothing can match and the first guard fires — confirm the intended key.
if mr_TF_orthologs_geneIDs:
    assert not mr_rows_in_orthologs.empty, (
        "None of the manually retrieved Gene_IDs were found in orthologs_df.index: the ortholog check would be vacuous"
    )
    assert not mr_rows_in_tfs.empty, (
        "None of the manually retrieved Gene_IDs were found in tfs_in_extri2['Gene ID']: the ortholog check would be vacuous"
    )

assert mr_rows_in_orthologs['unique_human_gene_ID'].isna().sum() == 0, "Some manually added orthologs are missing from orthologs_df"
assert mr_rows_in_tfs['human_gene_ID'].isna().sum() == 0, "Some manually added orthologs are missing from tfs_in_extri2"

In [77]:
# COMPARE DRIVE & REPO TABLES

# Folder with the supplementary tables taken from Drive, and the repo folder with their counterparts
drive_folder = '../../2025_28_12_Drive_supp_tables/'
repo_folder = '../data/paper_tables/'

# (file name inside drive_folder, file name inside repo_folder)
table_file_names = [
    ('ExTRI2_final_resource.tsv.gz', 'ExTRI2_final_resource.tsv.gz'),
    ('Table_S2_all_considered_tfs.xlsx', 'all_considered_TFs.tsv'),
    ('Table_S10_all_validated_sentences.xlsx', 'validated_sentences.tsv'),
    ('Table_S7_TFs_in_ExTRI2.xlsx', 'TFs_in_ExTRI2.tsv'),
    ('Table_S16_discarded_sents.tsv', 'discarded_sents.tsv'),
    ('Table_S19_orthologs.xlsx', 'orthologs_final.tsv'),
]

# Pairs of full paths: (drive file, repo file)
comparison_tables = [
    (os.path.join(drive_folder, drive_name), os.path.join(repo_folder, repo_name))
    for drive_name, repo_name in table_file_names
]

def normalize_df(df):
    '''Remove trailing/leading spaces from string columns for comparison'''
    return (df.map(lambda x: str(x).strip() if isinstance(x, str) else x))

def load_file(file_path):
    """Load a table as a DataFrame of strings, choosing the reader from the file extension.

    file_path: str or os.PathLike (e.g. pathlib.Path).
        '.tsv' / '.tsv.gz' -> pd.read_csv with tab separator
        '.xlsx'            -> pd.read_excel
    Returns: pd.DataFrame read with dtype=str.
    Raises: ValueError if the extension is none of the above.
    """
    # Accept path objects as well as plain strings (Path has no `.endswith`)
    path = os.fspath(file_path)

    if path.endswith(('.tsv.gz', '.tsv')):
        return pd.read_csv(path, sep='\t', dtype=str)
    if path.endswith('.xlsx'):
        return pd.read_excel(path, dtype=str)
    raise ValueError(f"Unsupported file format: {file_path}")

# For each (drive, repo) pair report the weakest transformation under which the two
# tables are equal: identical -> whitespace-only -> row-order-only -> real differences.
for drive_file, repo_file in comparison_tables:
    drive_df = load_file(drive_file)
    repo_df = load_file(repo_file)

    if drive_df.equals(repo_df):
        print(f"Files are identical: {drive_file} and {repo_file}.")

    elif normalize_df(drive_df).equals(normalize_df(repo_df)):
        print(f"Files differ only in whitespace: {drive_file} and {repo_file}.")

    else:
        # Normalise and sort by ALL columns: sorting on the first column alone leaves the
        # relative order of rows that tie on it dependent on the input order, which can
        # report spurious differences between two files holding the same rows.
        norm_drive_df = normalize_df(drive_df).sort_values(by=list(drive_df.columns)).reset_index(drop=True)
        norm_repo_df = normalize_df(repo_df).sort_values(by=list(repo_df.columns)).reset_index(drop=True)

        if norm_drive_df.equals(norm_repo_df):
            print(f"Files differ only in row order: {drive_file} and {repo_file}.")

        else:
            print(f"Differences found between {drive_file} and {repo_file}:")
            try:
                comp = norm_drive_df.compare(norm_repo_df)
            except ValueError:
                # DataFrame.compare raises ValueError when the frames are not identically
                # labelled (different rows/columns). Any other error is a real bug and propagates.
                print("DataFrames have different shapes, cannot compare directly.")
                print(f"  shapes: {norm_drive_df.shape} vs {norm_repo_df.shape}")
            else:
                display(comp.head())


Files are identical: ../../2025_28_12_Drive_supp_tables/ExTRI2_final_resource.tsv.gz and ../data/paper_tables/ExTRI2_final_resource.tsv.gz.
Differences found between ../../2025_28_12_Drive_supp_tables/Table_S2_all_considered_tfs.xlsx and ../data/paper_tables/all_considered_TFs.tsv:


Unnamed: 0_level_0,Symbol,Symbol
Unnamed: 0_level_1,self,other
4990,Cir1,Cirsr


Files differ only in whitespace: ../../2025_28_12_Drive_supp_tables/Table_S10_all_validated_sentences.xlsx and ../data/paper_tables/validated_sentences.tsv.
Files are identical: ../../2025_28_12_Drive_supp_tables/Table_S7_TFs_in_ExTRI2.xlsx and ../data/paper_tables/TFs_in_ExTRI2.tsv.
Files are identical: ../../2025_28_12_Drive_supp_tables/Table_S16_discarded_sents.tsv and ../data/paper_tables/discarded_sents.tsv.
Files are identical: ../../2025_28_12_Drive_supp_tables/Table_S19_orthologs.xlsx and ../data/paper_tables/orthologs_final.tsv.


## Check ExTRI2

In [None]:
# SHOW NUMBER OF NaN ORTHOLOGS IN EXTRI2
# The repo copy of the ExTRI2 resource is the second element of the first comparison pair
extri2_df = load_file(comparison_tables[0][1])
extri2_dbtf_df = extri2_df.loc[extri2_df['human_TF_type'] == 'dbTF']

# Check NaNs in new unique human ortholog columns
print("NaN counts in ExTRI2 unique human ortholog columns:")

cols = ['TF_human_Id', 'TF_human_symbol', 'TG_human_Id', 'TG_human_symbol']
unique_cols = [f'unique_{c}' for c in cols]

# Summary column label -> (frame to inspect, columns whose NaNs are counted).
# Covers both the whole of ExTRI2 and its dbTF subset; dict order is the column order.
nan_sources = {
    'total': (extri2_df, cols),
    'unique': (extri2_df, unique_cols),
    'dbTF_total': (extri2_dbtf_df, cols),
    'dbTF_unique': (extri2_dbtf_df, unique_cols),
}
nan_summary = pd.DataFrame(
    {
        label: [frame[name].isna().sum() for name in names]
        for label, (frame, names) in nan_sources.items()
    },
    index=cols,
)
display(nan_summary)


NaN counts in ExTRI2 unique human ortholog columns:


Unnamed: 0,total,unique,dbTF_total,dbTF_unique
TF_human_Id,68,16580,0,0
TF_human_symbol,68,16580,0,0
TG_human_Id,9718,17869,5988,9100
TG_human_symbol,9718,17869,5988,9100


## Check CollecTRI

In [5]:
# LOAD & DISPLAY COLLECTRI_EXTRI & COLLECTRI_NTNU SUBSETS
tf_col = 'Transcription Factor (Associated Gene Name)'
tg_col = 'Target Gene (Associated Gene Name)'

# Rows of CollecTRI2 flagged as present in ExTRI2, keeping only the TF/TG names and the ExTRI2 columns
extri2_columns = [tf_col, tg_col] + [c for c in collectri2.columns if 'ExTRI2' in c]
in_extri2 = collectri2['[ExTRI2] present'].notna()
collectri2_extri2 = collectri2.loc[in_extri2, extri2_columns]
display(collectri2_extri2.head(1))

Unnamed: 0,Transcription Factor (Associated Gene Name),Target Gene (Associated Gene Name),[ExTRI2] present,[ExTRI2] PMID,[ExTRI2] Sign,[ExTRI2] Transcription Factor Type
0,CHCHD2,GNPTG,ExTRI2,35388756,UNKNOWN,coTF candidate


In [9]:
# CHECK THE NTNU SUBSET OF COLLECTRI2
in_ntnu = collectri2['[NTNU Curated] present'].notna()
collectri2_ntnu = collectri2.loc[in_ntnu]

# First the NTNU-specific columns only, then the full row, then every column name
ntnu_columns = [tf_col, tg_col] + [c for c in collectri2.columns if 'NTNU' in c]
display(collectri2_ntnu[ntnu_columns].head(1))
display(collectri2_ntnu.head(1))
print(collectri2.columns.tolist())
# TODO - At the end, we only give the PMID, not the sentence where it was found (right?)


Unnamed: 0,Transcription Factor (Associated Gene Name),Target Gene (Associated Gene Name),[NTNU Curated] present,[NTNU Curated] Sign,[NTNU Curated] PMID
2,MLXIPL,TXNIP,NTNU Curated,+|+||||||||,19411249|2238252...


Unnamed: 0,#TF:TG,Transcription Factor (Associated Gene Name),Target Gene (Associated Gene Name),[ExTRI2] present,[ExTRI2] PMID,[ExTRI2] Sign,[ExTRI2] Transcription Factor Type,[HTRI] present,[HTRI] Technique,[HTRI] PMID,...,[DoRothEA_A] PMID,[DoRothEA_A] Directed,[DoRothEA_A] Effect,Lambert,Lovering,GO:0003700,GO:0140223,GO:0003712,TFClass,Auto-regulation
2,MLXIPL:TXNIP,MLXIPL,TXNIP,ExTRI2,26808438|3660334...,UNKNOWN|UP|UP|UN...,dbTF,,,,...,,,,Lambert,Lovering,GO:0003700,,,TFClass,


['#TF:TG', 'Transcription Factor (Associated Gene Name)', 'Target Gene (Associated Gene Name)', '[ExTRI2] present', '[ExTRI2] PMID', '[ExTRI2] Sign', '[ExTRI2] Transcription Factor Type', '[HTRI] present', '[HTRI] Technique', '[HTRI] PMID', '[HTRI] Confidence', '[TRRUST] present', '[TRRUST] Regulation', '[TRRUST] PMID', '[TFactS] present', '[TFactS] Sign', '[TFactS] Species', '[TFactS] Source', '[TFactS] PMID', '[TFactS] Confidence', '[GOA] present', '[GOA] Sign', '[GOA] PMID', '[IntAct] present', '[IntAct] PMID', '[IntAct] Method ID', '[SIGNOR] present', '[SIGNOR] Effect', '[SIGNOR] Sign', '[SIGNOR] PMID', '[CytReg] present', '[CytReg] Assay type', '[CytReg] species', '[CytReg] Activation/Repression', '[CytReg] PMID', '[CytReg] Year of publication', '[GEREDB] present', '[GEREDB] Effect', '[GEREDB] PMID', '[NTNU Curated] present', '[NTNU Curated] Sign', '[NTNU Curated] PMID', '[Pavlidis2021] present', '[Pavlidis2021] PMID', '[Pavlidis2021] MoR', '[DoRothEA_A] present', '[DoRothEA_A] PM

In [63]:
# COMPARE COLLECTRI2 & EXTRI2
# TODO - For the final version ensure ExTRI2 & CollecTRI[ExTRI] have the same content
tf_col, tg_col = 'Transcription Factor (Associated Gene Name)', 'Target Gene (Associated Gene Name)'

# ExTRI2 column -> name of the corresponding CollecTRI2 column
extri2_to_collectri2_names = {
    'unique_TF_human_symbol': tf_col,
    'unique_TG_human_symbol': tg_col,
    'human_TF_type': '[ExTRI2] Transcription Factor Type',
    'PMID': '[ExTRI2] PMID',
    'MoR': '[ExTRI2] Sign',
}

# Keep only the ExTRI2 rows where both the TF and the TG have a unique human symbol,
# then express them with CollecTRI2's column names
has_both_symbols = ExTRI2_df[['unique_TF_human_symbol', 'unique_TG_human_symbol']].notna().all(axis=1)
ExTRI2_collectri2 = (
    ExTRI2_df
    .loc[has_both_symbols, list(extri2_to_collectri2_names)]
    .rename(columns=extri2_to_collectri2_names)
)

# Outer-join the TF-TG pairs of both resources; `_merge` records where each pair was found
extri2_pairs = ExTRI2_collectri2[[tf_col, tg_col]].drop_duplicates()
collectri2_pairs = collectri2_extri2[[tf_col, tg_col]]
merged = extri2_pairs.merge(collectri2_pairs, on=[tf_col, tg_col], how='outer', indicator=True)

def display_side_by_side(*tables) -> None:
    """
    Render several tables next to each other in the notebook output.

    tables: (data, name) tuples. Anything exposing `to_frame` (a Series) is turned
            into a one-column frame whose column is titled `name`; a DataFrame is
            rendered as-is and its `name` is not shown.
    """
    def _as_html(data, name):
        if hasattr(data, 'to_frame'):  # Series
            return data.to_frame(name=name).to_html()
        return data.to_html()  # DataFrame

    cells = ''.join(f"<div>{_as_html(data, name)}</div>" for data, name in tables)

    # Flex container that lays the per-table <div>s out in one row
    pad = ' ' * 12
    flex_row = (
        f'\n{pad}<div style="display:flex; gap:40px">'
        f'\n{pad}    {cells}'
        f'\n{pad}</div>'
        f'\n{pad}'
    )
    display(HTML(flex_row))

# Pairs found in only one of the two resources, according to the merge indicator
tris_in_extri2 = merged.loc[merged['_merge'] == 'left_only']
tris_in_collectri2 = merged.loc[merged['_merge'] == 'right_only']
print(f"There are {len(tris_in_extri2)} TRIs only in ExTRI2 and {len(tris_in_collectri2)} TRIs only in collecTRI2.")
print("Showing top 10 TFs involved in unique TRIs in each resource:")

# Number of resource-specific pairs per TF, ten most frequent TFs for each resource
top_tfs_extri2 = tris_in_extri2[tf_col].value_counts().head(10)
top_tfs_collectri2 = tris_in_collectri2[tf_col].value_counts().head(10)
display_side_by_side(
    (top_tfs_extri2, 'TRIs only in ExTRI2'),
    (top_tfs_collectri2, 'TRIs only in collecTRI2'),
)

There are 9202 TRIs only in ExTRI2 and 20659 TRIs only in collecTRI2.
Showing top 10 TFs involved in unique TRIs in each resource:


Unnamed: 0_level_0,TRIs only in ExTRI2
Transcription Factor (Associated Gene Name),Unnamed: 1_level_1
AP1,1203
CBLL2,281
STING1,231
PRKN,210
H2BC1,208
TGFB1,166
MRTFA,155
TP53,118
NFKB,111
AKT1,98

Unnamed: 0_level_0,TRIs only in collecTRI2
Transcription Factor (Associated Gene Name),Unnamed: 1_level_1
NFKB2,2470
RELB,2439
NFKB1,2279
RELA,1712
MAPK3,1114
JUN,668
MAPK1,415
MAP2K2,350
MAP2K1,226
SMAD2,199


In [None]:
# TODO - Miguel also added GO terms. Did he take them from the TF table?
# How did he use this table? Everything should be mapped to human, whereas these tables map to the rodent one. This still needs to be understood.

In [None]:
# SHOW NTNU TFs & TGs NOT IN EXTRI2
# TODO - Ask Miguel how he fixed that: how did he map them to human?
for label, gene_col in (('TFs:', tf_col), ('TGs:', tg_col)):
    # Symbols present in the NTNU-curated subset but absent from the ExTRI2 subset of CollecTRI2
    print(label, set(collectri2_ntnu[gene_col]) - set(collectri2_extri2[gene_col]))

TFs: {'NSD2', 'FUBP1', 'ZBTB8B', 'FUBP3', 'ZBED1', 'BAX', 'NPR1', 'TSC22D3', 'TXK', 'ZFHX2', 'BCCIP', 'GCFC2', 'TFF1'}
TGs: {'SAA3P', 'BPIFA4P', 'IGHE', 'RNU1-1', 'PLAAT3', 'DEPP1', 'ADSS2', 'MIR155HG', 'RPL30', 'CNMD', 'TINAG', 'SNORD79', 'TWNK', 'C6', 'TPK1', 'KIFBP', 'TRARG1', 'HSP90AA2P', 'DYM', 'ERICD', 'PRR9', 'EMC10', 'SIM1', 'SLC22A24', 'MARCHF7', 'H1-6', 'DNAAF4', 'TMEM131L', 'SELENOP', 'RSPH1', 'AMY1A', 'DONSON', 'CCDC59', 'VGLL2', 'TDGF1', 'RIOX2', 'IGHG1', 'NCF1C', 'PRKN', 'LORICRIN', 'JPT1', 'CYP21A1P', 'ARHGDIG', 'CAVIN1', 'LCN8', 'MRE11', 'CTAG1B', 'HBA1', 'DSCR4', 'C1QTNF12', 'LNP1', 'TLCD3B', 'MIR93', 'DEFA3', 'GSDME', 'NFKB', 'HSP90B2P', 'H2AC18', 'MT-ATP6', 'UCA1', 'CCN1', 'TPSD1', 'ARMCX1'}


In [67]:
# TODO - Something weird is happening with the AP1/NFKB families in collectri. I did this to check.
# Remove once the problem has been identified :)
weird_TFs = ['NFKB2', 'RELB', 'RELA', 'NFKB1', 'JUN', 'NFKB', 'AP1']

# ExTRI2: de-duplicate the TF–TG pairs first, then count the pairs each TF takes part in
unique_pairs = ExTRI2_df[['TF_human_symbol', 'TG_human_symbol']].drop_duplicates()
extri_pairs_per_tf = unique_pairs['TF_human_symbol'].value_counts()
extri_counts = extri_pairs_per_tf.reindex(weird_TFs, fill_value=0).rename('TRIs In ExTRI2')

# CollecTRI: rows are taken to be unique TF–TG pairs already, so rows per TF are counted directly
collectri_rows_per_tf = collectri2['Transcription Factor (Associated Gene Name)'].value_counts()
collectri_counts = collectri_rows_per_tf.reindex(weird_TFs, fill_value=0).rename('TRIs in collecTRI')

# One row per TF of interest, one column per resource (0 where the TF is absent)
weird_summary = pd.concat([extri_counts, collectri_counts], axis=1).astype(int)
weird_summary.index.name = 'TF'
display(weird_summary.reset_index().style.hide(axis="index"))

TF,TRIs In ExTRI2,TRIs in collecTRI
NFKB2,150,2642
RELB,186,2649
RELA,1065,2774
NFKB1,395,2701
JUN,934,1629
NFKB,2793,2633
AP1,1233,0


In [None]:
# TODO - collecTRI's extri TF type is the rodent one, not the human one. Ensure it is fixed
# Frequency of each TF-type value in CollecTRI2, with missing values counted too
tf_type_col = '[ExTRI2] Transcription Factor Type'
collectri2[tf_type_col].value_counts(dropna=False)

[ExTRI2] Transcription Factor Type
dbTF                             115822
coTF candidate                    81649
coTF                              27944
NaN                               11481
-                                  1391
dbTF|dbTF                           649
coTF candidate|coTF candidate        96
coTF|coTF                            27
dbTF|dbTF|dbTF                        6
dbTF|dbTF|dbTF|dbTF                   1
coTF|coTF candidate                   1
Name: count, dtype: int64

## Comments from solved old tests

In [73]:
# SHOW DISCREPANCIES BETWEEN AUTOMATIC & MANUAL TF TYPE
cols = ['human_gene_ID', 'human_TF_type']

# Get human_tf_type from manually-retrieved orthologs
df1 = mr_TF_orthologs[mr_TF_orthologs['human_gene_ID'].notna()][cols].sort_values(by=cols).reset_index(drop=True)
# Get human_tf_type from ExTRI
df2 = tfs_in_extri2[tfs_in_extri2['Gene ID'].isin(mr_TF_orthologs_geneIDs)][cols].sort_values(by=cols).reset_index(drop=True)

# Put both TF types side by side for every shared human gene ID
tf_type_comparison = df1.merge(df2, on='human_gene_ID', suffixes=('_mr', '_extri'))
mr_type = tf_type_comparison['human_TF_type_mr']
extri_type = tf_type_comparison['human_TF_type_extri']
# NaN != NaN evaluates to True, so rows where BOTH types are missing are excluded explicitly:
# they are not a disagreement between the manual and the automatic annotation.
differs = (mr_type != extri_type) & ~(mr_type.isna() & extri_type.isna())
tf_type_discrepancies = tf_type_comparison[differs]

# The count is computed rather than hard-coded so the message cannot go stale
print(f"There are {len(tf_type_discrepancies)} manually curated TFs whose curated TF type differs from the automatically created ExTRI2 type.\n"
      "We are keeping the ExTRI2 automatic human_TF_type")
display(tf_type_discrepancies)

There are 2 manually curated TFs curated TF type differs from the automatically created ExTRI2 type.
We are keeping the ExTRI2 automatic human_TF_type


Unnamed: 0,human_gene_ID,human_TF_type_mr,human_TF_type_extri
29,4332,coTF,coTF candidate
60,6672,,dbTF


In [74]:
# REASON WHY THERE ARE HUMAN GENES WITHOUT UPPERCASE
def load_all_human_genes():
    """Load the table of human genes and append the AP1 / NFKB complexes as two extra rows.

    Reads the tab-separated file at config['all_human_TGs_p']
    (downloaded from: https://www.ncbi.nlm.nih.gov/datasets/gene/taxon/9606/),
    keeps five descriptive columns and replaces a missing 'Gene Type' by the string 'nan'.

    Returns: pd.DataFrame with columns
        'NCBI GeneID', 'Symbol', 'Gene Type', 'Taxonomic Name', 'Description'.
    Raises: AssertionError if the table holds more than one taxonomic name, a name
        other than 'Homo sapiens', or duplicated gene IDs.
    """
    wanted_columns = ['NCBI GeneID', 'Symbol', 'Gene Type', 'Taxonomic Name', 'Description']
    ncbi_genes = pd.read_csv(config['all_human_TGs_p'], sep='\t', header=0, dtype=str)[wanted_columns]
    # A missing gene type becomes the literal string 'nan'
    ncbi_genes = ncbi_genes.assign(**{'Gene Type': ncbi_genes['Gene Type'].fillna('nan')})

    # The AP1 and NFKB complexes have no gene ID of their own, so they get 'Complex:' pseudo-IDs
    complexes = pd.DataFrame({
        'NCBI GeneID': ['Complex:AP1', 'Complex:NFKB'],
        'Symbol': ['AP1', 'NFKB'],
        'Description': ['', ''],
        'Taxonomic Name': ['Homo sapiens', 'Homo sapiens'],
        'Gene Type': ['PROTEIN_CODING', 'PROTEIN_CODING'],
    })
    all_human_genes = pd.concat([ncbi_genes, complexes], ignore_index=True)

    # Sanity checks: a single taxon, that taxon is human, and no gene ID appears twice
    taxa = all_human_genes['Taxonomic Name']
    assert taxa.nunique() == 1
    assert taxa.unique()[0] == 'Homo sapiens'
    assert all_human_genes['NCBI GeneID'].duplicated().sum() == 0

    return all_human_genes
all_human_genes = load_all_human_genes()
tf_col = 'Transcription Factor (Associated Gene Name)'

# Masks over the human gene table: symbols that are not fully upper-case,
# and among those the ones containing 'orf'
symbols = all_human_genes['Symbol']
m_lowercase = symbols != symbols.str.upper()
m_orfs = m_lowercase & symbols.str.contains('orf')
orf_examples = symbols[m_orfs].head(3).tolist()

print("REASON WHY ARE THERE NON-UPPER-CASE HUMAN SYMBOLS")
print(
    f"There are {m_lowercase.sum()} human genes with non-uppercase symbols. "
    f"\nOf these, {m_orfs.sum()} are ORFs (e.g., {orf_examples})."
    "\nNon-ORFs from all_human_genes table:"
)
display(all_human_genes[m_lowercase & ~m_orfs])

# Same check on the CollecTRI TF symbols: only the orf one shows up!
collectri_tfs = collectri2[tf_col]
m = collectri_tfs.str.upper() != collectri_tfs
print("In CollecTRI, however, we only have orfs. ")
display(collectri_tfs[m])

REASON WHY ARE THERE NON-UPPER-CASE HUMAN SYMBOLS
There are 280 human genes with non-uppercase symbols. 
Of these, 277 are ORFs (e.g., ['C9orf72', 'C19orf12', 'C19orf33']).
Non-ORFs from all_human_genes table:


Unnamed: 0,NCBI GeneID,Symbol,Gene Type,Taxonomic Name,Description
183794,222029,DKFZp434L192,ncRNA,Homo sapiens,uncharacterized ...
184134,259265,bA255A11.4,PSEUDO,Homo sapiens,melanoma antigen...
187485,401282,DKFZp451B082,ncRNA,Homo sapiens,uncharacterized ...


In CollecTRI, however, we only have orfs. 


202057    C6orf89
227547    C6orf89
Name: Transcription Factor (Associated Gene Name), dtype: object