**Generate manusript tables**

# Imports

In [1]:
import os
import pathlib
import re
import textwrap

import pandas as pd

In [2]:
version_ps = "0.1.25"

In [3]:
sig_cutoff = .05

# Load data

In [4]:
analysis = 'v9_6'

fpath_project_dir = pathlib.Path(os.path.join(os.getcwd(), os.pardir))
fpath_supplement = fpath_project_dir.resolve().joinpath("supplement")
fpath_dashboard_dir = fpath_supplement.joinpath("scripts").joinpath(analysis)

## Cohort dashboard

In [5]:
fpath_df = fpath_dashboard_dir.joinpath("cohort_dashboard.txt")

dashboard_cohort = pd.read_csv(fpath_df, sep="\t", index_col="#cohort")
dashboard_cohort.index.name = "cohort"

dashboard_cohort.head(2)

Unnamed: 0_level_0,individuals,females,males,n_unknown_sex,total_hpo,total_measurements,hpo_version,gpsea_version,n_total_individual_count,n_alive,n_deceased,n_unknown_vital,n_with_age_of_last_encounter,n_with_onset,n_diseases,disease_string
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ABCB7,18,0,18,0,52,0,2025-01-16,0.9.6,18,0,0,18,0,8,1,"Anemia, sideroblastic, and spinocerebellar ataxia"
ACADM,115,0,0,115,0,31,2025-01-16,0.9.6,115,0,0,115,0,0,1,"Acyl-CoA dehydrogenase, medium chain, deficien..."


## Fisher exact test dashboard

In [6]:
fpath_df = fpath_dashboard_dir.joinpath("fisher_exact_test_dashboard.txt")

dashboard_fet = pd.read_csv(fpath_df, sep="\t", index_col="#cohort_name")
dashboard_fet.index.name = "cohort"

dashboard_fet.head(2)

Unnamed: 0_level_0,total_hpo_testable,total_hpo_tested,a_genotype,b_genotype,nsig
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ABCB7,149,10,p.Gly682Ser,Other variant,0
ABCB7,149,12,ABC transmembrane type-1,Other region,0


## Sig Fisher exact test dashboard

In [7]:
fpath_df = fpath_dashboard_dir.joinpath("sig_fisher_exact_test_dashboard.txt")

dashboard_sig_fet = pd.read_csv(fpath_df, sep="\t", index_col="#cohort_name")
dashboard_sig_fet.index.name = "cohort"

dashboard_sig_fet.head(2)

Unnamed: 0_level_0,a_genotype,b_genotype,nsig,n_tests_performed,hpo_item,with_geno_a,with_geno_b,pval,adj_pval
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ATP13A2,OMIM:606693,OMIM:617225,2,29,Bradykinesia [HP:0002067],30/32 (94%),4/10 (40%),0.000915,0.013
ATP13A2,OMIM:606693,OMIM:617225,2,29,Parkinsonism [HP:0001300],28/28 (100%),3/11 (27%),3e-06,7.8e-05


## Measurement dashboard

In [8]:
fpath_df = fpath_dashboard_dir.joinpath("measurement_dashboard.txt")

dashboard_measurement = pd.read_csv(fpath_df, sep="\t", index_col="#cohort")
dashboard_measurement.index.name = "cohort"

dashboard_measurement.head(2)

Unnamed: 0_level_0,a_genotype,b_genotype,name,test_name,description,variable_name,pval,interpretation,xrefs
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ACADM,K329Q/K329Q,K329Q/other OR other/other,Measurement,t-test,Value of MCAD Activity% [LOINC:74892-1],LOINC:74892-1,6.12e-10,"Mean MCAD activity for K329/K329: 0.52\%, and ...",PMID:33580884
ACADM,Y67H/Y67H OR Y67H/other,other/other,Measurement,t-test,Value of MCAD Activity% [LOINC:74892-1],LOINC:74892-1,2.01e-05,"Mean MCAD activity for Y67H/Y67H: 18.60\%, and...",PMID:33580884


# Compute statistics

In [9]:
# Simplify the access
cohort_individuals = dashboard_cohort['individuals']

In [10]:
cohorts = dashboard_cohort.index.unique()

# All fet, sig fet, and measurement cohorts are in the cohorts dashboard.
assert len(dashboard_fet.index.difference(cohorts)) == 0
assert len(dashboard_sig_fet.index.difference(cohorts)) == 0
assert len(dashboard_measurement.index.difference(cohorts)) == 0

## N cohorts

In [11]:
# Number of analyzed cohorts
n_cohorts = len(cohorts)
n_cohorts

85

## N affected individuals

Robinow syndrome cohorts consist of several Phenopacket Store cohorts.

In [12]:
import ppktstore
from phenopackets.schema.v2.core.individual_pb2 import MALE, FEMALE, UNKNOWN_SEX

# The gene responsible for ROBINOW SYNDROME, AUTOSOMAL DOMINANT 2; DRS2 
rs_ad_gene = "DVL1"

registry = ppktstore.registry.configure_phenopacket_registry()
with registry.open_phenopacket_store(version_ps) as ps:
    cohort = ps.cohort_for_name("DVL1")
    n_rs_ad = len(cohort.phenopackets)
    n_rs_ad_males = len(list(filter(lambda pp: pp.phenopacket.subject.sex == MALE, cohort.phenopackets)))
    n_rs_ad_females = len(list(filter(lambda pp: pp.phenopacket.subject.sex == FEMALE, cohort.phenopackets)))
    n_rs_ad_unknown_sex = len(list(filter(lambda pp: pp.phenopacket.subject.sex == UNKNOWN_SEX, cohort.phenopackets)))

# Count of individuals with `ROBINOW SYNDROME, AUTOSOMAL DOMINANT 2; DRS2`
# which are only members of `Robinow syndrome` cohort.
n_rs_ad, n_rs_ad_males, n_rs_ad_females, n_rs_ad_unknown_sex

(16, 7, 7, 2)

In [13]:
# These GPSEA-CS cohorts consist of multiple Phenopacket Store cohorts.

non_multigene_cohort = dashboard_cohort.index[~(
    dashboard_cohort.index.str.startswith("LDS") |\
    dashboard_cohort.index.str.startswith("Robinow") |\
    dashboard_cohort.index.str.startswith("Kabuki")
)]
non_multigene_cohort

Index(['ABCB7', 'ACADM', 'ACBD6', 'AIRE', 'ANKRD11', 'ASPM', 'ATP13A2',
       'ATP6V0C', 'BRD4', 'CHD8', 'CLDN16', 'CNTNAP2', 'COL3A1', 'COQ4',
       'CTCF', 'CYP21A2', 'EHMT1', 'EZH1', 'FBN1', 'FBXL4', 'FBXO11', 'FGD1',
       'FZD5', 'GLI3', 'HMGCS2', 'IKZF1', 'ITPR1', 'KCNH5', 'KDM6A', 'KDM6B',
       'KMT2D', 'LMNA', 'LZTR1', 'MAPK8IP3', 'MPV17', 'NBAS', 'NF1', 'NIPBL',
       'NKX6-2', 'PIGA', 'POGZ', 'PPP2R1A', 'PTPN11', 'RERE', 'RNU4-2', 'ROR2',
       'RPGRIP1', 'SAMD9L', 'SATB2', 'SCN2A', 'SCO2', 'SEC61A1', 'SETD2',
       'SF3B4', 'SLC32A1', 'SLC45A2', 'SLC4A1', 'SMAD2', 'SMAD3', 'SMARCB1',
       'SMARCC2', 'SON', 'SPTAN1', 'STXBP1', 'SUOX', 'TBCK', 'TBX1', 'TBX5',
       'TGFB2', 'TGFB3', 'TGFBR1', 'TGFBR2', 'TRAF7', 'U2AF2', 'UMOD', 'WWOX',
       'ZFX', 'ZMYM3', 'ZNF462', 'ZSWIM6'],
      dtype='object', name='cohort')

### Number of analyzed individuals

In [14]:
n_individuals = dashboard_cohort.loc[non_multigene_cohort, 'individuals'].sum()

n_individuals += n_rs_ad
n_individuals

np.int64(6179)

### Number of analyzed individuals with sex information available

In [15]:
n_males_females = dashboard_cohort.loc[
    non_multigene_cohort, 
    ['males', 'females'],
].sum()
n_males_females['males'] += n_rs_ad_males
n_males_females['females'] += n_rs_ad_males

n_w_sex_info = n_males_females.sum()
n_w_sex_info

np.int64(5102)

## N genes

In [16]:
#n_lds_genes = 4  # 4 LDS genes: {TGFBR1, TGFBR2, SMAD2, SMAD3}
#n_kabuki = 2 # but each gene has a separate analysis
n_robinow_genes = 1 # 2 Robinow syndrome genes: {DVL1, ROR2} but ROR2 has a separate analysis

n_genes = len(non_multigene_cohort) + n_robinow_genes
n_genes

81

## N diseases

In [17]:
diseases_uq = dashboard_cohort['disease_string'].str.split(';').explode().unique()
n_diseases = len(diseases_uq)
n_diseases

122

## Cohorts with a significant GPC

### Significant FET

In [18]:
fet_cohorts_w_sig_gpc = dashboard_fet.loc[dashboard_fet['nsig'] > 0].index
fet_cohorts_w_sig_gpc

Index(['ATP13A2', 'EHMT1', 'FBN1', 'FBN1', 'FBN1', 'FBN1', 'FBXL4', 'FGD1',
       'GLI3', 'GLI3', 'GLI3', 'IKZF1', 'ITPR1', 'ITPR1', 'ITPR1', 'KCNH5',
       'KDM6A', 'KDM6A', 'Kabuki', 'LDS 1 and 3', 'LDS 3 and 6', 'LMNA',
       'LMNA', 'LMNA', 'LMNA', 'MPV17', 'NBAS', 'NF1', 'NF1', 'NF1', 'NF1',
       'NF1', 'PTPN11', 'RPGRIP1', 'RPGRIP1', 'Robinow', 'SAMD9L', 'SATB2',
       'SCN2A', 'SCN2A', 'SCO2', 'SETD2', 'SETD2', 'SMAD3', 'SMARCB1',
       'SMARCC2', 'SPTAN1', 'SPTAN1', 'SPTAN1', 'SUOX', 'TBCK', 'TBX1', 'TBX5',
       'TBX5', 'TGFBR1', 'TGFBR1', 'UMOD', 'UMOD', 'WWOX', 'ZFX', 'ZMYM3'],
      dtype='object', name='cohort')

### Significant measurement or survival

In [19]:
mes_surv_cohorts_w_sig_gpc = dashboard_measurement.loc[dashboard_measurement['pval'] < sig_cutoff].index
mes_surv_cohorts_w_sig_gpc

Index(['ACADM', 'ACADM', 'AIRE', 'ANKRD11', 'ANKRD11', 'CHD8', 'CHD8',
       'CLDN16', 'CNTNAP2', 'CTCF', 'CYP21A2', 'FBXL4', 'HMGCS2', 'LMNA',
       'MPV17', 'MPV17', 'RERE', 'SETD2', 'SUOX', 'UMOD'],
      dtype='object', name='cohort')

### Cohorts with a significant GPC

In [20]:
cohorts_w_sig_gpc = fet_cohorts_w_sig_gpc.union(mes_surv_cohorts_w_sig_gpc).unique()
n_cohorts_w_sig_gpc = len(cohorts_w_sig_gpc)
n_cohorts_w_sig_gpc

48

## Significant GPC count

### Significant FET

In [21]:
n_sig_fet = dashboard_fet['nsig'].sum()

n_sig_fet

np.int64(233)

### Significant measurement or survival

In [22]:
sig_measurement_or_survival = dashboard_measurement.loc[dashboard_measurement['pval'] < sig_cutoff].index
n_sig_measurement_or_survival = len(sig_measurement_or_survival)

n_sig_measurement_or_survival

20

### Significant GPC count

In [23]:
n_sig_gpcs = n_sig_fet + n_sig_measurement_or_survival
n_sig_gpcs

np.int64(253)

# Manuscript sections

## Abstract

In [24]:
textwrap.wrap(
f'''We applied GPSEA to {n_cohorts} cohorts with {n_individuals} previously published individuals
with variants in one of {n_genes} genes associated with {n_diseases} Mendelian diseases
and identified {n_sig_gpcs} significant GPCs,
with {n_cohorts_w_sig_gpc} cohorts having at least one statistically significant GPC.
''',
)

['We applied GPSEA to 85 cohorts with 6179 previously published',
 'individuals with variants in one of 81 genes associated with 122',
 'Mendelian diseases and identified 253 significant GPCs, with 48',
 'cohorts having at least one statistically significant GPC.']

## Introduction

In [25]:
textwrap.wrap(
f"""We applied the software to {n_cohorts} cohorts,
{n_cohorts_w_sig_gpc} ({n_cohorts_w_sig_gpc/n_cohorts:.0%}) of which had at least one statistically significant GPC
(there were a total of {n_sig_gpcs} statistically significant results)
""")

['We applied the software to 85 cohorts, 48 (56%) of which had at least',
 'one statistically significant GPC (there were a total of 253',
 'statistically significant results)']

## Material and methods

### Input data

Phenopacket Store statistics are summarized in [PhenopacketStoreStats](https://github.com/monarch-initiative/phenopacket-store/blob/0.1.25/PhenopacketStoreStats.ipynb) notebook.

### Cohorts

In [26]:
textwrap.wrap(
f"""A total of {n_cohorts} cohorts were chosen for GPSEA analysis from version {version_ps} of Phenopacket Store.
The cohorts had a mean of {cohort_individuals.mean()} individuals (median {cohort_individuals.median()}, minimum {cohort_individuals.min()}, maximum {cohort_individuals.max()}).
The cohorts comprised information in {n_individuals} individuals.
Information on the sex of participants was available for {n_w_sex_info/n_individuals:.1%} of these individuals,
with {n_males_females['males'] / n_males_females.sum():.0%} being male
and {n_males_females['females'] / n_males_females.sum():.0%} female.
""", 
)

['A total of 85 cohorts were chosen for GPSEA analysis from version',
 '0.1.25 of Phenopacket Store. The cohorts had a mean of 77.8',
 'individuals (median 49.0, minimum 16, maximum 462). The cohorts',
 'comprised information in 6179 individuals. Information on the sex of',
 'participants was available for 82.6% of these individuals, with 53%',
 'being male and 47% female.']

## Results

### Table 1

In [27]:
stat_procedures = [
    "Categorical analysis",
    "t test",
    "HPO onset",
    "Disease onset",
    "Mortality",
    "Phenotype scores",
    "Disease diagnosis",
    "Sex differences",
]
stat_proc2cohorts_tested = {k: set() for k in stat_procedures}
stat_proc2tests_performed = {k: 0 for k in stat_procedures}
stat_proc2sig_tests = {k: 0 for k in stat_procedures}

tableS10_sig = pd.DataFrame(
    data=0,
    index=cohorts,
    columns=stat_procedures,
)

#### FET

In [28]:
male_female = ("MALE", "FEMALE")

sex_diffs = "Sex differences"
dg_diff = "Disease diagnosis"
cat_analysis = "Categorical analysis"

for cohort_name, row in dashboard_fet.iterrows():
    if row['a_genotype'] in male_female and row['b_genotype'] in male_female:
        # Testing for sex differences
        stat_proc2cohorts_tested[sex_diffs].add(cohort_name)
        stat_proc2tests_performed[sex_diffs] += row["total_hpo_tested"]
        stat_proc2sig_tests[sex_diffs] += row["nsig"]

        tableS10_sig.loc[cohort_name, sex_diffs] += row["nsig"] # type: ignore
    elif row['a_genotype'].startswith("OMIM:") and row["b_genotype"].startswith("OMIM:"):
        # Testing for differences in phenotypes between disease diagnoses
        stat_proc2cohorts_tested[dg_diff].add(cohort_name)
        stat_proc2tests_performed[dg_diff] += row["total_hpo_tested"]
        stat_proc2sig_tests[dg_diff] += row["nsig"]

        tableS10_sig.loc[cohort_name, dg_diff] += row["nsig"] # type: ignore
    else:
        # Categorical analysis
        stat_proc2cohorts_tested[cat_analysis].add(cohort_name)
        stat_proc2tests_performed[cat_analysis] += row["total_hpo_tested"]
        stat_proc2sig_tests[cat_analysis] += row["nsig"]

        tableS10_sig.loc[cohort_name, cat_analysis] += row["nsig"] # type: ignore

#### Measurements

In [29]:
def test_name2_stat_procedure(name: str) -> str:
    if name.startswith("Onset"):
        if "OMIM" in name:
            return "Disease onset"
        else:
            return "HPO onset"
    elif name == "Age of death":
        return "Mortality"
    elif name == "t-test":
        return "t test"
    elif name.endswith("Score") or name.endswith("Count"):
        return "Phenotype scores"
    else:
        raise ValueError(f"Unexpected test name {name}")

# Not covering the following since they are not stored in the measurement dashboard:
# - Sex differences
# - Disease diagnosis
# - Categorical analysis

In [30]:

for cohort_name, row in dashboard_measurement.iterrows():
    procedure = test_name2_stat_procedure(row['test_name'])
    stat_proc2cohorts_tested[procedure].add(cohort_name)
    stat_proc2tests_performed[procedure] += 1
    if row['pval'] < sig_cutoff:
        stat_proc2sig_tests[procedure] += 1
        tableS10_sig.loc[cohort_name, procedure] += 1 # type: ignore

In [31]:
table1 = pd.DataFrame(
    {
        'Cohorts tested': {k: len(cohorts) for k, cohorts in stat_proc2cohorts_tested.items()},
        'Tests performed': stat_proc2tests_performed,
        'Significant tests': stat_proc2sig_tests,
    }
)
table1

Unnamed: 0,Cohorts tested,Tests performed,Significant tests
Categorical analysis,78,6736,217
t test,2,3,3
HPO onset,4,6,3
Disease onset,10,11,6
Mortality,3,3,1
Phenotype scores,6,9,7
Disease diagnosis,8,266,15
Sex differences,44,1979,1


In [32]:
n_cohorts, table1['Tests performed'].sum(), table1['Significant tests'].sum()

(85, np.int64(9013), np.int64(253))

### Check Supplemental tables S3-9

Ensure the counts of significant tests are consistent between the supplemental tables and the `Significant tests` column.
The tables S3-9 list only the significant findings.

The review is done manually, by looking at `supplement_tables.tex` and the `table1` data frame.

### Supplemental table S10

#### Significant GPC counts

In [33]:
# Check that the cohorts with at least one significant GPC match
# between Table S10 and the results derived from dashboards.

assert set(cohorts_w_sig_gpc) == set(tableS10_sig.index[tableS10_sig.sum(axis=1) > 0])

In [34]:
assert tableS10_sig.loc[cohorts_w_sig_gpc].sum(axis=1).sum() == n_sig_gpcs
assert len(tableS10_sig.loc[cohorts_w_sig_gpc]) == n_cohorts_w_sig_gpc

#### Published GPC counts

The numbers were ascertained by manual literature review.

In [35]:
#                                   |<--  Survival -->|
#   Cohort  Sig  Pub  Catg    t tst HPO   Dis.  Mort. Pscr  DisDg SexDiff
published_gpcs_latex = """
ACADM       &  2 &  2 &   -   & 2/2 &  -  &  -  &  -  &  -  &  -  &  -  \\
AIRE        &  1 &  1 &   -   &  -  & 1/1 &  -  &  -  &  -  &  -  &  -  \\
ANKRD11     &  2 &  1 &   -   &  -  &  -  &  -  &  -  & 1/2 &  -  &  -  \\
ATP13A2     &  2 &  2 &   -   &  -  &  -  &  -  &  -  &  -  & 2/2 &  -  \\
CHD8        &  2 &  1 &   -   &  -  &  -  &  -  &  -  & 1/2 &  -  &  -  \\
CLDN16      &  1 &  1 &   -   &  -  & 1/1 &  -  &  -  &  -  &  -  &  -  \\
CNTNAP2     &  1 &  0 &   -   &  -  &  -  & 0/1 &  -  &  -  &  -  &  -  \\
CTCF        &  1 &  0 &   -   &  -  &  -  &  -  &  -  & 0/1 &  -  &  -  \\
CYP21A2     &  1 &  1 &   -   & 1/1 &  -  &  -  &  -  &  -  &  -  &  -  \\
EHMT1       &  1 &  1 &  1/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
FBN1        & 14 & 14 & 14/14 &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
FBXL4       &  2 &  0 &  0/1  &  -  &  -  & 0/1 &  -  &  -  &  -  &  -  \\
FGD1        &  1 &  1 &  1/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
GLI3        & 14 & 14 & 14/14 &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
HMGCS2      &  1 &  0 &    -  &  -  &  -  & 0/1 &  -  &  -  &  -  &  -  \\
IKZF1       &  2 &  2 &  2/2  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
ITPR1       & 21 &  0 &  0/21 &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
Kabuki      &  2 &  0 &    -  &  -  &  -  &  -  &  -  &  -  & 0/2 &  -  \\
KCNH5       &  1 &  1 &  1/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
KDM6A       &  2 &  1 &  1/1  &  -  &  -  &  -  &  -  &  -  &  -  & 0/1 \\
LDS 1 and 3 &  4 &  0 &    -  &  -  &  -  &  -  &  -  &  -  & 0/4 &  -  \\
LDS 3 and 6 &  1 &  0 &    -  &  -  &  -  &  -  &  -  &  -  & 0/1 &  -  \\
LMNA        & 36 & 36 & 35/35 &  -  &  -  &  -  &  -  & 1/1 &  -  &  -  \\
MPV17       &  3 &  1 &  0/1  &  -  &  -  & 0/1 & 1/1 &  -  &  -  &  -  \\
NBAS        &  1 &  0 &  0/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
NF1         & 37 & 37 & 37/37 &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
PTPN11      &  4 &  0 &  0/4  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
RERE        &  1 &  1 &   -   &  -  &  -  &  -  &  -  & 1/1 &  -  &  -  \\
Robinow     &  4 &  0 &   -   &  -  &  -  &  -  &  -  &  -  & 0/4 &  -  \\
RPGRIP1     &  3 &  1 &  1/1  &  -  &  -  &  -  &  -  &  -  & 0/2 &  -  \\
SAMD9L      &  3 &  0 &  0/3  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
SATB2       &  1 &  0 &  0/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
SCN2A       &  9 &  9 &  9/9  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
SCO2        &  1 &  0 &  0/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
SETD2       &  9 &  8 &  8/8  &  -  &  -  & 0/1 &  -  &  -  &  -  &  -  \\
SMAD3       &  1 &  0 &  0/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
SMARCB1     &  6 &  0 &  0/6  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
SMARCC2     &  1 &  1 &  1/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
SPTAN1      & 35 & 35 & 35/35 &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
SUOX        &  2 &  1 &  1/1  &  -  &  -  & 0/1 &  -  &  -  &  -  &  -  \\
TBCK        &  2 &  0 &  0/2  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
TBX1        &  2 &  0 &  0/2  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
TBX5        &  3 &  3 &  3/3  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
TGFBR1      &  4 &  1 &  1/4  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
UMOD        &  3 &  3 &  2/2  &  -  & 1/1 &  -  &  -  &  -  &  -  &  -  \\
WWOX        &  1 &  1 &  1/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
ZFX         &  1 &  1 &  1/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
ZMYM3       &  1 &  0 &  0/1  &  -  &  -  &  -  &  -  &  -  &  -  &  -  \\
""".splitlines()[1:] # throw away the first blank line

In [36]:
fraction_re = re.compile(r'(?P<published>\d+)/(?P<total>\d+)')

def parse_value(val: str) -> int:
    stripped = val.strip()
    matcher = fraction_re.match(stripped)
    if matcher:
        return int(matcher.group("published"))
    elif stripped == '-':
        return 0
    else:
        raise ValueError(f"Unexpected value {val}")


In [37]:
tableS10_published = pd.DataFrame(
    data=0,
    index=cohorts,
    columns=stat_procedures,
)

# 0              1    2     3     4       5    6    7      8     9     10
# Cohort         Sig  Pub   Catg  t tst   HPO  Dis. Mort.  Pscr  DisDg SexDiff

for row in published_gpcs_latex:
    cols = row.replace("\\", "").split('&')
    assert len(cols) == 11, f"{len(cols)} should be 11"
    cohort_name = cols[0].strip()

    tableS10_published.loc[cohort_name, cat_analysis] += parse_value(cols[3]) # type: ignore
    tableS10_published.loc[cohort_name, 't test'] += parse_value(cols[4]) # type: ignore
    tableS10_published.loc[cohort_name, 'HPO onset'] += parse_value(cols[5]) # type: ignore
    tableS10_published.loc[cohort_name, 'Disease onset'] += parse_value(cols[6]) # type: ignore
    tableS10_published.loc[cohort_name, 'Mortality'] += parse_value(cols[7]) # type: ignore
    tableS10_published.loc[cohort_name, 'Phenotype scores'] += parse_value(cols[8]) # type: ignore
    tableS10_published.loc[cohort_name, dg_diff] += parse_value(cols[9]) # type: ignore
    tableS10_published.loc[cohort_name, sex_diffs] += parse_value(cols[10]) # type: ignore

tableS10_published.head(10)

Unnamed: 0_level_0,Categorical analysis,t test,HPO onset,Disease onset,Mortality,Phenotype scores,Disease diagnosis,Sex differences
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ABCB7,0,0,0,0,0,0,0,0
ACADM,0,2,0,0,0,0,0,0
ACBD6,0,0,0,0,0,0,0,0
AIRE,0,0,1,0,0,0,0,0
ANKRD11,0,0,0,0,0,1,0,0
ASPM,0,0,0,0,0,0,0,0
ATP13A2,0,0,0,0,0,0,2,0
ATP6V0C,0,0,0,0,0,0,0,0
BRD4,0,0,0,0,0,0,0,0
CHD8,0,0,0,0,0,1,0,0


In [38]:
tableS10 = pd.DataFrame(
    "",
    index=cohorts,
    columns=["Sig", "Pub"] + stat_procedures
)

for cohort_name in cohorts:
    for procedure in stat_procedures:
        nsig = tableS10_sig.loc[cohort_name, procedure]
        if nsig > 0:
            pub = tableS10_published.loc[cohort_name, procedure]
            tableS10.loc[cohort_name, procedure] = f"{pub}/{nsig}"
        else:
            tableS10.loc[cohort_name, procedure] = "-"
    tableS10.loc[cohort_name, "Sig"] = tableS10_sig.loc[cohort_name].sum()
    tableS10.loc[cohort_name, "Pub"] = tableS10_published.loc[cohort_name].sum()

tableS10.loc[cohorts_w_sig_gpc].head(2)

Unnamed: 0_level_0,Sig,Pub,Categorical analysis,t test,HPO onset,Disease onset,Mortality,Phenotype scores,Disease diagnosis,Sex differences
cohort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ACADM,2,2,-,2/2,-,-,-,-,-,-
AIRE,1,1,-,-,1/1,-,-,-,-,-


Generate LaTeX-like content for the Supplemental Table S10 body. The content is to be copy-pasted into `supplement_tables.tex`.

In [39]:
latex_lines = []

for cohort_name, row in tableS10.loc[cohorts_w_sig_gpc].iterrows():
    latex_lines.append(
        " & ".join(
            [cohort_name] + [f"{val}" for val in row]
        ) + " \\\\"
    )
print(os.linesep.join(latex_lines))

ACADM & 2 & 2 & - & 2/2 & - & - & - & - & - & - \\
AIRE & 1 & 1 & - & - & 1/1 & - & - & - & - & - \\
ANKRD11 & 2 & 1 & - & - & - & - & - & 1/2 & - & - \\
ATP13A2 & 2 & 2 & - & - & - & - & - & - & 2/2 & - \\
CHD8 & 2 & 1 & - & - & - & - & - & 1/2 & - & - \\
CLDN16 & 1 & 1 & - & - & 1/1 & - & - & - & - & - \\
CNTNAP2 & 1 & 0 & - & - & - & 0/1 & - & - & - & - \\
CTCF & 1 & 0 & - & - & - & - & - & 0/1 & - & - \\
CYP21A2 & 1 & 1 & - & 1/1 & - & - & - & - & - & - \\
EHMT1 & 1 & 1 & 1/1 & - & - & - & - & - & - & - \\
FBN1 & 14 & 14 & 14/14 & - & - & - & - & - & - & - \\
FBXL4 & 2 & 0 & 0/1 & - & - & 0/1 & - & - & - & - \\
FGD1 & 1 & 1 & 1/1 & - & - & - & - & - & - & - \\
GLI3 & 14 & 14 & 14/14 & - & - & - & - & - & - & - \\
HMGCS2 & 1 & 0 & - & - & - & 0/1 & - & - & - & - \\
IKZF1 & 2 & 2 & 2/2 & - & - & - & - & - & - & - \\
ITPR1 & 21 & 0 & 0/21 & - & - & - & - & - & - & - \\
KCNH5 & 1 & 1 & 1/1 & - & - & - & - & - & - & - \\
KDM6A & 2 & 1 & 1/1 & - & - & - & - & - & - & 0/1 \\
Kabuki & 2 & 

### Characterizing genotype-phenotype correlations with GPSEA

In [40]:
textwrap.wrap(
f"""We tested GPSEA on {n_cohorts} cohorts, covering {n_genes} genes and {n_diseases} diseases.
We first explain the algorithmic approaches to setting up GPC testing and then present
an overview of {n_sig_gpcs} significant correlations identified in the cohorts.
"""
)

['We tested GPSEA on 85 cohorts, covering 81 genes and 122 diseases. We',
 'first explain the algorithmic approaches to setting up GPC testing and',
 'then present an overview of 253 significant correlations identified in',
 'the cohorts.']

### Independent Filtering for Human Phenotype Ontology (IF-HPO)

In [41]:
n_hpo_pre_filter = dashboard_fet['total_hpo_testable']
n_hpo_post_filter = dashboard_fet['total_hpo_tested']

textwrap.wrap(
f"""IF-HPO reduces the total number of tested terms by over ten-fold in the cohorts analyzed here
(before filtering: mean {n_hpo_pre_filter.mean():.0f}, median {n_hpo_pre_filter.median()}, min {n_hpo_pre_filter.min()}, max {n_hpo_pre_filter.max()};
following independent filtering: mean {n_hpo_post_filter.mean():.0f}, median {n_hpo_post_filter.median()}, min {n_hpo_post_filter.min()}, max {n_hpo_post_filter.max()}).
""")

['IF-HPO reduces the total number of tested terms by over ten-fold in',
 'the cohorts analyzed here (before filtering: mean 304, median 276.5,',
 'min 45, max 967; following independent filtering: mean 40, median',
 '28.0, min 1, max 225).']

### GPCs are common in Mendelian disease

In [42]:
textwrap.wrap(
f"""We analyzed {n_cohorts} cohorts with {n_individuals} individuals
(median {cohort_individuals.median()} per cohort, range: {cohort_individuals.min()}-{cohort_individuals.max()})
with {n_diseases} Mendelian diseases.
""")

['We analyzed 85 cohorts with 6179 individuals (median 49.0 per cohort,',
 'range: 16-462) with 122 Mendelian diseases.']

In [43]:
f"A total of {n_sig_gpcs} significant correlations were identified."

'A total of 253 significant correlations were identified.'

In [44]:
f"Significant results were identified for {n_cohorts_w_sig_gpc} cohorts."

'Significant results were identified for 48 cohorts.'

In [45]:
cohorts_wo_pub_gpc = tableS10_published.index[tableS10_published.sum(axis=1) == 0].intersection(cohorts_w_sig_gpc)
cohorts_w_pub_gpc  = tableS10_published.index[tableS10_published.sum(axis=1) > 0].intersection(cohorts_w_sig_gpc)
n_cohorts_w_pub_gpc = len(cohorts_w_pub_gpc)

f"We identified previously published GPCs for {n_cohorts_w_pub_gpc} of these cohorts, many of which overlapped with our findings ..."

'We identified previously published GPCs for 29 of these cohorts, many of which overlapped with our findings ...'

In [46]:
nonpub_sig_gpc = tableS10_sig - tableS10_published
n_nonpub_sig_gpc = nonpub_sig_gpc.sum(axis=1).sum()
n_nonpub_sig_gpc

np.int64(71)

In [47]:
n_cohorts_w_nonpub_sig_gpc = n_cohorts_w_sig_gpc - n_cohorts_w_pub_gpc

textwrap.wrap(
f"""{n_nonpub_sig_gpc} significant findings in the remaining {n_cohorts_w_nonpub_sig_gpc}
cohorts represent candidate GPCs that should be validated by independent studies on validation cohorts ..."""
)

['71 significant findings in the remaining 19 cohorts represent',
 'candidate GPCs that should be validated by independent studies on',
 'validation cohorts ...']

## Distribution of phenotypic features with significant GPCs

See `scripts/README.md` > Proportions.

*-* EOF *-*