# GREGoR Data Tracking and Reporting for U05

In [None]:
# install modules
# %pip install terra-pandas

In [None]:
# import modules
import os
import io
import pandas as pd
import terra_pandas as tp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn3_circles
import seaborn as sns
from functools import reduce
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import datetime
#from IPython.display import Markdown as md

In [None]:
# function(s) for reading in RC reporting

def readGoogleSheet(url):
    """Load a Google Sheet into a DataFrame through its CSV export link.

    The '/edit#gid=' part of the sharing URL is swapped for
    '/export?format=csv&gid=' and the result is handed to ``pd.read_csv``.
    A URL that does not contain that part is passed to ``pd.read_csv`` as is.
    """
    export_link = url.replace('/edit#gid=', '/export?format=csv&gid=')
    return pd.read_csv(export_link)

# function(s) for AnVIL data tables
def readDatatable(data_table, project, workspace):
    """Return one workspace data table as a DataFrame.

    Thin wrapper around ``tp.table_to_dataframe``: ``project`` is forwarded
    as ``workspace_namespace`` and ``workspace`` as ``workspace``.
    """
    return tp.table_to_dataframe(
        data_table,
        workspace_namespace=project,
        workspace=workspace,
    )

def formatIndex(df):
    """Flatten and upper-case the row labels of ``df`` in place.

    Tuple labels (e.g. from a MultiIndex) are joined with '_' after each
    member is converted to text, so non-string members no longer raise.
    Every other label is kept as is. The resulting labels are then
    upper-cased through the ``.str`` accessor.

    Returns the same ``df`` object, whose index has been replaced.
    """
    flat_labels = [
        '_'.join(map(str, label)) if isinstance(label, tuple) else label
        for label in df.index
    ]
    # Assigning a plain list discards the previous index name.
    df.index = flat_labels
    df.index = df.index.str.upper()
    return df

def gregorUploadWorkspaces(upload_cycle):
    """List the upload workspace names for one upload cycle.

    Names follow 'AnVIL_GREGoR_<CENTER>_<upload_cycle>_<CONSENT>' and are
    returned center by center, with the HMB workspace before the GRU one.
    """
    centers = ('BCM', 'BROAD', 'CNH_I', 'GSS', 'UW_CRDR')
    consent_groups = ('HMB', 'GRU')
    return [
        'AnVIL_GREGoR_' + center + '_' + upload_cycle + '_' + consent
        for center in centers
        for consent in consent_groups
    ]

def summarizeData(df, column):
    """Count the rows of ``df`` that fall into each value of ``column``.

    Rows are counted through their index labels; missing values of
    ``column`` form their own group (``dropna=False``).

    Parameters:
        df: table to summarize. It is not modified.
        column: name of the column to group by.

    Returns:
        A one-column DataFrame named 'NO. OF PARTICIPANTS' whose row labels
        are the upper-cased group values (via ``formatIndex``).
    """
    # Add the helper column to a copy: writing it into ``df`` itself would
    # leak into the caller's table and overwrite any real 'participant_id'
    # column that table already has.
    tagged = df.assign(participant_id=df.index)
    counts = pd.DataFrame(tagged.groupby(by=[column], dropna=False)['participant_id'].count())
    counts.columns = ['NO. OF PARTICIPANTS']
    formatIndex(counts)
    counts.columns = counts.columns.str.upper()
    return counts

def participantSummary(df):
    """Count distinct participants and families per center and consent code.

    Parameters:
        df: participant table with 'gregor_center', 'consent_code' and
            'family_id' columns; its index labels are used as participant
            identifiers. It is not modified.

    Returns:
        A DataFrame with 'PARTICIPANTS' and 'FAMILIES' columns, one row per
        (center, consent code) pair plus a final total row; row labels are
        flattened and upper-cased by ``formatIndex``.
    """
    # Add the helper column to a copy so the caller's table keeps its
    # original columns.
    tagged = df.assign(participant_id=df.index)
    summary = tagged.groupby(by=['gregor_center', 'consent_code'], as_index=True)[
        ['participant_id', 'family_id']
    ].nunique()
    # The total row is the column-wise sum of the per-group distinct counts.
    # NOTE(review): a family present in more than one group is counted once
    # per group in this total - confirm that is the intended figure.
    summary.loc["Total"] = summary.sum()
    summary.columns = ['participants', 'families']
    formatIndex(summary)
    summary.columns = summary.columns.str.upper()
    return summary

def experimentMerge(participant_df, analyte_df, experiment_df):
    """Chain participants to analytes to experiments in one table.

    The participant index is matched against ``analyte_df['participant_id']``
    and the index of that intermediate result against
    ``experiment_df['analyte_id']``. Both steps use pandas' default merge
    type, so rows without a match are dropped.
    """
    with_analytes = participant_df.merge(
        analyte_df, left_index=True, right_on='participant_id'
    )
    return with_analytes.merge(
        experiment_df, left_index=True, right_on='analyte_id'
    )

def experimentSummary(participant_df, analyte_df, experiment_df):
    """Tabulate distinct participants per center and experiment type.

    Rows are centers, columns are experiment types (upper-cased), cells are
    distinct participant counts with 0 where a combination is absent. A
    final total row holds the column sums.
    """
    merged = experimentMerge(participant_df, analyte_df, experiment_df)
    per_center = merged.groupby(by=['gregor_center', 'experiment_type'])['participant_id'].nunique()
    table = per_center.unstack().fillna(0).astype(int)
    table.loc["Total"] = table.sum()
    formatIndex(table)
    table.columns = table.columns.str.upper()
    # Drop the 'experiment_type' header left over from unstack().
    table.columns.name = None
    return table

def phenotypeMerge(participant_df, phenotype):
    """Attach each participant's phenotype-term count to the participant rows.

    ``phenotype`` is grouped by 'participant_id' and its non-missing
    'term_id' values are counted. The merge is on the index with the default
    merge type, so participants without any phenotype row are dropped.
    """
    term_counts = phenotype.groupby('participant_id')['term_id'].count().to_frame()
    term_counts.index.name = None
    return participant_df.merge(term_counts, left_index=True, right_index=True)

def combineDataTable(table, upload_workspaces, project='anvil-datastorage'):
    """Stack one data table from several workspaces into a single DataFrame.

    Parameters:
        table: name of the data table to read from every workspace.
        upload_workspaces: iterable of workspace names.
        project: workspace namespace passed to ``readDatatable``; defaults
            to the previously hard-coded 'anvil-datastorage'.

    Returns:
        The row-wise concatenation of every table that could be read, or an
        empty DataFrame when none could.

    A workspace whose table cannot be read is reported and skipped, so one
    failing workspace does not abort the whole run.
    """
    frames = []
    for ws in upload_workspaces:
        # Only the read is guarded; a failure while combining the frames
        # must not be misreported as a missing table.
        try:
            frames.append(readDatatable(table, project, ws))
        except Exception as err:
            print(f"{table} is missing in {ws} ({type(err).__name__}: {err})")
    if not frames:
        return pd.DataFrame([])
    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    return pd.concat(frames)

In [None]:
# Report header: author and the date on which the notebook was run.
timestamp = pd.Timestamp(datetime.datetime.today())
print(
    'Author: DCC',
    f'Last edited: {timestamp.date()}',
    '-----------------------',
    sep='\n',
)

In [None]:
# Workspace identifiers come from environment variables; a missing variable
# raises KeyError (presumably they are set by the notebook runtime - confirm).
project = os.environ['WORKSPACE_NAMESPACE']
workspace = os.environ['WORKSPACE_NAME']
bucket = os.environ['WORKSPACE_BUCKET'] + "/" 

# The upload-cycle label below is hard-coded and must be edited for each new cycle.
print("Current GREGoR upload cycle: U05")
# print("Terra Billing project: " + project)
print("Workspace: " + workspace)
print("Workspace storage bucket: " + bucket)
print('------------------------------------------------------------------------------')

In [None]:
# GREGoR combined RC reporting sheets for U02 and U03
# (only a DNA sheet is loaded for U02; U03 has DNA and RNA sheets)
DNA_U02 = readGoogleSheet("https://docs.google.com/spreadsheets/d/1ndQRPsJW6d8kWIq2j9JHui9-sudrtcyDoUZ_qX821X8/edit#gid=0")
DNA_U03 = readGoogleSheet("https://docs.google.com/spreadsheets/d/1RPBfSwIP7Ev7gUCE4U7iG4fmIoEqCzUUxChsUNo_Pl4/edit#gid=0")
RNA_U03 = readGoogleSheet("https://docs.google.com/spreadsheets/d/1Fh81ta_h7NfB_4ibUSqMjzQNvWUxqsuo08IkxyypR00/edit#gid=0")

In [None]:
# GREGoR combined RC reporting sheets for U04 (DNA, RNA and nanopore)
DNA_U04 = readGoogleSheet("https://docs.google.com/spreadsheets/d/1o4BSe5Gz91I2ZSU-gu1VrdELHBqAcMvfbP8fZkZGweA/edit#gid=0")
RNA_U04 = readGoogleSheet("https://docs.google.com/spreadsheets/d/1wloG-MlOpttSJk41ZnYE3eEX1dglEvL9PSudaG45H7k/edit#gid=0")
NANOPORE_U04 = readGoogleSheet("https://docs.google.com/spreadsheets/d/1PJ8y8fGmb7QQqSCtqXr0BPbI3KISRy0-Ai6wF5sQeRg/edit#gid=0")

In [None]:
# GREGoR combined RC reporting sheets for the current cycle
# (the un-suffixed names are the ones plotted as 'U05' in the figures below)
DNA = readGoogleSheet("https://docs.google.com/spreadsheets/d/1O_AHrHncsHF1XsKO-hiRV_-e_sKsRyZpUyyxM8CzQw0/edit#gid=0")
RNA = readGoogleSheet("https://docs.google.com/spreadsheets/d/1W8YX7LIXtjOsfo8Q8m5vHqC7-5RE5kfPsZ8Az4VfleM/edit#gid=0")
NANOPORE = readGoogleSheet("https://docs.google.com/spreadsheets/d/1VRJgY0LXYqpn6HMpKU1z6436ELydoISBdsgMVyS8JmM/edit#gid=0")
PACBIO = readGoogleSheet("https://docs.google.com/spreadsheets/d/1oD5p7yxFAhsaD4MtjF6U1asP-SOcr8L3yloHTnEaWCE/edit#gid=0")

In [None]:
# read in AnVIL tables from combined consortium
# (all from the workspace this notebook runs in: project / workspace above)
participant = readDatatable('participant', project, workspace)
family = readDatatable('family', project, workspace)
phenotype = readDatatable('phenotype', project, workspace)
analyte = readDatatable('analyte', project, workspace)

# experiment tables, one per data type
experiment_dna_short_read = readDatatable('experiment_dna_short_read', project, workspace)
experiment_rna_short_read = readDatatable('experiment_rna_short_read', project, workspace)
experiment_nanopore = readDatatable('experiment_nanopore', project, workspace)

# aligned-file tables, one per data type
aligned_dna_short_read = readDatatable('aligned_dna_short_read', project, workspace)
aligned_rna_short_read = readDatatable('aligned_rna_short_read', project, workspace)
aligned_nanopore = readDatatable('aligned_nanopore', project, workspace)

genetic_findings = readDatatable('genetic_findings', project, workspace)

In [None]:
# Tables from the previous cycles' combined workspaces; only their row counts
# are used, for the data-growth figure below.
# U04: participant, family and all three aligned-file tables
participant_U04 = readDatatable('participant', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U04')
family_U04 = readDatatable('family', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U04')
aligned_dna_short_read_U04 = readDatatable('aligned_dna_short_read', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U04')
aligned_rna_short_read_U04 = readDatatable('aligned_rna_short_read', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U04')
aligned_nanopore_U04 = readDatatable('aligned_nanopore', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U04')

# U03: no nanopore table is read
participant_U03 = readDatatable('participant', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U03')
family_U03 =readDatatable('family', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U03')
aligned_dna_short_read_U03 = readDatatable('aligned_dna_short_read', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U03')
aligned_rna_short_read_U03 = readDatatable('aligned_rna_short_read', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U03')

# U02: only short-read DNA is read
participant_U02 = readDatatable('participant', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U02')
family_U02 = readDatatable('family', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U02')
aligned_dna_short_read_U02 = readDatatable('aligned_dna_short_read', 'gregor-dcc', 'GREGOR_COMBINED_CONSORTIUM_U02')

## Overview

The GREGoR Data Tracking Report provides summaries of data contributed to the GREGoR Consortium by member Research Centers (RCs). Information regarding data generation and data deposited to AnVIL are derived from the RC quarterly reports and the GREGoR Combined Consortium Workspace. Graphical and tabular summaries of participant, family, experiment, and phenotype information are generated from information provided by member Research Centers (RCs) and specified in the GREGoR data model.

__Abbreviations:__ <br>
__RCs:__ BCM = Baylor College of Medicine Research Center; BROAD = Broad Institute; CNH_I = Children's National Hospital/Invitae; GSS = GREGoR Stanford Site; UW-CRDR = University of Washington Center for Rare Disease Research. <br>
__Consent codes:__ GRU = General research use and clinical care; HMB = Health/medical/biomedical research and clinical care

In [None]:
# Figure 1: row count of each table per upload cycle, ordered U02, U03, U04, U05.
# None marks a cycle for which that table was not loaded above.
d = {'participant': [participant_U02.shape[0], participant_U03.shape[0], participant_U04.shape[0], participant.shape[0]], 
                      'family': [family_U02.shape[0], family_U03.shape[0],family_U04.shape[0], family.shape[0]], 
                      'aligned_dna_short_read': [aligned_dna_short_read_U02.shape[0], aligned_dna_short_read_U03.shape[0], aligned_dna_short_read_U04.shape[0],  aligned_dna_short_read.shape[0]], 
                      'aligned_rna_short_read': [None, aligned_rna_short_read_U03.shape[0], aligned_rna_short_read_U04.shape[0],  aligned_rna_short_read.shape[0]], 
                      'aligned_nanopore': [None, None,  aligned_nanopore_U04.shape[0], aligned_nanopore.shape[0]]}
data_growth_df = pd.DataFrame(d) 
data_growth_df.index = ['U02', 'U03', 'U04', 'U05']
# NOTE(review): the bare expression below is not the cell's last statement, so
# the notebook does not display it.
data_growth_df
# One line per table (column), one x position per upload cycle (index).
sns.lineplot(data = data_growth_df, markers=True, dashes=True, marker='o',  markersize=10, legend="auto", palette = 'bright')
#plt.xlabel('UPLOAD_CYCLE')
#plt.ylabel('COUNT')

plt.legend(facecolor = "white", loc = 'lower right', 
           bbox_to_anchor=(0.5, 0.4, 1.3, 0.6), fontsize = 'large', shadow = True)

plt.show()

__Figure 1.__ The size of the GREGoR Combined Consortium Dataset across 'U02', 'U03', 'U04', and 'U05' upload cycles.

## Participants and Families

__Table 1. The number of participants and families in the GREGoR Combined Consortium Dataset.__

In [None]:
# Table 1: distinct participants and families per center / consent code.
participant_data = participantSummary(participant)
# formatIndex leaves the index unnamed, so the header is set here.
participant_data.index.name = 'GREGoR_CENTER'
participant_data

In [None]:
# Figure 2: grouping participants by family_id gives one row per family whose
# value is that family's member count.
family_size = summarizeData(participant, 'family_id')
family_size.columns = ['FAMILY SIZE']
plt.figure(figsize=(5,3))

sns.set(style="white", font='sans-serif', font_scale=1.2)

# discrete=True: one bar per integer family size.
sns.histplot(data = family_size, x = 'FAMILY SIZE', color='#076839', edgecolor = 'black', discrete= True)
plt.ylabel('COUNT')
#plt.title('Distribution of Family Size in the GREGoR Combined Consortium Dataset')

plt.show()

__Figure 2.__ The distribution of family size in the GREGoR Combined Consortium Dataset.

__Table 2. Summary of participant relationship to the proband.__

In [None]:
# Table 2: participant counts per proband_relationship value, plus a total row.
proband_rel_data = summarizeData(participant, 'proband_relationship')
proband_rel_data.index.name = 'PROBAND_RELATIONSHIP'
proband_rel_data.loc['TOTAL'] = proband_rel_data.sum()
# Labels were upper-cased by summarizeData; show 'SELF' as 'PROBAND'.
proband_rel_data.index = proband_rel_data.index.str.replace('SELF', 'PROBAND')
proband_rel_data

In [None]:
# plot pie chart
# Split the relationship counts into categories shown on their own (> 10
# participants) and small categories that are folded into 'OTHER'.
proband_rel_data = summarizeData(participant, 'proband_relationship')
df = proband_rel_data.loc[proband_rel_data['NO. OF PARTICIPANTS'] > 10]
sub = proband_rel_data.loc[proband_rel_data['NO. OF PARTICIPANTS'] <= 10]
sub_sum = sub['NO. OF PARTICIPANTS'].sum()
s = df[df.index == 'OTHER']
s = s['NO. OF PARTICIPANTS']
# Explicit positional access: the Series is labelled by relationship name, so
# s[0] relied on the deprecated integer-position fallback.
# NOTE(review): assumes an 'OTHER' row with more than 10 participants exists;
# otherwise this raises IndexError.
val = s.iloc[0]
new_val = sub_sum + val
df.index = df.index.str.replace('SELF', 'PROBAND')

In [None]:
# Figure 3: give the 'OTHER' wedge its own count plus the small categories.
# Only the OTHER row is updated; a value-based replace would also rewrite any
# other category that happens to have the same count.
df = df.copy()
df.loc[df.index == 'OTHER', 'NO. OF PARTICIPANTS'] = new_val
labels = df.index.str.lower()

color = ['#076839', '#388660', '#519574', '#6aa488', '#9bc2af']

plt.figure(figsize=(5, 5))
plt.pie(df['NO. OF PARTICIPANTS'], labels = labels, colors = color, autopct='%.2f%%', labeldistance=1.25, 
        radius=1.25, 
        textprops={'fontsize': 10, 'fontname' : 'serif', 'ha' : 'center' , 'color' : 'black'},
        wedgeprops={ 'linewidth' : 1.5, 'edgecolor' : "white" })
plt.tight_layout()
plt.show()

__Figure 3.__ Pie chart showing the percentage of proband relationships in the GREGoR Combined Consortium Dataset. Note: Proband relationship was placed into 'other' if count <= 10.

__Table 3. The number of female and male participants in the GREGoR Combined Consortium Dataset.__

In [None]:
# Table 3: participant counts per value of the 'sex' column.
sex = summarizeData(participant, 'sex')
# NOTE(review): the '%' column holds a fraction of the total rounded to two
# decimals (e.g. 0.51), not a value on a 0-100 scale.
sex['%'] = (sex['NO. OF PARTICIPANTS']/sex['NO. OF PARTICIPANTS'].sum()).round(2)
sex

## Pipeline and Experiment Summaries

### Short-read DNA data

In [None]:
# Figure 4: one panel per research center on a 3x2 grid.
plt.figure(figsize=(14, 10))
plt.subplots_adjust(hspace=0.4)

sns.set(style="white", font='sans-serif', font_scale=1.2)


rc_list = ['BCM', 'BROAD', 'CNH_I', 'GSS', 'UW_CRDR']

for n, rc in enumerate(rc_list):
    ax = plt.subplot(3, 2, n + 1)
    # This center's rows for internally run experiments, one frame per cycle.
    rc_internal_U02 = DNA_U02[(DNA_U02["GREGoR_CENTER"] == rc) & (DNA_U02["EXPERIMENT_LOC"] == "Internal")]
    rc_internal_U03 = DNA_U03[(DNA_U03["GREGoR_CENTER"] == rc) & (DNA_U03["EXPERIMENT_LOC"] == "Internal")]
    rc_internal_U04 = DNA_U04[(DNA_U04["GREGoR_CENTER"] == rc) & (DNA_U04["EXPERIMENT_LOC"] == "Internal")]
    rc_internal_U05 = DNA[(DNA["GREGoR_CENTER"] == rc) & (DNA["EXPERIMENT_LOC"] == "Internal")]
   
    frames = [rc_internal_U02, rc_internal_U03, rc_internal_U04, rc_internal_U05]
    rc_df = pd.concat(frames, axis = 0)
    rc_df = rc_df[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED', 'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']]
    rc_df.columns = ['PREPARED', 'SEQUENCED', 'UPLOADED']
    # NOTE(review): assigning four cycle labels assumes exactly one matching
    # row per cycle; any other row count raises a length-mismatch error here.
    rc_df.index = ['U02', 'U03', 'U04', 'U05']
    #print(rc_df)
    # Drawn on the subplot created above, which is the current axes.
    sns.lineplot(data = rc_df, markers=True, dashes= False, marker='o',  markersize=10, palette = 'Paired', legend = 'auto')
    plt.title(rc)

#plt.legend(facecolor = "white", loc = 'lower right', bbox_to_anchor=(0.5, 0.6, 1.1, 0.6), 
           #fontsize = 'large', shadow = True)

#plt.show()

__Figure 4.__ Summary of quarterly reports completed by the GREGoR RCs to track internally processed short-read DNA experiments. Line plots show samples prepared, samples that completed sequencing and samples uploaded to AnVIL over different upload cycles.

__Table 4. Summary of RC quarterly reports for short read DNA experiments in the current upload cycle.__



In [None]:
# Table 4: blank out '-' placeholders on a copy, so the shared DNA sheet that
# the line-plot cell also reads is left untouched.
df = DNA.replace('-', '')
# Label each row with its center and experiment location.
df.index = df[['GREGoR_CENTER', 'EXPERIMENT_LOC']]

df_sub = df[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED', 
                                      'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']]
df_sub

__Table 5. The number of short-read DNA exomes and genomes.__ <br>
Note: These numbers are derived from the experiment_dna_short_read table.

In [None]:
# Table 5: distinct participants per center and short-read DNA experiment type.
experiment_type_by_center = experimentSummary(participant, analyte, experiment_dna_short_read)
experiment_type_by_center.index.name = 'GREGoR_CENTER'
experiment_type_by_center

__Table 6. The number of short-read DNA aligned sequencing files (i.e. BAMS or CRAMs).__ <br>
Note: These numbers are derived from the 'aligned_dna_short_read' data table.

In [None]:
# Table 6: distinct aligned short-read DNA files per center.
df = experimentMerge(participant, analyte, experiment_dna_short_read)
# df's index is matched against the aligned table's experiment id column.
participant_aligned = pd.merge(df, aligned_dna_short_read, left_index = True, right_on='experiment_dna_short_read_id')
aligned_files_by_center = participant_aligned.groupby(by=['gregor_center'])[['aligned_dna_short_read_file']].nunique()
# The total row is the sum of the per-center distinct counts.
aligned_files_by_center.loc["Total"] = aligned_files_by_center.sum()
aligned_files_by_center.columns = ['No. of short read DNA files']
    
formatIndex(aligned_files_by_center)
aligned_files_by_center.columns = aligned_files_by_center.columns.str.upper()
aligned_files_by_center.index.name = 'GREGoR_CENTER'
aligned_files_by_center

### Short-read RNA data

In [None]:
# Figure 5: one panel per listed research center on a 3x2 grid
# (RNA sheets are available from U03 onwards).
plt.figure(figsize=(14, 10))
plt.subplots_adjust(hspace=0.4)

sns.set(style="white", font='sans-serif', font_scale=1.2)


rc_list = ['BCM', 'BROAD', 'CNH_I', 'GSS']

for n, rc in enumerate(rc_list):
    ax = plt.subplot(3, 2, n + 1)
    # This center's rows for internally run experiments, one frame per cycle.
    rc_internal_U03 = RNA_U03[(RNA_U03["GREGoR_CENTER"] == rc) & (RNA_U03["EXPERIMENT_LOC"] == "Internal")]
    rc_internal_U04 = RNA_U04[(RNA_U04["GREGoR_CENTER"] == rc) & (RNA_U04["EXPERIMENT_LOC"] == "Internal")]
    rc_internal_U05 = RNA[(RNA["GREGoR_CENTER"] == rc) & (RNA["EXPERIMENT_LOC"] == "Internal")]
   
    frames = [rc_internal_U03, rc_internal_U04, rc_internal_U05]
    rc_df = pd.concat(frames, axis = 0)
    rc_df = rc_df[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED', 'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']]
    rc_df.columns = ['PREPARED', 'SEQUENCED', 'UPLOADED']
    # NOTE(review): assigning three cycle labels assumes exactly one matching
    # row per cycle; any other row count raises a length-mismatch error here.
    rc_df.index = ['U03', 'U04', 'U05']
    #print(rc_df)
    # Drawn on the subplot created above, which is the current axes.
    sns.lineplot(data = rc_df, markers=True, dashes= False, marker='o',  markersize=10, palette = 'Paired', legend = 'auto')
    plt.title(rc)

#plt.legend(facecolor = "white", loc = 'lower right', bbox_to_anchor=(0.5, 0.6, 1.1, 0.6), 
           #fontsize = 'large', shadow = True)

#plt.show()

__Figure 5.__ Summary of quarterly reports completed by the GREGoR RCs to track short-read RNA experiments. Line plots show samples prepared, samples that completed sequencing and samples uploaded to AnVIL.

_Note: UW-CRDR is not included in Figure 5 because it has contributed only 1 short-read RNA sample (see table below)._

__Table 7. Summary of RC quarterly reports for short-read RNA experiments in the current upload cycle.__

In [None]:
# Table 7: blank out '-' placeholders on a copy, so the shared RNA sheet that
# the line-plot cell also reads is left untouched.
df = RNA.replace('-', '')
# Label each row with its center and experiment location.
df.index = df[['GREGoR_CENTER', 'EXPERIMENT_LOC']]

df_sub = df[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED', 
                                      'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']]
df_sub

__Table 8. The number of short-read RNA sequencing experiments.__ <br>
Note: These numbers are derived from the experiment_rna_short_read table

In [None]:
# Table 8: distinct participants per center and short-read RNA experiment type.
experiment_type_by_center = experimentSummary(participant, analyte, experiment_rna_short_read)
experiment_type_by_center.index.name = 'GREGoR_CENTER'
experiment_type_by_center

__Table 9. The number of aligned short-read RNA sequencing files.__ <br>
Note: These numbers are derived from the 'aligned_rna_short_read' data table.

In [None]:
# Table 9: distinct aligned short-read RNA files per center.
df = experimentMerge(participant, analyte, experiment_rna_short_read)
# df's index is matched against the aligned table's experiment id column.
participant_aligned = pd.merge(df, aligned_rna_short_read, left_index = True, right_on='experiment_rna_short_read_id')
aligned_files_by_center = participant_aligned.groupby(by=['gregor_center'])[['aligned_rna_short_read_file']].nunique()
# The total row is the sum of the per-center distinct counts.
aligned_files_by_center.loc["Total"] = aligned_files_by_center.sum()
aligned_files_by_center.columns = ['No. of short read RNA files']
    
formatIndex(aligned_files_by_center)
aligned_files_by_center.columns = aligned_files_by_center.columns.str.upper()
aligned_files_by_center.index.name = 'GREGoR_CENTER'
aligned_files_by_center

### Long-read nanopore data

In [None]:
# Figure 6: one panel per listed research center on a 3x2 grid
# (nanopore sheets are available from U04 onwards).
plt.figure(figsize=(14, 10))
plt.subplots_adjust(hspace=0.4)

sns.set(style="white", font='sans-serif', font_scale=1.2)


rc_list = ['BCM', 'UW_CRDR']

for n, rc in enumerate(rc_list):
    ax = plt.subplot(3, 2, n+1)
    # This center's rows for internally run experiments, one frame per cycle.
    rc_internal_U04 = NANOPORE_U04[(NANOPORE_U04["GREGoR_CENTER"] == rc) & (NANOPORE_U04["EXPERIMENT_LOC"] == "Internal")]
    rc_internal_U05 = NANOPORE[(NANOPORE["GREGoR_CENTER"] == rc) & (NANOPORE["EXPERIMENT_LOC"] == "Internal")]
   
    frames = [rc_internal_U04, rc_internal_U05]
    rc_df = pd.concat(frames, axis = 0)
    rc_df = rc_df[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED', 'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']]
    rc_df.columns = ['PREPARED', 'SEQUENCED', 'UPLOADED']
    # Plot only when there is exactly one row per cycle; otherwise the panel
    # is silently left empty and untitled.
    if len(rc_df) != 2:
        pass
    else: 
        rc_df.index = ['U04', 'U05']
        sns.lineplot(data = rc_df, markers=True, dashes= False, marker='o',  markersize=10, palette = 'Paired', legend = 'auto')
        plt.title(rc)

#plt.legend(facecolor = "white", loc = 'lower right', bbox_to_anchor=(0.5, 0.6, 1.1, 0.6), 
           #fontsize = 'large', shadow = True)

#plt.show()

__Figure 6.__ Summary of quarterly reports completed by the GREGoR RCs to track internally processed long-read DNA  (nanopore) experiments. Line plots show samples prepared, samples that completed sequencing and samples uploaded to AnVIL over different upload cycles.

_Note: Broad, CNH-I and GSS are not included in Figure 6 because they have not contributed internal nanopore data (see table below)._

__Table 10. Summary of RC quarterly reports for Nanopore experiments in the current upload cycle.__

In [None]:
# Table 10: blank out '-' placeholders on a copy, so the shared NANOPORE sheet
# that the line-plot cell also reads is left untouched.
df = NANOPORE.replace('-', '')
# Label each row with its center and experiment location.
df.index = df[['GREGoR_CENTER', 'EXPERIMENT_LOC']]

df_sub = df[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED', 
                                      'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']]
df_sub

__Table 11. The number of long-read nanopore experiments.__ <br>
Note: These numbers are derived from the experiment_nanopore table

In [None]:
# Table 11: distinct participants per center and nanopore experiment type.
experiment_type_by_center = experimentSummary(participant, analyte, experiment_nanopore)
experiment_type_by_center.index.name = 'GREGoR_CENTER'
experiment_type_by_center

__Table 12. The number of aligned long-read DNA (nanopore) sequencing files.__

In [None]:
# Table 12: distinct aligned nanopore files per center.
df = experimentMerge(participant, analyte, experiment_nanopore)
# df's index is matched against the aligned table's experiment id column.
participant_aligned = pd.merge(df, aligned_nanopore, left_index = True, right_on='experiment_nanopore_id')
aligned_files_by_center = participant_aligned.groupby(by=['gregor_center'])[['aligned_nanopore_file']].nunique()
# The total row is the sum of the per-center distinct counts.
aligned_files_by_center.loc["Total"] = aligned_files_by_center.sum()
aligned_files_by_center.columns = ['No. of long read Nanopore files']
    
formatIndex(aligned_files_by_center)
aligned_files_by_center.columns = aligned_files_by_center.columns.str.upper()
aligned_files_by_center.index.name = 'GREGoR_CENTER'
aligned_files_by_center

### Long-read pacbio data

_Note: PacBio data files have not been uploaded to AnVIL_

__Table 13.__ Summary of RC quarterly reports for PacBio experiments in the current upload cycle.

In [None]:
# Table 13: blank out '-' placeholders on a copy, so the shared PACBIO sheet
# is left untouched.
df = PACBIO.replace('-', '')
# Label each row with its center and experiment location.
df.index = df[['GREGoR_CENTER', 'EXPERIMENT_LOC']]

df_sub = df[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED', 
                                      'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']]
df_sub

### Samples with multiple data types

In [None]:
# Participant -> analyte -> experiment tables, one per data type.
df_dna = experimentMerge(participant, analyte, experiment_dna_short_read)
df_rna = experimentMerge(participant, analyte, experiment_rna_short_read)
df_nanopore = experimentMerge(participant, analyte, experiment_nanopore)

In [None]:
# Participant IDs with at least one experiment of each type:
# set1 = short-read DNA, set2 = short-read RNA, set3 = nanopore.
set1 = set(df_dna['participant_id'])
set2 = set(df_rna['participant_id'])
set3 = set(df_nanopore['participant_id'])

In [None]:
# Participants with short-read DNA only.
s1_only = set1.difference(set2).difference(set3)
#len(s1_only)

In [None]:
# Participants with short-read RNA only.
s2_only = set2.difference(set1).difference(set3)
#len(s2_only)

In [None]:
# Participants with nanopore only.
s3_only = set3.difference(set1).difference(set2)
#len(s3_only)

In [None]:
# Participants with both short-read DNA and short-read RNA
# (this includes those that also have nanopore data).
s1s2 = set1.intersection(set2)
#len(s1s2)

In [None]:
plt.figure(figsize=(5,5))

# Participant count of every Venn region, computed from the ID sets so the
# labels always match the loaded data. Region ids are three flags in the
# order (dna_short_read, rna_short_read, dna_nanopore): '110' means in the
# first two sets but not the third.
region_counts = {
    '100': len(set1 - set2 - set3),
    '010': len(set2 - set1 - set3),
    '110': len((set1 & set2) - set3),
    '001': len(set3 - set1 - set2),
    '101': len((set1 & set3) - set2),
    '011': len((set2 & set3) - set1),
    '111': len(set1 & set2 & set3),
}

# Per-region (alpha, colour) styling.
region_style = {
    '100': (0.8, '#076839'),
    '110': (0.75, '#0db53f'),
    '010': (0.5, '#076839'),
    '111': (0.1, '#0db53f'),
    '001': (0.2, '#076839'),
    '011': (0.3, '#0db53f'),
    '101': (0.3, '#076839'),
}

# The subset sizes passed to venn3 only fix the layout (fixed placeholder
# sizes keep every region large enough to hold a label); the displayed
# numbers are written into the labels below.
v = venn3(subsets=(5, 2, 1, 1, 1, 1, 1), set_labels = ('dna_short_read', 'rna_short_read', 'dna_nanopore'))
for region_id, (region_alpha, region_colour) in region_style.items():
    region_patch = v.get_patch_by_id(region_id)
    region_patch.set_alpha(region_alpha)
    region_patch.set_color(region_colour)
    v.get_label_by_id(region_id).set_text(str(region_counts[region_id]))

c = venn3_circles(subsets=(5, 2, 1, 1, 1, 1, 1), linestyle='dotted', linewidth=2.0)


plt.show()

__Figure 7.__ Venn diagram showing samples with multiomic data in the GREGoR Combined Consortium Dataset

## Phenotype Summary

The section below summarizes phenotype information in the participant and phenotype data tables hosted on AnVIL.

__Table 14. Summary of 'affected status' in the GREGoR Combined Consortium Dataset__

In [None]:
# Table 14: participant counts per affected_status value, plus a total row.
affected_data = summarizeData(participant, 'affected_status')
affected_data.loc['TOTAL'] = affected_data.sum()
affected_data

In [None]:
# bar chart for phenotypes by family
participant_terms = pd.merge(participant,phenotype, left_index = True, right_on = 'participant_id')
term_count = pd.DataFrame(participant_terms.groupby('term_id', dropna=False)['family_id'].count())
term_count.sort_values(by = 'family_id', ascending = False, inplace = True)

In [None]:
# HPO id -> name lookup taken from the tab-separated phenotype_to_genes.txt
# file; each distinct (hpo_id, hpo_name) pair is kept once.
hpo_info = pd.read_csv('phenotype_to_genes.txt', sep="\t")
hpo_info_sub = hpo_info[['hpo_id', 'hpo_name']].drop_duplicates(keep='first')

In [None]:
# Attach HPO names: term_count's index (term_id) is matched against hpo_id.
# With the default merge type, terms without a match in the HPO file are dropped.
term_count_name = term_count.merge(hpo_info_sub, left_index = True, right_on = 'hpo_id')

In [None]:
# Positional rename - assumes the merged columns are, in order,
# family_id, hpo_id, hpo_name (TODO confirm).
term_count_name.columns = ['family_count', 'hpo_id', 'hpo_name']

In [None]:
def datalabel_hbar(ax, fontsize=12):
    """Write each horizontal bar's length just past its right-hand end.

    Every ``mpatches.Rectangle`` child of ``ax`` whose width is greater than
    1 gets a label showing that width with no decimals; all other children
    are left unlabelled.
    """
    for patch in ax.get_children():
        if not isinstance(patch, mpatches.Rectangle):
            continue
        bar_length = patch.get_width()
        if not bar_length > 1:
            continue
        mid_height = patch.get_y() + patch.get_height() / 2
        ax.annotate(
            f"{bar_length:.0f}",
            xy=(bar_length, mid_height),
            xytext=(5, -1),  # 5 points offset
            textcoords="offset points",
            ha="left",
            va="center",
            fontsize=fontsize,
        )

In [None]:
# Figure 8: horizontal bars of family_count per HPO name.
sns.set_theme(style="whitegrid")
sns.set_color_codes("bright")
plt.figure(figsize=(15, 12))

# Keep only terms whose family_count is at least 30.
data = term_count_name[term_count_name['family_count'] >= 30]
# NOTE(review): the head() result is discarded (not the cell's last expression).
data.head()
ax = sns.barplot(x="family_count", y="hpo_name", data = data, color='#388660')
# Print each bar's value next to it.
datalabel_hbar(ax)
plt.show()

__Figure 8.__ The number of families per HPO name. Figure is sorted in descending order and only shows HPO names with a family count >= 30. 

## Data completeness summary

The section below provides a summary of participants with sequencing files and phenotype data

In [None]:
# do relevant subsetting
df = experimentMerge(participant, analyte, experiment_dna_short_read)
participant = readDatatable('participant', project, workspace)
participant_aligned = pd.merge(df, aligned_dna_short_read, left_index = True, right_on='experiment_dna_short_read_id')

In [None]:
# Split participants into four non-overlapping groups:
# probands (proband_relationship == 'Self'), and for everyone else
# affected / unaffected / any other affected_status value.
probands = participant[participant['proband_relationship'] == 'Self']
other_affecteds = participant[(participant['proband_relationship'] != 'Self') & (participant['affected_status'] == 'Affected')] # other affecteds that are not probands
unaffecteds = participant[(participant['proband_relationship'] != 'Self') & (participant['affected_status'] == 'Unaffected')]
unknown = participant[(participant['proband_relationship'] != 'Self') & (participant['affected_status'] != 'Unaffected') & (participant['affected_status'] != 'Affected')]

In [None]:
# Each group restricted to participants that have at least one phenotype row
# (phenotypeMerge drops participants without a match).
proband_terms = phenotypeMerge(probands, phenotype)
other_affected_terms = phenotypeMerge(other_affecteds, phenotype)
unaffected_terms = phenotypeMerge(unaffecteds, phenotype)
unknown_terms = phenotypeMerge(unknown, phenotype)

In [None]:
# Aligned short-read DNA rows per participant group (phenotype terms not required).
seq_proband = pd.merge(probands, participant_aligned, left_index = True, right_on = 'participant_id') # probands
seq_affected = pd.merge(other_affecteds, participant_aligned, left_index = True, right_on = 'participant_id') # other affected relatives
seq_unaffected = pd.merge(unaffecteds, participant_aligned, left_index = True, right_on = 'participant_id') 
seq_unknown = pd.merge(unknown, participant_aligned, left_index = True, right_on = 'participant_id')

In [None]:
# Aligned short-read DNA rows per group, limited to participants with phenotype terms.
seq_proband_terms = pd.merge(proband_terms, participant_aligned, left_index = True, right_on = 'participant_id') 
seq_affected_terms = pd.merge(other_affected_terms, participant_aligned, left_index = True, right_on = 'participant_id') 
seq_unaffected_terms = pd.merge(unaffected_terms, participant_aligned, left_index = True, right_on = 'participant_id')
seq_unknown_terms = pd.merge(unknown_terms, participant_aligned, left_index = True, right_on = 'participant_id')

In [None]:
# Same breakdown for short-read RNA: one row per aligned RNA record.
df_rna = experimentMerge(participant, analyte, experiment_rna_short_read)
participant_aligned_rna = pd.merge(df_rna, aligned_rna_short_read, left_index = True, right_on='experiment_rna_short_read_id')

# Aligned RNA rows per participant group (phenotype terms not required).
seq_proband_rna = pd.merge(probands, participant_aligned_rna, left_index = True, right_on = 'participant_id') # probands
seq_affected_rna = pd.merge(other_affecteds, participant_aligned_rna, left_index = True, right_on = 'participant_id') # other affected relatives
seq_unaffected_rna = pd.merge(unaffecteds, participant_aligned_rna, left_index = True, right_on = 'participant_id') 
seq_unknown_rna = pd.merge(unknown, participant_aligned_rna, left_index = True, right_on = 'participant_id') 

# Aligned RNA rows per group, limited to participants with phenotype terms.
seq_proband_terms_rna = pd.merge(proband_terms, participant_aligned_rna, left_index = True, right_on = 'participant_id') 
seq_affected_terms_rna = pd.merge(other_affected_terms, participant_aligned_rna, left_index = True, right_on = 'participant_id') 
seq_unaffected_terms_rna = pd.merge(unaffected_terms, participant_aligned_rna, left_index = True, right_on = 'participant_id')
seq_unknown_terms_rna = pd.merge(unknown_terms, participant_aligned_rna, left_index = True, right_on = 'participant_id')

In [None]:
# Same breakdown for nanopore: one row per aligned nanopore record.
df_nanopore = experimentMerge(participant, analyte, experiment_nanopore)
participant_aligned_nanopore = pd.merge(df_nanopore, aligned_nanopore, left_index = True, right_on='experiment_nanopore_id')

# Aligned nanopore rows per participant group (phenotype terms not required).
seq_proband_nanopore = pd.merge(probands, participant_aligned_nanopore, left_index = True, right_on = 'participant_id') # probands
seq_affected_nanopore = pd.merge(other_affecteds, participant_aligned_nanopore, left_index = True, right_on = 'participant_id') # other affected relatives
seq_unaffected_nanopore = pd.merge(unaffecteds, participant_aligned_nanopore, left_index = True, right_on = 'participant_id') 
seq_unknown_nanopore = pd.merge(unknown, participant_aligned_nanopore, left_index = True, right_on = 'participant_id') 

# Aligned nanopore rows per group, limited to participants with phenotype terms.
seq_proband_terms_nanopore = pd.merge(proband_terms, participant_aligned_nanopore, left_index = True, right_on = 'participant_id') 
seq_affected_terms_nanopore = pd.merge(other_affected_terms, participant_aligned_nanopore, left_index = True, right_on = 'participant_id') 
seq_unaffected_terms_nanopore = pd.merge(unaffected_terms, participant_aligned_nanopore, left_index = True, right_on = 'participant_id')
seq_unknown_terms_nanopore = pd.merge(unknown_terms, participant_aligned_nanopore, left_index = True, right_on = 'participant_id')

In [None]:
# Table 15 source: every list is ordered probands, other affected, unaffected,
# unknown. 'TOTAL' counts participants; the other columns count merged
# aligned-file rows. Each *_WITH_PHENOTYPE column uses the '_terms' variant of
# the frames in the column before it.
datacomp_series = { 'TOTAL': [len(probands), len(other_affecteds), len(unaffecteds), len(unknown)], 
                'SR_DNA' : [len(seq_proband), len(seq_affected), len(seq_unaffected), len(seq_unknown)],
                'SR_DNA_WITH_PHENOTYPE' : [len(seq_proband_terms), len(seq_affected_terms), 
                                              len(seq_unaffected_terms), len(seq_unknown_terms)], 
                'SR_RNA' : [len(seq_proband_rna), len(seq_affected_rna), len(seq_unaffected_rna), len(seq_unknown_rna)],
                'SR_RNA_WITH_PHENOTYPE' : [len(seq_proband_terms_rna), len(seq_affected_terms_rna), 
                                              len(seq_unaffected_terms_rna), len(seq_unknown_terms_rna)], 
                'NANOPORE' : [len(seq_proband_nanopore), len(seq_affected_nanopore), len(seq_unaffected_nanopore), len(seq_unknown_nanopore)],
                'NANOPORE_WITH_PHENOTYPE' : [len(seq_proband_terms_nanopore), len(seq_affected_terms_nanopore), 
                                              len(seq_unaffected_terms_nanopore), len(seq_unknown_terms_nanopore)]
               }

__Table 15. The number of aligned files with phenotype terms for probands, affected and unaffected relatives__

In [None]:
# Table 15: rows are participant groups, columns are data types; the final
# row holds the column sums.
datacomp_df = pd.DataFrame(datacomp_series)
datacomp_df.index = ['PROBANDS', 'OTHER_AFFECTED', 'UNAFFECTED', 'UNKNOWN']
datacomp_df.loc['TOTAL'] = datacomp_df.sum()

In [None]:
# Display Table 15.
datacomp_df

## Summary of the genetic findings table

_Note: All genetic findings entries in U05 are for single nucleotide variants (SNVs) or short insertions and deletions (indels)._

__Table 16. Summary of the "gene known for phenotype" column in the genetic findings table__

In [None]:
# Table 16: genetic_findings rows per gene_known_for_phenotype value, plus a
# total row. The count column keeps summarizeData's 'NO. OF PARTICIPANTS'
# name although each counted row is a genetic_findings entry.
genetic_summary = summarizeData(genetic_findings, 'gene_known_for_phenotype')
genetic_summary.loc['TOTAL'] = genetic_summary.sum()
genetic_summary

__Table 17. Summary of the "GREGoR_variant_classification" column in the genetic findings table__

In [None]:
# Table 17: genetic_findings rows per GREGoR_variant_classification value,
# plus a total row.
variant_class = summarizeData(genetic_findings, 'GREGoR_variant_classification')
variant_class.loc['TOTAL'] = variant_class.sum()

In [None]:
# Display Table 17.
variant_class