# GREGoR Data Tracking and Reporting for U03
author: DCC <br>
last edited: 08/09/2023

In [None]:
# install modules
#%pip install nbconvert
#%pip install terra-pandas
#%pip install jupyter_contrib_nbextensions

In [None]:
# import modules
import os
import io
import pandas as pd
import terra_pandas as tp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from matplotlib_venn import venn2,venn2_circles
import seaborn as sns
from functools import reduce
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
#from IPython.display import Markdown as md

In [None]:
# Terra exposes the workspace context through environment variables.
project = os.environ['WORKSPACE_NAMESPACE']
workspace = os.environ['WORKSPACE_NAME']
bucket = os.environ['WORKSPACE_BUCKET'] + "/" 

# Record the run context at the top of the rendered report.
print(
    "Current GREGoR upload cycle: U03",
    "Terra Billing project: " + project,
    "Workspace: " + workspace,
    "Workspace storage bucket: " + bucket,
    sep="\n",
)

## Overview

The GREGoR Data Tracking Report provides summaries of data contributed to the GREGoR Consortium by member Research Centers (RCs). Information regarding data generation and data deposited to AnVIL is derived from the RC quarterly reports and the GREGoR Combined Consortium Workspace. Graphical and tabular summaries of participant, family, experiment, and phenotype information are generated from the data provided by the RCs, as specified in the GREGoR data model.

In [None]:
# function(s) for reading in RC reporting

def readGoogleSheet(url):
    """Read a Google Sheet tab into a DataFrame through its CSV export link.

    The ``/edit#gid=`` part of the sharing URL is rewritten to
    ``/export?format=csv&gid=`` and the result is passed to ``pd.read_csv``.
    A ``url`` that does not contain ``/edit#gid=`` is read unchanged.
    """
    return pd.read_csv(url.replace('/edit#gid=', '/export?format=csv&gid='))

# function(s) for AnVIL data tables
def readDatatable(data_table, project, workspace): 
    """Return the named workspace data table as a DataFrame.

    Thin wrapper around ``tp.table_to_dataframe``; ``project`` is passed as
    the workspace namespace and ``workspace`` as the workspace name.
    """
    return tp.table_to_dataframe(
        data_table,
        workspace_namespace=project,
        workspace=workspace,
    )

def formatIndex(df):
    """Flatten and upper-case the row labels of ``df`` in place.

    Tuple labels (such as those produced by a multi-key ``groupby``) are
    collapsed into one string by joining their parts with ``'_'``; every other
    label is kept as is. The flattened labels are then upper-cased through the
    index ``.str`` accessor.

    Args:
        df: DataFrame whose index is rewritten.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Each tuple part goes through str() so that group keys which are not
    # strings (e.g. numeric keys) can be joined instead of raising TypeError.
    flat_labels = [
        '_'.join(map(str, label)) if isinstance(label, tuple) else label
        for label in df.index
    ]
    df.index = flat_labels
    df.index = df.index.str.upper()
    return df

def gregorUploadWorkspaces(upload_cycle): 
    """List the per-RC upload workspace names for one upload cycle.

    For each research center, in a fixed order, the HMB workspace name is
    followed by the GRU one, giving ten names in total.

    Args:
        upload_cycle: cycle label inserted into every name, e.g. ``'U03'``.
    """
    centers = ('BCM', 'BROAD', 'CNH_I', 'GSS', 'UW_CRDR')
    consent_codes = ('HMB', 'GRU')
    return [
        'AnVIL_GREGoR_' + center + '_' + upload_cycle + '_' + consent
        for center in centers
        for consent in consent_codes
    ]

def summarizeData(df, column):
    """Count rows of ``df`` per value of ``column``.

    Missing values of ``column`` form their own group (``dropna=False``).
    The result has one column, 'NO. OF PARTICIPANTS', and its index is
    passed through ``formatIndex``.

    Note: ``df`` itself is modified -- a 'participant_id' column holding the
    index values is added (or overwritten) on the caller's frame.
    """
    df['participant_id'] = df.index
    per_value = df.groupby(by=[column], dropna=False)['participant_id']
    counts = per_value.count().to_frame()
    counts.columns = ['NO. OF PARTICIPANTS']
    formatIndex(counts)
    # Kept for parity with the other summaries; the label is already upper-case.
    counts.columns = counts.columns.str.upper()
    return counts

def participantSummary(df):
    """Count distinct participants and families per center and consent code.

    Rows are grouped by 'gregor_center' and 'consent_code'; a "Total" row
    holding the column sums is appended, and the index is then passed
    through ``formatIndex``. The columns are returned as 'PARTICIPANTS' and
    'FAMILIES'.

    Note: ``df`` itself is modified -- a 'participant_id' column holding the
    index values is added (or overwritten) on the caller's frame.
    """
    df['participant_id'] = df.index
    by_center_and_consent = df.groupby(by=['gregor_center', 'consent_code'], as_index=True)
    summary = by_center_and_consent[['participant_id', 'family_id']].nunique()
    summary.loc["Total"] = summary.sum()
    summary.columns = ['participants', 'families']
    formatIndex(summary)
    summary.columns = summary.columns.str.upper()
    return summary

def experimentMerge(participant_df, analyte_df, experiment_df):
    """Join participants to their analytes and then to their experiments.

    ``participant_df``'s index is matched against ``analyte_df['participant_id']``;
    the index of that intermediate result is then matched against
    ``experiment_df['analyte_id']``. Both joins use the default (inner) merge.
    """
    with_analytes = participant_df.merge(
        analyte_df, left_index=True, right_on='participant_id'
    )
    return with_analytes.merge(
        experiment_df, left_index=True, right_on='analyte_id'
    )

def experimentSummary(participant_df, analyte_df, experiment_df):
    """Tabulate distinct participants per center and experiment type.

    The three tables are joined with ``experimentMerge``; the result has one
    row per 'gregor_center', one column per 'experiment_type' (zero where a
    center has none), a "Total" row of column sums, and upper-cased labels.
    """
    merged = experimentMerge(participant_df, analyte_df, experiment_df)
    participants_per_type = merged.groupby(
        by=['gregor_center', 'experiment_type']
    )['participant_id'].nunique()
    experiment_type = participants_per_type.unstack().fillna(0).astype(int)
    experiment_type.loc["Total"] = experiment_type.sum()
    formatIndex(experiment_type)
    experiment_type.columns = experiment_type.columns.str.upper()
    # Drop the 'experiment_type' axis label left over from unstack().
    experiment_type.columns.name = None
    return experiment_type

def phenotypeMerge(participant_df, phenotype):
    """Attach each participant's phenotype term count to ``participant_df``.

    Non-null 'term_id' values are counted per 'participant_id' and joined to
    ``participant_df`` on its index (inner join), so participants without any
    row in ``phenotype`` are dropped. The count lands in a 'term_id' column.
    """
    term_counts = phenotype.groupby('participant_id')['term_id'].count().to_frame()
    term_counts.index.name = None
    return participant_df.merge(term_counts, left_index=True, right_index=True)

def combineDataTable(table, upload_workspaces):
    """Stack one data table from several upload workspaces into one frame.

    Each workspace is read from the 'anvil-datastorage' billing project.
    A workspace whose table cannot be read is reported on stdout and skipped.

    Args:
        table: name of the data table to read from every workspace.
        upload_workspaces: iterable of workspace names.

    Returns:
        The row-wise concatenation of all tables that could be read, or an
        empty DataFrame when none could.
    """
    project = 'anvil-datastorage'
    frames = []
    for ws in upload_workspaces:
        # Only the read is guarded, so that errors in the combining step are
        # not misreported as a missing table. Catching Exception is a
        # deliberate best effort: one unreadable workspace must not abort
        # the whole report.
        try:
            df = readDatatable(table, project, ws)
        except Exception:
            print(table + ' is missing in ' + ws)
        else:
            frames.append(df)
    if not frames:
        # pd.concat rejects an empty list.
        return pd.DataFrame([])
    # DataFrame.append no longer exists in pandas 2.x; concatenate once.
    return pd.concat(frames)

In [None]:
# Google Sheets holding the combined RC quarterly reports.
sheet_urls = {
    'U02': "https://docs.google.com/spreadsheets/d/1ndQRPsJW6d8kWIq2j9JHui9-sudrtcyDoUZ_qX821X8/edit#gid=0",
    'U03': "https://docs.google.com/spreadsheets/d/1RPBfSwIP7Ev7gUCE4U7iG4fmIoEqCzUUxChsUNo_Pl4/edit#gid=0",
    'U03_RNAseq': "https://docs.google.com/spreadsheets/d/1Fh81ta_h7NfB_4ibUSqMjzQNvWUxqsuo08IkxyypR00/edit#gid=0",
}

# GREGoR combined RC reporting sheets for U02
summary_report_U2 = readGoogleSheet(sheet_urls['U02'])

# GREGoR combined RC reporting sheets for U03
summary_report_U3 = readGoogleSheet(sheet_urls['U03'])

# GREGoR combined RC reporting sheet for U03 RNAseq
summary_RNAseq_U3 = readGoogleSheet(sheet_urls['U03_RNAseq'])

In [None]:
# read in AnVIL tables from combined consortium U03 workspace
def _load_table(table_name):
    """Read one data table from the current project/workspace."""
    return readDatatable(table_name, project, workspace)


participant = _load_table('participant')
family = _load_table('family')
phenotype = _load_table('phenotype')
analyte = _load_table('analyte')
experiment_dna_short_read = _load_table('experiment_dna_short_read')
experiment_rna_short_read = _load_table('experiment_rna_short_read')
aligned_dna_short_read = _load_table('aligned_dna_short_read')
aligned_rna_short_read = _load_table('aligned_rna_short_read')

__Abbreviations:__<br> 
__RCs:__ BCM = Baylor College of Medicine Research Center; BROAD = Broad Institute; CNH_I = Children's National Hospital/Invitae; GSS = GREGoR Stanford Site; UW_CRDR = University of Washington Center for Rare Disease Research <br>
__Consent codes:__ GRU = General research use and clinical care; HMB = Health/medical/biomedical research and clinical care

## Participants and Families

The section below includes summaries of the participants and families in the GREGoR Combined Consortium Dataset. 

__Table 1. The number of participants and families in the GREGoR Combined Consortium Dataset.__ 

In [None]:
# Table 1: distinct participants and families per center/consent group.
participant_data = participantSummary(participant).rename_axis('GREGoR_CENTER')
participant_data

In [None]:
# One row per family_id; the participant count of a family is its size.
family_size = summarizeData(participant, 'family_id')
family_size.columns = ['FAMILY SIZE']

plt.figure(figsize=(7, 5))
sns.set(style="white", font='sans-serif', font_scale=1.2)

# histplot draws on, and returns, the current axes.
family_size_ax = sns.histplot(
    data=family_size,
    x='FAMILY SIZE',
    color='#076839',
    edgecolor='black',
    discrete=True,
)
family_size_ax.set_ylabel('SAMPLE COUNT')

plt.show()

__Figure 1.__ The distribution of family size in the GREGoR Combined Consortium Dataset.

__Table 2. The number of participants by their relationship to the proband.__ 

In [None]:
# Table 2: participants per proband relationship, with a grand total.
proband_rel_data = summarizeData(participant, 'proband_relationship')
proband_rel_data = proband_rel_data.rename_axis('PROBAND_RELATIONSHIP')
proband_rel_data.loc['TOTAL'] = proband_rel_data.sum()

# Show the proband's own row as PROBAND rather than SELF.
relabelled_index = proband_rel_data.index.str.replace('SELF', 'PROBAND')
proband_rel_data.index = relabelled_index
proband_rel_data

In [None]:
# plot pie chart
# Split the relationship counts at a threshold of 7 participants: larger
# categories get their own slice, smaller ones are pooled into OTHER.
proband_rel_data = summarizeData(participant, 'proband_relationship')
df = proband_rel_data.loc[proband_rel_data['NO. OF PARTICIPANTS'] > 7]
sub = proband_rel_data.loc[proband_rel_data['NO. OF PARTICIPANTS'] <= 7]
sub_sum = sub['NO. OF PARTICIPANTS'].sum()

# Current count of the OTHER category among the large categories.
s = df[df.index == 'OTHER']
s = s['NO. OF PARTICIPANTS']
# The Series is indexed by relationship label, so take the first element by
# position explicitly; s[0] relies on a deprecated integer-key fallback.
val = s.iloc[0]
new_val = sub_sum + val

In [None]:
# Replace 'SELF' with 'PROBAND' in the row labels of the pie-chart data.
df.index = df.index.str.replace('SELF', 'PROBAND')

In [None]:
# Fold the pooled small categories into the OTHER slice. Only the OTHER row
# is updated: replacing by value would also rewrite any other category that
# happens to have the same count as OTHER.
df = df.copy()
df.loc['OTHER', 'NO. OF PARTICIPANTS'] = new_val
labels = df.index.str.lower()

color = ['#076839', '#388660', '#519574', '#6aa488', '#9bc2af']

plt.figure(figsize=(7,5))
plt.pie(df['NO. OF PARTICIPANTS'], labels = labels, colors = color, autopct='%.2f', labeldistance=1.25, 
        radius=1.25, 
        textprops={'fontsize': 15, 'fontname' : 'serif', 'ha' : 'center' , 'color' : 'black'},
        wedgeprops={ 'linewidth' : 1.5, 'edgecolor' : "white" })
plt.tight_layout()
# Render the figure; plt.plot() with no data only returns an empty list.
plt.show()

__Figure 2.__ Pie chart showing the percentage of proband relationships in the GREGoR Combined Consortium Dataset. _Note: proband relationship categories were collapsed into 'other' if the count <= 7._   

__Table 3. The number of female and male participants in the GREGoR Combined Consortium Dataset.__

In [None]:
# Table 3: participants per reported sex.
sex = summarizeData(participant, 'sex')
sex_counts = sex['NO. OF PARTICIPANTS']
# Share of all participants, stored as a fraction rounded to 3 decimals.
sex['%'] = (sex_counts / sex_counts.sum()).round(3)
sex

## Sample Processing and Experiment Summaries 

The section below includes RC quarterly reports summaries and experimental summaries derived from the Combined Consortium data tables hosted on AnVIL.  

In [None]:
# Tag every report row with the upload cycle its sheet belongs to.
for cycle_label, cycle_report in (('U02', summary_report_U2), ('U03', summary_report_U3)):
    cycle_report['UPLOAD_CYCLE'] = cycle_label


In [None]:
# Stack the U02 and U03 reports row-wise; the original row labels are kept.
frames = [summary_report_U2, summary_report_U3]
df = pd.concat(frames, axis=0)

In [None]:
# Figure 3: one panel per Research Center with overlaid bars for the samples
# prepared, sequenced and uploaded to AnVIL (short read DNA reports).
plt.figure(figsize=(14, 10))
plt.subplots_adjust(hspace=0.5)

sns.set(style="white", font='sans-serif', font_scale=1.2)

rc_list = ['BCM', 'BROAD', 'CNH_I', 'GSS', 'UW_CRDR']

# Rows reported as internally processed. The filter does not depend on the
# RC, so it is computed once instead of on every iteration.
df_internal = df[df['EXPERIMENT_LOC'] == 'Internal']

for n, rc in enumerate(rc_list):
    # add a new subplot iteratively
    ax = plt.subplot(3, 2, n + 1)

    single_rc_internal = df_internal[df_internal['GREGoR_CENTER'] == rc]
    single_rc_all = df[df['GREGoR_CENTER'] == rc]

    # Explicit copies: these frames are relabelled and extended below.
    single_rc_internal_sub = single_rc_internal[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED',
                                                 'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']].copy()
    single_rc_all_sub = single_rc_all[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED',
                                       'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']].copy()

    single_rc_internal_sub.columns = ['PREPARED', 'SEQUENCED', 'UPLOADED TO AnVIL']
    single_rc_all_sub.columns = ['PREPARED', 'SEQUENCED', 'UPLOADED TO AnVIL']

    # The 'SUM' row is built from the third column ('UPLOADED TO AnVIL') of
    # specific row positions, chosen by how many rows this RC has.
    # NOTE(review): which rows those positions pick depends on the row order
    # of the reporting sheets -- confirm against the sheets.
    if len(single_rc_all_sub) == 2: 
        single_rc_all_sub.loc['SUM'] =  single_rc_all_sub.iloc[[1],[2]].sum()
    elif len(single_rc_all_sub) == 3: 
        single_rc_all_sub.loc['SUM'] =  single_rc_all_sub.iloc[[1,2],[2]].sum()
    elif len(single_rc_all_sub) == 4: 
        single_rc_all_sub.loc['SUM'] =  single_rc_all_sub.iloc[[2,3],[2]].sum()
    else: 
        # NOTE(review): no 'SUM' row is created on this path, so the
        # .loc['SUM'] lookup below raises KeyError.
        print('check RC df for mistakes!!!')

    # Requires exactly two internal rows for this RC.
    single_rc_internal_sub.index = ['U02_internal', 'U03_internal']
    single_rc_all_sub_sum = pd.DataFrame(single_rc_all_sub.loc['SUM'])
    single_rc_all_sub_sum.columns = ['External']

    single_rc_internal_sub_t = single_rc_internal_sub.transpose()

    # All three series are drawn on the same axes, one over the other.
    sns.barplot(ax = ax, data = single_rc_all_sub_sum, x = single_rc_all_sub_sum.index, y = "External", estimator=sum,  color='#bc5c45', edgecolor = 'black')

    sns.barplot(ax = ax, data = single_rc_internal_sub_t, x =  single_rc_internal_sub_t.index, y = "U03_internal", color = "#0db53f", edgecolor = 'black')
    sns.barplot(ax = ax, data = single_rc_internal_sub_t, x = single_rc_internal_sub_t.index, y = "U02_internal", estimator=sum,  color='#076839', edgecolor = 'black')

    ax.set_title(rc)
    # Clear the x-axis label on this panel. Assigning to plt.xlabel would
    # overwrite the pyplot function for the rest of the session.
    ax.set_xlabel('')
    ax.set_ylabel('NO. OF SAMPLES') 


# legend
top_bar = mpatches.Patch(color='#0db53f', label='U03_internal')
med_bar = mpatches.Patch(color='#076839', label='U02_internal')
bottom_bar = mpatches.Patch(color='#bc5c45', label='All_External')

plt.legend(handles=[top_bar, med_bar, bottom_bar], facecolor = "lightgray", loc = 'lower right', 
           bbox_to_anchor=(0.5, 0.4, 1.5, 0.6), fontsize = 'large', shadow = True)
plt.show()

__Figure 3.__ Summary of quarterly reports completed by the GREGoR RCs to track internally and externally processed short read DNA experiments. Stacked bar charts show the number of samples prepared, the number of samples that completed sequencing and the number of samples uploaded to the AnVIL platform. Each stacked bar chart shows RC sample summaries for internally processed samples in U02 (dark green), U03 (light green) and cumulative sample numbers for externally processed samples (brown).  

__Table 4. Summary of RC quarterly reports for short read DNA experiments in the current upload cycle.__

In [None]:
# Table 4: U03 sample totals per center and processing location.
u3_report_columns = [
    'GREGoR_CENTER',
    'EXPERIMENT_LOC',
    'TOTAL_SAMPLES_PREPARED',
    'TOTAL_SAMPLES_SEQUENCING_COMPLETED',
    'TOTAL_SAMPLES_UPLOADED_TO_ANVIL',
]
summary_report_U3_sub = summary_report_U3[u3_report_columns]

u3_by_center_and_loc = summary_report_U3_sub.groupby(['GREGoR_CENTER', 'EXPERIMENT_LOC'])
summary_report_U3_sub_by_center = u3_by_center_and_loc.sum()

# Append the column totals as a final row.
summary_report_U3_sub_by_center.loc['TOTAL'] = summary_report_U3_sub_by_center.sum()
summary_report_U3_sub_by_center.index.name = '(GREGoR_CENTER,  Internal/External)'
summary_report_U3_sub_by_center

In [None]:
# Figure 4: one panel per Research Center with overlaid bars for the samples
# prepared, sequenced and uploaded to AnVIL (short read RNA reports).
plt.figure(figsize=(14, 10))
plt.subplots_adjust(hspace=0.5)

df = summary_RNAseq_U3

sns.set(style="white", font='sans-serif', font_scale=1.2)

rc_list = ['BROAD', 'CNH_I', 'GSS']

# Rows reported as internally processed. The filter does not depend on the
# RC, so it is computed once instead of on every iteration.
df_internal = df[df['EXPERIMENT_LOC'] == 'Internal']

for n, rc in enumerate(rc_list):
    # add a new subplot iteratively
    ax = plt.subplot(3, 2, n + 1)

    single_rc_internal = df_internal[df_internal['GREGoR_CENTER'] == rc]
    single_rc_all = df[df['GREGoR_CENTER'] == rc]

    # Explicit copies: these frames are relabelled and extended below.
    single_rc_internal_sub = single_rc_internal[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED',
                                                 'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']].copy()
    single_rc_all_sub = single_rc_all[['TOTAL_SAMPLES_PREPARED', 'TOTAL_SAMPLES_SEQUENCING_COMPLETED',
                                       'TOTAL_SAMPLES_UPLOADED_TO_ANVIL']].copy()

    single_rc_internal_sub.columns = ['PREPARED', 'SEQUENCED', 'UPLOADED TO AnVIL']
    single_rc_all_sub.columns = ['PREPARED', 'SEQUENCED', 'UPLOADED TO AnVIL']
    # 'SUM' holds the column totals over all of this RC's rows.
    single_rc_all_sub.loc['SUM'] = single_rc_all_sub.sum()   

    # Requires exactly one internal row for this RC.
    single_rc_internal_sub.index = ['Internal']
    single_rc_all_sub_sum = pd.DataFrame(single_rc_all_sub.loc['SUM'])
    single_rc_all_sub_sum.columns = ['External']

    single_rc_internal_sub_t = single_rc_internal_sub.transpose()

    # Both series are drawn on the same axes, one over the other.
    sns.barplot(ax = ax, data = single_rc_all_sub_sum, x = single_rc_all_sub_sum.index, y = "External", estimator=sum,  color='#bc5c45', edgecolor = 'black')

    sns.barplot(ax = ax, data = single_rc_internal_sub_t, x =  single_rc_internal_sub_t.index, y = "Internal", color = "#076839", edgecolor = 'black')
    ax.set_title(rc)
    # Clear the x-axis label on this panel. Assigning to plt.xlabel would
    # overwrite the pyplot function for the rest of the session.
    ax.set_xlabel('')
    ax.set_ylabel('NO. OF SAMPLES') 


# legend
top_bar = mpatches.Patch(color='#076839', label='Internal')
bottom_bar = mpatches.Patch(color='#bc5c45', label='External')

plt.legend(handles=[top_bar, bottom_bar], facecolor = "lightgray", loc = 'lower right', 
           bbox_to_anchor=(0.5, 0.4, 1.5, 0.6), fontsize = 'large', shadow = True)
plt.show()

__Figure 4.__ Summary of quarterly reports completed by the GREGoR RCs to track internally and externally processed short read RNA experiments. Bar charts show the number of samples prepared, the number of samples that completed sequencing and the number of samples uploaded to the AnVIL platform. Each bar chart shows RC sample summaries for internally processed samples (dark green) and externally processed samples (brown).  

__Table 5. Summary of RC quarterly reports for short read RNA experiments in the current upload cycle.__<br> 

In [None]:
# Table 5: U03 RNAseq sample totals per center and processing location.
rnaseq_report_columns = [
    'GREGoR_CENTER',
    'EXPERIMENT_LOC',
    'TOTAL_SAMPLES_PREPARED',
    'TOTAL_SAMPLES_SEQUENCING_COMPLETED',
    'TOTAL_SAMPLES_UPLOADED_TO_ANVIL',
]
summary_RNAseq_U3_sub = summary_RNAseq_U3[rnaseq_report_columns]

rnaseq_by_center_and_loc = summary_RNAseq_U3_sub.groupby(['GREGoR_CENTER', 'EXPERIMENT_LOC'])
summary_RNAseq_U3_sub_by_center = rnaseq_by_center_and_loc.sum()

# Append the column totals as a final row.
summary_RNAseq_U3_sub_by_center.loc['TOTAL'] = summary_RNAseq_U3_sub_by_center.sum()
summary_RNAseq_U3_sub_by_center.index.name = '(GREGoR_CENTER, Internal/External)'
summary_RNAseq_U3_sub_by_center

__Table 6. The number of exomes and genomes in the GREGoR Combined Consortium Dataset.__ <br>
_Note: These numbers are derived from the experiment_dna_short_read table._ 

In [None]:
# Table 6: distinct participants per center and short read DNA experiment type.
experiment_type_by_center = experimentSummary(
    participant, analyte, experiment_dna_short_read
).rename_axis('GREGoR_CENTER')
experiment_type_by_center

__Table 7. The number of RNAseq experiments in the GREGoR Combined Consortium Dataset.__ <br>
_Note: These numbers are derived from the experiment_rna_short_read table._ 

In [None]:
# Table 7: distinct participants per center and short read RNA experiment type.
experiment_type_by_center = experimentSummary(
    participant, analyte, experiment_rna_short_read
).rename_axis('GREGoR_CENTER')
experiment_type_by_center

__Table 8. The number of participants with multiple experiment types__

In [None]:
# Participant -> analyte -> experiment joins for the DNA and RNA tables.
df_dna, df_rna = (
    experimentMerge(participant, analyte, experiment_table)
    for experiment_table in (experiment_dna_short_read, experiment_rna_short_read)
)

In [None]:
# Classify participants by which short read experiment tables they appear in:
# `_merge` is 'left_only' (DNA only), 'right_only' (RNA only) or 'both'.
df_exp_merge = pd.merge(df_dna, df_rna, on = 'participant_id', how = 'outer', indicator = True)

# The merged frame has one row per DNA/RNA experiment pairing, so a
# participant with several experiments occupies several rows. Keep a single
# row per participant before counting; `_merge` is the same on all of a
# participant's rows, so which row is kept does not matter.
per_participant = df_exp_merge.drop_duplicates(subset='participant_id')
count_df = pd.DataFrame(per_participant['_merge'].value_counts())
count_df.columns = ['NO. OF PARTICIPANTS']
count_df.index = count_df.index.str.replace('left_only', 'DNA_short_read_only')
count_df.index = count_df.index.str.replace('right_only', 'RNA_short_read_only')
count_df.index = count_df.index.str.replace('both', 'DNA and RNA short read')
count_df.index.name = 'EXPERIMENT TYPE'
count_df

In [None]:
# plot venn
# Subset sizes in venn2 order: (DNA only, RNA only, both).
subset_sizes = (2266, 5, 187)

plt.figure(figsize=(7, 5))
venn = venn2(
    subsets=subset_sizes,
    set_labels=('DNA_SHORT_READ', 'RNA_SHORT_READ'),
    set_colors=("#076839", "white"),
    alpha=0.7,
)
venn2_circles(subsets=subset_sizes, linestyle="dashed", linewidth=2)

# Set names use a 15 pt font, the counts inside the circles 18 pt.
for text_group, font_size in ((venn.set_labels, 15), (venn.subset_labels, 18)):
    for text in text_group:
        text.set_fontsize(font_size)

plt.show()

__Figure 5.__ Venn diagram showing the number of participants with only DNA short read experiments (N=2266), the number of participants with only RNA short read experiments (N=5) and the number of participants with both (N=187). 

__Table 9. The number of aligned DNA sequencing files (i.e. BAMs or CRAMs) in the GREGoR Combined Consortium Dataset.__ <br>
_Note: These numbers are derived from the 'aligned_dna_short_read' data table._

In [None]:
# Table 9: distinct aligned short read DNA files per center.
df = experimentMerge(participant, analyte, experiment_dna_short_read)
participant_aligned = df.merge(
    aligned_dna_short_read,
    left_index=True,
    right_on='experiment_dna_short_read_id',
)

files_per_center = participant_aligned.groupby(by=['gregor_center'])
aligned_files_by_center = files_per_center[['aligned_dna_short_read_file']].nunique()
aligned_files_by_center.loc["Total"] = aligned_files_by_center.sum()
aligned_files_by_center.columns = ['No. of short read DNA files']

# Upper-case the row and column labels for display.
formatIndex(aligned_files_by_center)
aligned_files_by_center.columns = aligned_files_by_center.columns.str.upper()
aligned_files_by_center.index.name = 'GREGoR_CENTER'
aligned_files_by_center

__Table 10. The number of aligned RNA sequencing files (i.e. BAMs or CRAMs) in the GREGoR Combined Consortium Dataset.__ <br> 
_Note: These numbers are derived from the 'aligned_rna_short_read' data table._

In [None]:
# Table 10: distinct aligned short read RNA files per center.
df = experimentMerge(participant, analyte, experiment_rna_short_read)
participant_aligned = df.merge(
    aligned_rna_short_read,
    left_index=True,
    right_on='experiment_rna_short_read_id',
)

files_per_center = participant_aligned.groupby(by=['gregor_center'])
aligned_files_by_center = files_per_center[['aligned_rna_short_read_file']].nunique()
aligned_files_by_center.loc["Total"] = aligned_files_by_center.sum()
aligned_files_by_center.columns = ['No. of short read RNA files']

# Upper-case the row and column labels for display.
formatIndex(aligned_files_by_center)
aligned_files_by_center.columns = aligned_files_by_center.columns.str.upper()
aligned_files_by_center.index.name = 'GREGoR_CENTER'
aligned_files_by_center

## Phenotypes in the GREGoR Combined Consortium Dataset

The section below summarizes phenotype information in the participant and phenotype Combined Consortium data tables hosted on AnVIL.  

__Table 11. Summary of 'affected status' in the GREGoR Combined Consortium Dataset__

In [None]:
# Table 11: participants per affected status, with a grand total row.
affected_data = summarizeData(participant, 'affected_status')
affected_total = affected_data.sum()
affected_data.loc['TOTAL'] = affected_total
affected_data

__Phenotype summary__

In [None]:
# First occurrence of every term_id in the phenotype table.
unique_phenotypes = phenotype.drop_duplicates(subset='term_id', keep='first')
# Phenotype rows joined to the participant they belong to.
participant_terms = participant.merge(phenotype, left_index=True, right_on='participant_id')

# The figures quoted in the text below can be reproduced with:
#print('Number of phenotype terms: ' + str(len(phenotype)))
#print('Number of unique phenotype terms: ' + str(len(unique_phenotypes)))
#print('Number of participants with phenotype terms: ' + str(participant_terms['participant_id'].nunique()))


Number of phenotype terms: 5806 <br>
Number of unique phenotype terms: 1457 <br>
Number of participants with phenotype terms: 1274 <br>

__Table 12. The most common phenotype terms in the GREGoR Combined Consortium Dataset__

In [None]:
# Number of phenotype rows per term_id, most frequent first.
term_count = phenotype.groupby('term_id', dropna=False)['participant_id'].count().to_frame()
term_count_sorted = term_count.sort_values('participant_id', ascending=False)
term_count_sorted["term"] = term_count_sorted.index
term_count_sorted = term_count_sorted.reset_index(drop=True)

# Display names for the ten most frequent terms.
# NOTE(review): this list is hard-coded and matched to the term IDs purely by
# position -- confirm it still lines up whenever the phenotype table changes.
term_name = pd.DataFrame([
    'Global developmental delay',
    'Seizure',
    'Hypotonia',
    'Intellectual Disability',
    'Muscle weakness',
    'Morphological central nervous system abnormality',
    'Bicuspid aortic valve',
    'Thoracic aortic aneurysm',
    'Microcephaly',
    'Abnormal cerebral cortex morphology',
])

# Put the names next to the counts (aligned on row position 0-9).
term_count_sorted_top10 = term_count_sorted.iloc[:10]
frames = [term_count_sorted_top10, term_name]
term_count_sorted_top10 = pd.concat(frames, axis=1)
term_count_sorted_top10 = term_count_sorted_top10[['term', 0, 'participant_id']]

term_count_sorted_top10.columns = ['term_id', 'term_name', 'no. of participants']
term_count_sorted_top10.columns = term_count_sorted_top10.columns.str.upper()
term_count_sorted_top10 = term_count_sorted_top10.reset_index(drop=True)

In [None]:
# Display the table as this cell's output.
term_count_sorted_top10

## "Data completeness" in the GREGoR Combined Consortium Dataset

The section below provides a summary of participants with sequencing files and phenotype data.

In [None]:
# do relevant subsetting
# Participants joined through analyte and experiment to their aligned short
# read DNA rows.
df = experimentMerge(participant, analyte, experiment_dna_short_read)
participant_aligned = pd.merge(df, aligned_dna_short_read, left_index = True, right_on='experiment_dna_short_read_id')

# The three participant groups compared in the completeness table.
probands = participant[participant['proband_relationship'] == 'Self']
other_affecteds = participant[(participant['proband_relationship'] != 'Self') & (participant['affected_status'] == 'Affected')] # other affecteds that are not probands
unaffecteds = participant[(participant['proband_relationship'] != 'Self') & (participant['affected_status'] == 'Unaffected')]

# Members of each group that have at least one row in the phenotype table.
proband_terms = phenotypeMerge(probands, phenotype)
other_affected_terms = phenotypeMerge(other_affecteds, phenotype)
unaffected_terms = phenotypeMerge(unaffecteds, phenotype)

# Each group joined to its aligned DNA rows.
seq_proband = pd.merge(probands, participant_aligned, left_index = True, right_on = 'participant_id') # sequenced probands
seq_affected = pd.merge(other_affecteds, participant_aligned, left_index = True, right_on = 'participant_id') # sequenced other affecteds
seq_unaffected = pd.merge(unaffecteds, participant_aligned, left_index = True, right_on = 'participant_id') # sequenced unaffecteds

# Same join, restricted to participants with phenotype terms. All three
# merges key on the participant index, so none of them depends on a
# 'participant_id' column being present on the participant table.
seq_proband_terms = pd.merge(proband_terms, participant_aligned, left_index = True, right_on = 'participant_id') # sequenced probands with phenotype terms
seq_affected_terms = pd.merge(other_affected_terms, participant_aligned, left_index = True, right_on = 'participant_id') # sequenced other affecteds with phenotype terms
seq_unaffected_terms = pd.merge(unaffected_terms, participant_aligned, left_index = True, right_on = 'participant_id') # sequenced unaffecteds with phenotype terms

In [None]:
# Row counts per group, in the order probands, other affecteds, unaffecteds.
datacomp_series = {
    'TOTAL': [len(group) for group in (probands, other_affecteds, unaffecteds)],
    'SEQUENCED': [len(group) for group in (seq_proband, seq_affected, seq_unaffected)],
    'SEQUENCED_WITH_PHENOTYPE': [
        len(group)
        for group in (seq_proband_terms, seq_affected_terms, seq_unaffected_terms)
    ],
}

__Table 13. The number of 'aligned DNA short read files' and phenotype terms for probands, affected and unaffected relatives__



In [None]:
# Table 13: one row per participant group, one column per completeness level.
datacomp_df = pd.DataFrame(
    datacomp_series,
    index=['PROBANDS', 'OTHER_AFFECTED', 'UNAFFECTED'],
)
datacomp_df
