### GREGoR data tracking
__author: DCC__ <br>
__created: 04/17/2023__ <br>

In [None]:
from firecloud import api as fapi
import os
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce

In [None]:
# Workspace coordinates are read from environment variables; a missing variable raises KeyError.
project = os.environ['WORKSPACE_NAMESPACE']
workspace = os.environ['WORKSPACE_NAME']
# Trailing slash so that object names can be appended directly to the bucket URL.
bucket = os.environ['WORKSPACE_BUCKET'] + "/"

# Echo the resolved values so the notebook output records which workspace was summarised.
print("Terra Billing project: ", project, sep="")
print("Workspace: ", workspace, sep="")
print("Workspace storage bucket: ", bucket, sep="")

In [None]:
# functions
def readDatatable(project, workspace, table):
    """Download one workspace data table and return it as a pandas DataFrame.

    The table is requested as TSV through the FireCloud API (using the
    "flexible" entity model) and the response text is parsed as
    tab-separated values.

    Args:
        project: Billing project (namespace) of the workspace.
        workspace: Workspace name.
        table: Name of the data table (entity type) to fetch.

    Returns:
        DataFrame parsed from the TSV response.
    """
    response = fapi.get_entities_tsv(project, workspace, table, model="flexible")
    tsv_buffer = io.StringIO(response.text)
    return pd.read_csv(tsv_buffer, sep='\t')

def convertTuple(tup):
    """Concatenate the string elements of ``tup`` with no separator.

    Args:
        tup: Iterable of strings (typically a tuple).

    Returns:
        The joined string; an empty iterable yields ``''``.
    """
    # Returned directly: the previous local was named ``str`` and hid the builtin.
    return ''.join(tup)

def formatIndex(df):
    """Flatten and upper-case the row index of ``df`` in place.

    Tuple labels (as produced by a MultiIndex) are joined with ``'_'``;
    any other label is kept as is. The resulting labels are then
    upper-cased through the ``.str`` accessor.

    Args:
        df: DataFrame whose index is rewritten. It is modified in place.

    Returns:
        The same ``df`` object, for convenience.
    """
    # isinstance() instead of ``type(x) == tuple``; no local named ``str``,
    # which previously hid the builtin inside this function.
    labels = [
        '_'.join(label) if isinstance(label, tuple) else label
        for label in df.index
    ]
    df.index = labels
    df.index = df.index.str.upper()
    return df

def formatColumns(df):
    """Upper-case the column labels of ``df`` in place and return ``df``."""
    uppercased = df.columns.str.upper()
    df.columns = uppercased
    return df

def participantSummary(participant_data):
    """Count unique participants and families per center and consent code.

    Groups by ``gregor_center`` and ``consent_code``, counts distinct
    ``entity:participant_id`` and ``family_id`` values in each group, and
    appends a "Total" row holding the column sums of those group counts.
    Index and column labels are upper-cased by the formatting helpers.

    Returns:
        A new summary DataFrame; the input frame is not modified.
    """
    group_keys = ['gregor_center', 'consent_code']
    id_columns = ['entity:participant_id', 'family_id']

    summary = participant_data.groupby(by=group_keys, as_index=True)[id_columns].nunique()
    summary.loc["Total"] = summary.sum()
    summary.columns = ['participants', 'families']

    # Both helpers rewrite the labels of ``summary`` in place.
    formatIndex(summary)
    formatColumns(summary)
    return summary

def familySummary(participant_data):
    """Return the family size for every ``family_id``.

    Family size is the number of non-null ``entity:participant_id`` values
    within each family. The result is a one-column DataFrame indexed by
    ``family_id``.
    """
    per_family = participant_data.groupby(by=['family_id'])
    member_counts = per_family['entity:participant_id'].count()
    return member_counts.to_frame()

def probandrelSummary(participant_data):
    """Count participants per ``proband_relationship`` value.

    Rows with a missing relationship are kept as their own group
    (``dropna=False``). In the index, 'Self' is rewritten to 'Proband', the
    labels are upper-cased and the index name is cleared. The single count
    column is upper-cased by ``formatColumns``.
    """
    grouped = participant_data.groupby(by=['proband_relationship'], dropna=False)
    summary = pd.DataFrame(grouped['entity:participant_id'].count())
    summary.columns = ['No. of participants']

    relabelled = summary.index.str.replace('Self', 'Proband')
    summary.index = relabelled.str.upper()
    summary.index.name = None

    formatColumns(summary)
    return summary

def affectedSummary(participant_data):
    """Count participants per ``affected_status`` value.

    Missing statuses form their own group (``dropna=False``). Index and
    column labels are upper-cased by the formatting helpers.
    """
    grouped = participant_data.groupby(by=['affected_status'], dropna=False)
    summary = pd.DataFrame(grouped['entity:participant_id'].count())
    summary.columns = ['No. of participants']

    # Both helpers rewrite the labels of ``summary`` in place.
    formatIndex(summary)
    formatColumns(summary)
    return summary

def aligneddataSummary(participant_data, analyte_data, experiment_data, aligned_data):
    """Join participants through analytes and experiments to aligned files.

    Three successive inner joins are performed:
    participant -> analyte (on participant ID),
    analyte -> experiment (on analyte ID), and
    experiment -> aligned data (on short-read experiment ID).

    Returns:
        One row per matching participant/analyte/experiment/aligned-record
        combination; participants without a full chain are dropped.
    """
    with_analytes = participant_data.merge(
        analyte_data,
        left_on='entity:participant_id',
        right_on='participant_id',
    )
    with_experiments = with_analytes.merge(
        experiment_data,
        left_on='entity:analyte_id',
        right_on='analyte_id',
    )
    return with_experiments.merge(
        aligned_data,
        left_on='entity:experiment_dna_short_read_id',
        right_on='experiment_dna_short_read_id',
    )

def experimentSummary(participant_aligned_data):
    """Count unique aligned files per center and experiment type.

    Groups by ``gregor_center`` and ``experiment_type``, counts distinct
    ``aligned_dna_short_read_file`` values, and appends a "Total" row with
    the sum of the group counts. Labels are upper-cased by the helpers.
    """
    group_keys = ['gregor_center', 'experiment_type']
    summary = participant_aligned_data.groupby(by=group_keys)[['aligned_dna_short_read_file']].nunique()
    summary.columns = ['No. of experiment types']
    summary.loc["Total"] = summary.sum()

    # Both helpers rewrite the labels of ``summary`` in place.
    formatIndex(summary)
    formatColumns(summary)
    return summary

def probandsData(participant_data):
    """Return the rows for affected participants whose relationship is 'Self'."""
    is_affected = participant_data['affected_status'] == 'Affected'
    is_self = participant_data['proband_relationship'] == 'Self'
    return participant_data[is_affected & is_self]

def affectedData(participant_data):
    """Return the rows for affected participants other than the proband ('Self')."""
    is_affected = participant_data['affected_status'] == 'Affected'
    not_self = participant_data['proband_relationship'] != 'Self'
    return participant_data[is_affected & not_self]

def unaffectedData(participant_data):
    """Return the rows whose ``affected_status`` is exactly 'Unaffected'."""
    is_unaffected = participant_data['affected_status'] == 'Unaffected'
    return participant_data[is_unaffected]

def phenotermsSummary(phenotype_data, participant_subset):
    """Attach per-participant phenotype term counts to a participant subset.

    Counts non-null ``term_id`` values per ``participant_id`` and inner-joins
    those counts onto ``participant_subset`` via ``entity:participant_id``.
    Participants without any phenotype row are therefore dropped.
    """
    term_counts = phenotype_data.groupby('participant_id')['term_id'].count().to_frame()
    return pd.merge(
        term_counts,
        participant_subset,
        left_index=True,
        right_on='entity:participant_id',
    )

def getLength(df):
    """Return ``len(df)`` (the number of rows when ``df`` is a DataFrame)."""
    return len(df)

def centerPheno(participant_data, phenotype_data, center):
    """Return the participant/phenotype rows that belong to one center.

    Args:
        participant_data: Participant table with ``entity:participant_id``
            and ``gregor_center`` columns.
        phenotype_data: Phenotype table with a ``participant_id`` column.
        center: Value of ``gregor_center`` to keep.

    Returns:
        Inner join of the two tables on participant ID, restricted to rows
        whose ``gregor_center`` equals ``center``.
    """
    # Use the arguments: the previous body read the notebook-level globals
    # ``participant`` / ``phenotype`` and silently ignored what was passed in.
    participant_terms = participant_data.merge(
        phenotype_data,
        left_on='entity:participant_id',
        right_on='participant_id',
    )
    return participant_terms[participant_terms['gregor_center'] == center]


def overlapPheno(participant_data, phenotype_data, centerA, centerB):
    """Return the phenotype rows of two centers that share a ``term_id``.

    Args:
        participant_data: Participant table with ``entity:participant_id``
            and ``gregor_center`` columns.
        phenotype_data: Phenotype table with ``participant_id`` and
            ``term_id`` columns.
        centerA: First ``gregor_center`` value.
        centerB: Second ``gregor_center`` value.

    Returns:
        Inner join, on ``term_id``, of centerA's rows with centerB's rows.
        Overlapping columns get pandas' default ``_x`` / ``_y`` suffixes and
        a ``_merge`` indicator column is included. Every centerA row is
        paired with every centerB row carrying the same term, so the result
        can be much larger than either input.
    """
    # Use the arguments: the previous body read the notebook-level globals
    # ``participant`` / ``phenotype`` and silently ignored what was passed in.
    participant_terms = participant_data.merge(
        phenotype_data,
        left_on='entity:participant_id',
        right_on='participant_id',
    )
    centerA_terms = participant_terms[participant_terms['gregor_center'] == centerA]
    centerB_terms = participant_terms[participant_terms['gregor_center'] == centerB]
    return centerA_terms.merge(centerB_terms, on='term_id', how="inner", indicator=True)

In [None]:
# Load every data table used below from the workspace, one readDatatable call per
# table, in the order the names are listed.
(participant, family, phenotype, analyte,
 experiment_dna_short_read, aligned_dna_short_read) = (
    readDatatable(project, workspace, table_name)
    for table_name in ('participant', 'family', 'phenotype', 'analyte',
                       'experiment_dna_short_read', 'aligned_dna_short_read')
)

#### Summary of participant and family data tables for the GREGoR combined dataset
##### Summary of participants and families 

In [None]:
# Unique participant and family counts per center / consent code, plus a "Total" row.
participant_data = participantSummary(participant)
# Bare expression: rendered as the cell output.
participant_data

##### Summary of family size

_Distribution of family size in the GREGoR combined dataset_

In [None]:
# Histogram of family size (participants per family_id).
family_size = familySummary(participant)
g=sns.histplot(data=family_size, discrete = True, palette = 'Paired', legend=False)
# NOTE(review): ticks are hard-coded to 1..19; families larger than 19 would get
# no tick label — confirm against the data or derive the range from family_size.
g.set_xticks(range(1,20))
plt.xlabel('family size')
plt.ylabel('count')
plt.show()

##### Summary of participant 'proband_relationship'

In [None]:
# Participant counts per proband_relationship value ('Self' is shown as 'PROBAND').
proband_rel_data = probandrelSummary(participant)
# Bare expression: rendered as the cell output.
proband_rel_data

In [None]:
# Keep only relationship categories with more than 7 participants for the pie chart.
# NOTE(review): the threshold 7 is hard-coded and its rationale is not visible here.
df = proband_rel_data.loc[proband_rel_data['NO. OF PARTICIPANTS'] > 7]
# NOTE(review): this rewrites every cell equal to 22 into 48, i.e. a plotted count is
# edited by hand. It depends on the exact data at the time of writing and will
# silently do nothing (or hit the wrong cell) once the counts change — confirm intent.
df = df.replace(22, 48)

In [None]:
# Pie chart of the filtered proband-relationship counts, labelled in lower case.
labels = df.index.str.lower()
# NOTE(review): the palette is fixed at 4 colours; with more than 4 categories
# matplotlib cycles through them, so wedges would share colours — confirm the
# category count or size the palette from len(df).
color = sns.color_palette("Paired", 4) 
plt.figure(figsize=(5,5))
plt.pie(df['NO. OF PARTICIPANTS'], labels = labels, colors = color, labeldistance=1.25, 
        radius=1.25, 
        textprops={'fontsize': 15, 'fontname' : 'serif', 'ha' : 'center'},
        wedgeprops={ 'linewidth' : 1.5, 'edgecolor' : "white" })
plt.tight_layout()

##### Summary of participant 'affected_status'

In [None]:
# Participant counts per affected_status value (missing statuses form their own row).
affected_data = affectedSummary(participant)
# Bare expression: rendered as the cell output.
affected_data

#### Summary of experiment and aligned sequencing file data tables in the GREGoR combined dataset 

##### Number of aligned sequencing files 

In [None]:
# Join participants through analytes and experiments to their aligned short-read records.
participant_aligned_data = aligneddataSummary(participant, analyte, experiment_dna_short_read, aligned_dna_short_read)
# Unique aligned files per center / consent code, with a "Total" row summing the group counts.
aligned_files_by_center = participant_aligned_data.groupby(by=['gregor_center', 'consent_code'])[['aligned_dna_short_read_file']].nunique()
aligned_files_by_center.loc["Total"] = aligned_files_by_center.sum()
aligned_files_by_center.columns = ['No. of aligned files']

# Both helpers modify the frame in place; the last call also returns it, which is
# what the cell renders as output.
formatIndex(aligned_files_by_center)
formatColumns(aligned_files_by_center)

##### Number of whole genome vs whole exome sequencing files

In [None]:
# Unique aligned files per experiment type, across all centers.
experiment_type = participant_aligned_data.groupby(by=['experiment_type'])[['aligned_dna_short_read_file']].nunique()
# Despite its label, this column holds the number of unique aligned files per experiment type.
experiment_type.columns = ['NO. OF EXPERIMENT TYPES']
# NOTE(review): the "Total" row is disabled here — presumably so the pie chart in the
# next cell only gets one wedge per experiment type; confirm before re-enabling.
#experiment_type.loc['Total'] = experiment_type.sum()
formatColumns(experiment_type)
# Returns the frame, which the cell renders as output.
formatIndex(experiment_type)

In [None]:
# Pie chart of aligned-file counts per experiment type, with percentage labels.
labels = experiment_type.index.str.lower()
# NOTE(review): the palette is fixed at 4 colours; more than 4 experiment types
# would reuse colours — confirm or size it from len(experiment_type).
color = sns.color_palette("Paired", 4) 
fig, ax = plt.subplots(figsize=(4, 4))

# With autopct set, ax.pie returns (wedges, label texts, percentage texts).
patches, texts, pcts = ax.pie(experiment_type['NO. OF EXPERIMENT TYPES'], labels = labels,
        colors = color, labeldistance=1.2, radius=1.2, autopct='%1.1f%%', textprops={'fontsize': 14, 'ha' : 'center'},
        wedgeprops={ 'linewidth' : 3, 'edgecolor' : "white" })
# Style the percentage texts and the category labels separately.
plt.setp(pcts, color='black', fontweight=400)
plt.setp(texts, color = "black", fontweight=500)

plt.tight_layout()

_Experiment type by GREGoR Center_

In [None]:
# Unique aligned files per center and experiment type, plus a "Total" row.
experiment_type_by_center = experimentSummary(participant_aligned_data)
# Bare expression: rendered as the cell output.
experiment_type_by_center

#### Summary of phenotype data in the GREGoR combined dataset

In [None]:
# One row per distinct term_id, holding the number of distinct participants carrying it.
unique_phenotypes = phenotype.groupby('term_id')['participant_id'].nunique().to_frame()
# Phenotype rows joined to their participant record (inner join on participant ID).
participant_terms = pd.merge(
    participant,
    phenotype,
    left_on='entity:participant_id',
    right_on='participant_id',
)

print('Number of phenotype terms: ', len(phenotype), sep='')
print('Number of unique phenotype terms: ', len(unique_phenotypes), sep='')
print('Number of participants with phenotype terms: ',
      participant_terms['entity:participant_id'].nunique(), sep='')

_Unique HPO terms overlapping across research centers_ 

In [None]:
centers = ['BCM', 'BROAD', 'CNH_I', 'GSS', 'UW_CRDR']

# Pairwise overlap matrix: entry [row][col] is the number of distinct term_ids
# recorded at both the row center and the column center. Replaces five
# copy-pasted stanzas; the values are the same, only the call order differs.
center_list = [
    [
        overlapPheno(participant, phenotype, row_center, col_center)['term_id'].nunique()
        for col_center in centers
    ]
    for row_center in centers
]

# Per-center rows stay available under their original names for any cell that uses them.
BCM, BROAD, CNH_I, GSS, UW_CRDR = center_list

center_df = pd.DataFrame(center_list, index=centers, columns=centers)

# Show '-' on the diagonal (a center compared with itself). The diagonal is masked
# by position instead of replacing the literal counts 395/274/509/374/80, which
# stopped matching as soon as the data changed and could also blank an
# off-diagonal cell that happened to hold one of those values.
# ``center_df`` itself keeps the numeric counts; only the rendered copy is masked.
center_df.astype(object).mask(np.eye(len(centers), dtype=bool), '-')

#### Summary of 'data completeness' in the GREGoR combined dataset

In [None]:
# Split participants into three subsets by affected status and relationship.
probands = probandsData(participant)  # affected with proband_relationship == 'Self'
other_affecteds = affectedData(participant) # other affecteds that are not probands
unaffecteds = unaffectedData(participant)  # affected_status == 'Unaffected'

In [None]:
# For each subset, keep the participants that have phenotype terms and attach
# their per-participant term count (participants without terms are dropped).
proband_terms = phenotermsSummary(phenotype_data=phenotype, participant_subset=probands)
other_affected_terms = phenotermsSummary(phenotype_data=phenotype, participant_subset=other_affecteds)
unaffected_terms = phenotermsSummary(phenotype_data=phenotype, participant_subset=unaffecteds)

In [None]:
# Inner-join each subset with the aligned-data table: only sequenced participants
# remain. No phenotype filter is applied in this cell. A participant with several
# aligned records appears once per record.
seq_proband = probands.merge(participant_aligned_data, on = 'entity:participant_id') # sequenced probands
seq_affected = other_affecteds.merge(participant_aligned_data, on = 'entity:participant_id') 
seq_unaffected = unaffecteds.merge(participant_aligned_data, on = 'entity:participant_id') 

In [None]:
# Same join, starting from the subsets restricted to participants with phenotype
# terms: sequenced participants that also have terms. A participant with several
# aligned records appears once per record.
seq_proband_terms = proband_terms.merge(participant_aligned_data, on = 'entity:participant_id') # sequenced probands with HPO terms
seq_affected_terms = other_affected_terms.merge(participant_aligned_data, on = 'entity:participant_id') 
seq_unaffected_terms = unaffected_terms.merge(participant_aligned_data, on = 'entity:participant_id') 

##### Participants with sequencing files and phenotype data

_The number of participants with 'aligned DNA short read files' and phenotype terms_

In [None]:
# Data-completeness counts, in the order probands / other affecteds / unaffecteds.
# Each entry is the number of *unique participants*. The sequenced frames come from
# joins against the aligned-data table and hold one row per aligned record, so
# counting rows would count a participant with several aligned files several times.
datacomp_series = {
    'TOTAL': [
        subset['entity:participant_id'].nunique()
        for subset in (probands, other_affecteds, unaffecteds)
    ],
    'SEQUENCED': [
        subset['entity:participant_id'].nunique()
        for subset in (seq_proband, seq_affected, seq_unaffected)
    ],
    'SEQUENCED_WITH_PHENOTYPE': [
        subset['entity:participant_id'].nunique()
        for subset in (seq_proband_terms, seq_affected_terms, seq_unaffected_terms)
    ],
}

In [None]:
# Tabulate the completeness counts: one row per participant subset, one column per measure.
datacomp_df = pd.DataFrame(datacomp_series)
# Row labels follow the order in which the lists in datacomp_series were built.
datacomp_df.index = ['PROBANDS', 'OTHER_AFFECTED', 'UNAFFECTED']
# Bare expression: rendered as the cell output.
datacomp_df