# GREGoR data tracking
__author: DCC__ <br>
__created: 04/17/2023__ <br>

In [None]:
# import modules
from firecloud import api as fapi
import os
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from functools import reduce
import gspread
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials
import json
import csv

In [None]:
# Workspace settings are read from environment variables; a missing
# variable raises KeyError immediately.
project = os.environ['WORKSPACE_NAMESPACE']
workspace = os.environ['WORKSPACE_NAME']
bucket = f"{os.environ['WORKSPACE_BUCKET']}/"

print(f"Terra Billing project: {project}")
print(f"Workspace: {workspace}")
print(f"Workspace storage bucket: {bucket}")

In [None]:
# function(s) for reading in RC reporting

def readGoogleSheet(url):
    """Load one tab of a Google Sheet into a DataFrame.

    The '/edit#gid=' part of the sharing URL is swapped for
    '/export?format=csv&gid=' and the result is handed to
    ``pandas.read_csv``. A URL that does not contain '/edit#gid=' is
    passed to pandas unchanged.

    Args:
        url: Sharing URL of the sheet tab.

    Returns:
        DataFrame parsed from the CSV export.
    """
    return pd.read_csv(url.replace('/edit#gid=', '/export?format=csv&gid='))


# function(s) for AnVIL data tables
def readDatatable(project, workspace, table):
    """Download a workspace data table and parse it into a DataFrame.

    Args:
        project: Workspace namespace (billing project) passed to the
            FireCloud API.
        workspace: Workspace name.
        table: Name of the entity table to fetch (e.g. 'participant').

    Returns:
        DataFrame parsed from the tab-separated text of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
            NOTE(review): assumes ``fapi.get_entities_tsv`` returns a
            ``requests.Response`` -- confirm against the firecloud version
            in use.
    """
    response = fapi.get_entities_tsv(project, workspace, table, model="flexible")
    # Fail loudly on an HTTP error instead of parsing the error payload as
    # if it were the table's TSV content.
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.text), sep='\t')

def convertTuple(tup):
    """Concatenate the string items of ``tup`` with no separator.

    Args:
        tup: Iterable of strings (typically a tuple).

    Returns:
        The items joined into one string; ``''`` for an empty iterable.

    Raises:
        TypeError: If any item is not a string.
    """
    # Returned directly: the previous local variable was named ``str`` and
    # shadowed the builtin.
    return ''.join(tup)

def formatIndex(df):
    """Flatten and upper-case the row index of ``df`` in place.

    Tuple labels (as yielded when iterating a MultiIndex) are joined with
    ``'_'``; any other label is kept as is. The resulting labels are then
    upper-cased through the pandas ``.str`` accessor.

    Args:
        df: DataFrame whose index is reformatted. It is modified in place.

    Returns:
        The same ``df`` object, so calls can be chained.
    """
    # str() each component so a tuple holding a non-string part (number,
    # missing value) does not make the join raise TypeError.
    flat_labels = [
        '_'.join(str(part) for part in label) if isinstance(label, tuple) else label
        for label in df.index
    ]
    df.index = flat_labels
    df.index = df.index.str.upper()
    return df

def formatColumns(df):
    """Upper-case the column labels of ``df`` in place.

    Args:
        df: DataFrame whose columns are relabelled. It is modified in place.

    Returns:
        The same ``df`` object.
    """
    upper_labels = df.columns.str.upper()
    df.columns = upper_labels
    return df

def participantSummary(participant_data):
    """Count unique participants and families per center and consent code.

    Groups by 'gregor_center' and 'consent_code', counts the distinct
    'entity:participant_id' and 'family_id' values in each group, appends a
    "Total" row holding the column sums, and formats the index and column
    labels with ``formatIndex`` / ``formatColumns``.

    NOTE(review): the "Total" for families is the sum of per-group unique
    counts, so a family appearing in several groups is counted once per
    group -- confirm this is intended.

    Args:
        participant_data: Participant table.

    Returns:
        Summary DataFrame with 'PARTICIPANTS' and 'FAMILIES' columns.
    """
    id_columns = ['entity:participant_id', 'family_id']
    grouped = participant_data.groupby(by=['gregor_center', 'consent_code'], as_index=True)
    summary = grouped[id_columns].nunique()
    summary.loc["Total"] = summary.sum()
    summary.columns = ['participants', 'families']
    # Both helpers relabel in place and hand back the same frame.
    return formatColumns(formatIndex(summary))

def familySummary(participant_data):
    """Count participant IDs per family.

    Args:
        participant_data: Participant table with 'family_id' and
            'entity:participant_id' columns.

    Returns:
        One-column DataFrame indexed by 'family_id' whose
        'entity:participant_id' column holds the non-null ID count of
        each family.
    """
    members_per_family = participant_data.groupby(by=['family_id'])['entity:participant_id'].count()
    return members_per_family.to_frame()

def probandrelSummary(participant_data):
    """Count participants by their relationship to the proband.

    Missing 'proband_relationship' values form their own group
    (``dropna=False``). In the index labels 'Self' is replaced by
    'Proband', labels are upper-cased, and the index name is cleared.

    Args:
        participant_data: Participant table.

    Returns:
        DataFrame with a single 'NO. OF PARTICIPANTS' column.
    """
    counts = participant_data.groupby(
        by=['proband_relationship'], dropna=False
    )['entity:participant_id'].count()
    summary = counts.to_frame()
    summary.columns = ['No. of participants']
    relabelled = summary.index.str.replace('Self', 'Proband')
    summary.index = relabelled.str.upper()
    summary.index.name = None
    return formatColumns(summary)

def affectedSummary(participant_data):
    """Count participants per 'affected_status' value.

    Missing statuses form their own group (``dropna=False``). Index and
    column labels are formatted with ``formatIndex`` / ``formatColumns``.

    Args:
        participant_data: Participant table.

    Returns:
        DataFrame with a single 'NO. OF PARTICIPANTS' column.
    """
    counts = participant_data.groupby(
        by=['affected_status'], dropna=False
    )['entity:participant_id'].count()
    summary = counts.to_frame()
    summary.columns = ['No. of participants']
    return formatColumns(formatIndex(summary))

def aligneddataSummary(participant_data, analyte_data, experiment_data, aligned_data):
    """Chain-join participant, analyte, experiment and aligned-file tables.

    Each step is an inner join from the entity ID of the left table to the
    matching foreign-key column of the right table:
    participant -> analyte -> experiment -> aligned data.

    Args:
        participant_data: Table keyed by 'entity:participant_id'.
        analyte_data: Table with 'participant_id' and 'entity:analyte_id'.
        experiment_data: Table with 'analyte_id' and
            'entity:experiment_dna_short_read_id'.
        aligned_data: Table with 'experiment_dna_short_read_id'.

    Returns:
        DataFrame holding the rows that matched through all three joins.
    """
    with_analyte = participant_data.merge(
        analyte_data, left_on='entity:participant_id', right_on='participant_id'
    )
    with_experiment = with_analyte.merge(
        experiment_data, left_on='entity:analyte_id', right_on='analyte_id'
    )
    return with_experiment.merge(
        aligned_data,
        left_on='entity:experiment_dna_short_read_id',
        right_on='experiment_dna_short_read_id',
    )

def experimentSummary(participant_aligned_data):
    """Count unique aligned files per research center and experiment type.

    Groups by 'gregor_center' and 'experiment_type', counts distinct
    'aligned_dna_short_read_file' values, appends a "Total" row with the
    column sum, and formats index and column labels.

    Args:
        participant_aligned_data: Joined table as returned by
            ``aligneddataSummary``.

    Returns:
        DataFrame with a single 'NO. OF EXPERIMENT TYPES' column.
    """
    grouped = participant_aligned_data.groupby(by=['gregor_center', 'experiment_type'])
    summary = grouped[['aligned_dna_short_read_file']].nunique()
    summary.columns = ['No. of experiment types']
    summary.loc["Total"] = summary.sum()
    return formatColumns(formatIndex(summary))

def probandsData(participant_data):
    """Select affected probands.

    Args:
        participant_data: Participant table.

    Returns:
        Rows where 'affected_status' is 'Affected' and
        'proband_relationship' is 'Self'.
    """
    is_affected = participant_data['affected_status'] == 'Affected'
    is_proband = participant_data['proband_relationship'] == 'Self'
    return participant_data[is_affected & is_proband]

def affectedData(participant_data):
    """Select affected participants who are not the proband.

    Args:
        participant_data: Participant table.

    Returns:
        Rows where 'affected_status' is 'Affected' and
        'proband_relationship' differs from 'Self'.
    """
    is_affected = participant_data['affected_status'] == 'Affected'
    not_proband = participant_data['proband_relationship'] != 'Self'
    return participant_data[is_affected & not_proband]

def unaffectedData(participant_data):
    """Select unaffected participants.

    Args:
        participant_data: Participant table.

    Returns:
        Rows where 'affected_status' is 'Unaffected'.
    """
    is_unaffected = participant_data['affected_status'] == 'Unaffected'
    return participant_data[is_unaffected]

def phenotermsSummary(phenotype_data, participant_subset):
    """Attach per-participant phenotype term counts to a participant subset.

    Counts 'term_id' entries per 'participant_id' in ``phenotype_data`` and
    inner-joins those counts to ``participant_subset`` on
    'entity:participant_id', so only participants with at least one
    phenotype row remain.

    Args:
        phenotype_data: Phenotype table.
        participant_subset: Participant rows to annotate.

    Returns:
        DataFrame with the 'term_id' count followed by the subset's columns.
    """
    terms_per_participant = phenotype_data.groupby('participant_id')['term_id'].count().to_frame()
    return terms_per_participant.merge(
        participant_subset, left_index=True, right_on='entity:participant_id'
    )

def centerPheno(participant_data, phenotype_data, center):
    """Return the phenotype rows of participants from one research center.

    Args:
        participant_data: Participant table with 'entity:participant_id'
            and 'gregor_center' columns.
        phenotype_data: Phenotype table with a 'participant_id' column.
        center: Value of 'gregor_center' to keep.

    Returns:
        Inner join of the two tables restricted to ``center``.
    """
    # Join the tables passed in as arguments; the previous body ignored
    # them and read the notebook-level globals `participant`/`phenotype`.
    participant_terms = participant_data.merge(
        phenotype_data, left_on='entity:participant_id', right_on='participant_id'
    )
    return participant_terms[participant_terms['gregor_center'] == center]

def overlapPheno(participant_data, phenotype_data, centerA, centerB):
    """Find phenotype terms reported by both of two research centers.

    Args:
        participant_data: Participant table with 'entity:participant_id'
            and 'gregor_center' columns.
        phenotype_data: Phenotype table with 'participant_id' and
            'term_id' columns.
        centerA: First 'gregor_center' value.
        centerB: Second 'gregor_center' value.

    Returns:
        Inner join on 'term_id' of the two centers' participant/phenotype
        rows. Columns from ``centerA`` carry the ``_x`` suffix, those from
        ``centerB`` the ``_y`` suffix, and a '_merge' indicator column is
        included.
    """
    # Join the tables passed in as arguments; the previous body ignored
    # them and read the notebook-level globals `participant`/`phenotype`.
    participant_terms = participant_data.merge(
        phenotype_data, left_on='entity:participant_id', right_on='participant_id'
    )
    centerA_terms = participant_terms[participant_terms['gregor_center'] == centerA]
    centerB_terms = participant_terms[participant_terms['gregor_center'] == centerB]
    return centerA_terms.merge(centerB_terms, on='term_id', how="inner", indicator=True)

In [None]:
# Load the combined GREGoR RC reporting sheets, one per upload cycle.
# Upload cycle 1 (U1)
summary_report_U1 = readGoogleSheet(
    url="https://docs.google.com/spreadsheets/d/1EtOc6ggsJN6QZ4VuqCmBvUIHplMDWxshQlYnALDCHrs/edit#gid=0"
)

# Upload cycle 2 (U2)
summary_report_U2 = readGoogleSheet(
    url="https://docs.google.com/spreadsheets/d/1ndQRPsJW6d8kWIq2j9JHui9-sudrtcyDoUZ_qX821X8/edit#gid=0"
)

In [None]:
# Load each AnVIL data table of the current workspace into a DataFrame.
participant = readDatatable(project, workspace, table='participant')
family = readDatatable(project, workspace, table='family')
phenotype = readDatatable(project, workspace, table='phenotype')
analyte = readDatatable(project, workspace, table='analyte')
experiment_dna_short_read = readDatatable(project, workspace, table='experiment_dna_short_read')
aligned_dna_short_read = readDatatable(project, workspace, table='aligned_dna_short_read')

## GREGoR Research Center Sample Summaries

The plots below are generated from quarterly updates provided by the GREGoR Research Centers. They show the samples prepared, samples sequenced, and samples uploaded to AnVIL for upload cycle 1 (U1) and upload cycle 2 (U2).

In [None]:
# Samples prepared per research center, one column per upload cycle.
total_prep_u1 = summary_report_U1.loc[:, "TOTAL_SAMPLES_PREPARED"]
total_prep_u2 = summary_report_U2.loc[:, "TOTAL_SAMPLES_PREPARED"]

# NOTE(review): the columns are aligned on row index, so this presumes both
# reports list the research centers in the same row order -- confirm.
frames = (summary_report_U2.loc[:, 'GREGoR_CENTER'], total_prep_u1, total_prep_u2)
total_prep = pd.concat(frames, axis=1)
total_prep.columns = ['RC', 'U1', 'U2']

In [None]:
# Samples with sequencing completed per research center, one column per
# upload cycle.
total_seq_u1 = summary_report_U1.loc[:, "TOTAL_SAMPLES_SEQUENCING_COMPLETED"]
total_seq_u2 = summary_report_U2.loc[:, "TOTAL_SAMPLES_SEQUENCING_COMPLETED"]

# NOTE(review): the columns are aligned on row index, so this presumes both
# reports list the research centers in the same row order -- confirm.
frames = (summary_report_U2.loc[:, 'GREGoR_CENTER'], total_seq_u1, total_seq_u2)
total_seq = pd.concat(frames, axis=1)
total_seq.columns = ['RC', 'U1', 'U2']

In [None]:
# Samples uploaded to AnVIL per research center, one column per upload cycle.
total_upload_u1 = summary_report_U1.loc[:, "TOTAL_SAMPLES_UPLOADED_TO_ANVIL"]
total_upload_u2 = summary_report_U2.loc[:, "TOTAL_SAMPLES_UPLOADED_TO_ANVIL"]

# NOTE(review): the columns are aligned on row index, so this presumes both
# reports list the research centers in the same row order -- confirm.
frames = (summary_report_U2.loc[:, 'GREGoR_CENTER'], total_upload_u1, total_upload_u2)
total_upload = pd.concat(frames, axis=1)
total_upload.columns = ['RC', 'U1', 'U2']

In [None]:
# Side-by-side bar charts: samples prepared (left) and sequenced (right).
sns.set(style="white", font='sans-serif', font_scale=1.2)

fig, ax = plt.subplots(1, 2, sharex=True, sharey=False, figsize=(14, 5))
ax[0].set_title('Number of samples prepared in U1 and U2')
ax[1].set_title('Number of samples sequenced in U1 and U2')

# On each panel the U2 bars (white) are drawn first and the U1 bars
# (dark green) are drawn over them on the same axes.
# NOTE(review): `estimator=sum` is passed only for the U1 bars of the left
# panel; the other calls use seaborn's default estimator -- confirm whether
# the difference is intended.
sns.barplot(ax=ax[0], x='RC', y="U2", data=total_prep, color='white', edgecolor='black')
sns.barplot(ax=ax[0], x="RC", y="U1", data=total_prep, estimator=sum, color='darkgreen', edgecolor='black')

sns.barplot(ax=ax[1], x='RC', y="U2", data=total_seq, color='white', edgecolor='black')
sns.barplot(ax=ax[1], x="RC", y="U1", data=total_seq, color='darkgreen', edgecolor='black')

# A separate loop name keeps `ax` bound to the array of axes instead of
# rebinding it to the last panel.
for panel in ax.flat:
    panel.set(xlabel='Research Center', ylabel='Number of Samples')

# legend
top_bar = mpatches.Patch(color='white', label='U2')
bottom_bar = mpatches.Patch(color='darkgreen', label='U1')
fig.legend(handles=[top_bar, bottom_bar], facecolor="lightgray")

# plt.show() matches the histogram cell of this notebook; fig.show() warns
# under non-GUI backends.
plt.show()

In [None]:
# Bar chart of sequenced samples uploaded to AnVIL per research center.
sns.set(style="white", font='sans-serif', font_scale=1.2)

fig, ax = plt.subplots(1, sharex=True, figsize=(7, 5))

ax.set_title('Number of sequenced samples uploaded to AnVIL')

# U2 bars (white) are drawn first, U1 bars (dark green) over them.
sns.barplot(ax=ax, x='RC', y="U2", data=total_upload, color='white', edgecolor='black')
sns.barplot(ax=ax, x="RC", y="U1", data=total_upload, estimator=sum, color='darkgreen', edgecolor='black')

ax.set(xlabel='Research Center', ylabel='Number of Samples')

# plt.show() matches the histogram cell of this notebook; fig.show() warns
# under non-GUI backends.
plt.show()

## GREGoR Combined Dataset Summary

The GREGoR Combined Consortium dataset summaries are generated from data model tables uploaded to AnVIL workspaces. The tables and plots below summarize the GREGoR Combined dataset for U2 which ended on 3/31/2023.

#### Summary of participant and family data tables for the GREGoR combined dataset

##### Summary of participants and families

Table 1. Shows the number of participants and families by GREGoR research center and consent code. 

In [None]:
# Table 1: participants and families per research center and consent code.
participant_data = participantSummary(participant_data=participant)
participant_data

In [None]:
# Histogram of family sizes (participant IDs counted per family_id).
family_size = familySummary(participant_data=participant)

sns.set(style="white", font='sans-serif', font_scale=1.2)
fig, g = plt.subplots(1, sharex=True, figsize=(7, 5))

# No `ax` is passed, so histplot draws on the current axes and returns them.
g = sns.histplot(
    data=family_size,
    discrete=True,
    legend=False,
    palette='RdGy',
    edgecolor="black",
    linewidth=2,
)
g.set_xticks(range(1, 20))
g.xaxis.label.set_color('grey')
g.yaxis.label.set_color('grey')

g.set_xlabel('family size', color='grey', style='italic')
g.set_ylabel('sample count', color='grey', style='italic')
g.set_title('Histogram of family size for U2')

plt.show()

##### Summary of participant 'proband_relationship'

Table 2. Shows the number of participants by their relationship to the proband. 

In [None]:
# Table 2: participants per relationship to the proband.
proband_rel_data = probandrelSummary(participant_data=participant)
proband_rel_data

The pie chart below summarizes the 'proband relationships' across the GREGoR Combined dataset. 

In [None]:
# plot pie chart
# Keep only the relationship categories with more than 7 participants.
df = proband_rel_data.loc[proband_rel_data['NO. OF PARTICIPANTS'] > 7]
# NOTE(review): this swaps every cell equal to 22 for 48; the reason for the
# hard-coded values is not evident from this notebook -- confirm it is still
# valid for the current data.
df = df.replace(22, 48)
labels = df.index.str.lower()

# NOTE(review): the palette is fixed at 5 colors regardless of how many
# categories pass the filter above -- confirm.
color = sns.color_palette("Greens",5) 
plt.figure(figsize=(7,7))
# autopct prints each wedge's percentage with two decimals.
plt.pie(df['NO. OF PARTICIPANTS'], labels = labels, colors = color, autopct='%.2f', labeldistance=1.25, 
        radius=1.25, 
        textprops={'fontsize': 15, 'fontname' : 'serif', 'ha' : 'center' , 'color' : 'black'},
        wedgeprops={ 'linewidth' : 1.5, 'edgecolor' : "white" })
plt.tight_layout()

#### Summary of experiment and aligned sequencing file data tables in the GREGoR combined dataset 

Table 3. The number of aligned sequencing files by GREGoR Research Center and consent code. 

In [None]:
# Join participant -> analyte -> experiment -> aligned-file tables, then
# count unique aligned files per research center and consent code.
participant_aligned_data = aligneddataSummary(
    participant_data=participant,
    analyte_data=analyte,
    experiment_data=experiment_dna_short_read,
    aligned_data=aligned_dna_short_read,
)
aligned_files_by_center = (
    participant_aligned_data
    .groupby(by=['gregor_center', 'consent_code'])[['aligned_dna_short_read_file']]
    .nunique()
)
aligned_files_by_center.loc["Total"] = aligned_files_by_center.sum()
aligned_files_by_center.columns = ['No. of aligned files']

# Both helpers relabel in place and return the frame, which the cell displays.
formatColumns(formatIndex(aligned_files_by_center))

Table 4. The number of exomes vs genomes in the GREGoR Combined dataset. 

In [None]:
# Table 4: unique aligned files per experiment type.
experiment_type = (
    participant_aligned_data
    .groupby(by=['experiment_type'])[['aligned_dna_short_read_file']]
    .nunique()
)
experiment_type.columns = ['NO. OF EXPERIMENT TYPES']
# Both helpers relabel in place and return the frame, which the cell displays.
formatIndex(formatColumns(experiment_type))

Table 5. The number of exomes vs genomes by GREGoR Research Center. 

In [None]:
# Table 5: unique aligned files per research center and experiment type.
experiment_type_by_center = experimentSummary(
    participant_aligned_data=participant_aligned_data
)
experiment_type_by_center

#### Summary of phenotype data in the GREGoR combined dataset

Table 6. Summary of 'affected status' in the GREGoR Combined dataset

In [None]:
# Table 6: participants per affected status.
affected_data = affectedSummary(participant_data=participant)
affected_data

Phenotype terms in the GREGoR combined dataset

In [None]:
# One row per distinct term_id, holding the number of participants carrying it.
unique_phenotypes = phenotype.groupby('term_id')['participant_id'].nunique().to_frame()
# Inner join keeps only participants that have at least one phenotype row.
participant_terms = participant.merge(
    phenotype, left_on='entity:participant_id', right_on='participant_id'
)

print(f'Number of phenotype terms: {len(phenotype)}')
print(f'Number of unique phenotype terms: {len(unique_phenotypes)}')
print(f"Number of participants with phenotype terms: {participant_terms['entity:participant_id'].nunique()}")

#### Data completeness in the GREGoR combined dataset

Summary of participants with sequencing files and phenotype data

In [None]:
# Split participants into probands, other affected relatives and unaffecteds.
probands = probandsData(participant)
other_affecteds = affectedData(participant)  # affected, but not the proband
unaffecteds = unaffectedData(participant)

# Per-participant phenotype term counts for each subset (participants
# without any phenotype row drop out of these frames).
proband_terms = phenotermsSummary(phenotype, probands)
other_affected_terms = phenotermsSummary(phenotype, other_affecteds)
unaffected_terms = phenotermsSummary(phenotype, unaffecteds)

# Subset members that have aligned sequencing data.
seq_proband = probands.merge(participant_aligned_data, on='entity:participant_id')
seq_affected = other_affecteds.merge(participant_aligned_data, on='entity:participant_id')
seq_unaffected = unaffecteds.merge(participant_aligned_data, on='entity:participant_id')

# Subset members that have both phenotype terms and aligned sequencing data.
seq_proband_terms = proband_terms.merge(participant_aligned_data, on='entity:participant_id')
seq_affected_terms = other_affected_terms.merge(participant_aligned_data, on='entity:participant_id')
seq_unaffected_terms = unaffected_terms.merge(participant_aligned_data, on='entity:participant_id')

Table 7. The number of participants with 'aligned DNA short read files' and phenotype terms

In [None]:
# Table 7 reports participants. The sequenced frames come from a merge with
# the aligned-file table, so they hold one row per matching aligned-file row
# and a participant with several files would be counted several times by
# len(). Count distinct participant IDs for those columns instead.
datacomp_series = {
    'TOTAL': [len(probands), len(other_affecteds), len(unaffecteds)],
    'SEQUENCED': [
        subset['entity:participant_id'].nunique()
        for subset in (seq_proband, seq_affected, seq_unaffected)
    ],
    'SEQUENCED_WITH_PHENOTYPE': [
        subset['entity:participant_id'].nunique()
        for subset in (seq_proband_terms, seq_affected_terms, seq_unaffected_terms)
    ],
}

In [None]:
# One row per participant subset, one column per completeness measure.
datacomp_df = pd.DataFrame(
    datacomp_series,
    index=['PROBANDS', 'OTHER_AFFECTED', 'UNAFFECTED'],
)
datacomp_df