# GREGoR QC notebook
created by: DCC <br>
last edited: 07-24-23

In [None]:
# install modules
# %pip install terra-pandas

In [None]:
# import modules
import os
import io
import pandas as pd
import terra_pandas as tp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
import seaborn as sns
from functools import reduce

In [None]:
# Workspace settings are read from environment variables of the notebook runtime.
project = os.environ['WORKSPACE_NAMESPACE']
workspace = os.environ['WORKSPACE_NAME']
bucket = f"{os.environ['WORKSPACE_BUCKET']}/"
current_upload_cycle = 'U03'

# Echo the settings so the notebook output records what this run used.
print(f"Terra Billing project: {project}")
print(f"Workspace: {workspace}")
print(f"Workspace storage bucket: {bucket}")
print(f"GREGoR Upload Cycle: {current_upload_cycle}")

In [None]:
# functions
def readDatatable(data_table, project, workspace):
    """Load one workspace data table as a DataFrame.

    Thin wrapper around ``tp.table_to_dataframe``: ``project`` is forwarded as
    ``workspace_namespace`` and ``workspace`` as ``workspace``.
    """
    return tp.table_to_dataframe(
        data_table,
        workspace_namespace=project,
        workspace=workspace,
    )

def gregorUploadWorkspaces(upload_cycle):
    """Return the ten upload workspace names for ``upload_cycle``.

    Names have the form ``AnVIL_GREGoR_<group>_<upload_cycle>_<suffix>``.
    The list is ordered group by group, with the ``HMB`` workspace before the
    ``GRU`` workspace of each group.
    """
    groups = ('BCM', 'BROAD', 'CNH_I', 'GSS', 'UW_CRDR')
    suffixes = ('HMB', 'GRU')
    return [
        'AnVIL_GREGoR_' + group + '_' + upload_cycle + '_' + suffix
        for group in groups
        for suffix in suffixes
    ]

def combineDataTable(table, upload_workspaces):
    """Stack one data table from several upload workspaces into one DataFrame.

    Parameters:
        table: name of the data table to read from every workspace.
        upload_workspaces: iterable of workspace names to read from.

    Returns:
        The row-wise concatenation of every table that could be read, in
        workspace order. An empty DataFrame when none could be read.

    A workspace whose table cannot be read is reported on stdout and skipped;
    the remaining workspaces are still combined.
    """
    project = 'anvil-datastorage'
    frames = []
    for ws in upload_workspaces:
        # Only the read itself is guarded, so a failure elsewhere is not
        # misreported as a missing table.
        try:
            df = readDatatable(table, project, ws)
        except Exception as err:
            # Best effort: keep going, but show the underlying error so an
            # access or network failure is distinguishable from a missing table.
            print(table + ' data table NOT FOUND for: ' + ws
                  + ' (' + type(err).__name__ + ': ' + str(err) + ')')
            continue
        print(ws)
        print(project)
        print(df.shape)
        frames.append(df)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame([])
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat(frames)

def compareDataFrames(dfA, dfB, cycleA, cycleB):
    """Compare the rows two tables share, keeping the full table shape.

    Parameters:
        dfA, dfB: DataFrames with the same columns; rows are matched on index.
        cycleA, cycleB: labels that identify the rows coming from dfA and dfB.

    Returns:
        A DataFrame with the index reset. Each shared row appears twice, once
        per table, and the ``level_1`` column holds ``cycleA`` or ``cycleB``.
        Cells that are equal in both tables are NaN.

    The shapes of the two row subsets are printed before comparing.
    """
    # Use one shared label set for both sides: DataFrame.compare only accepts
    # identically labelled frames, so both subsets need the same row order.
    shared = dfA.index.intersection(dfB.index)
    dfA_subset = dfA.loc[shared]
    dfB_subset = dfB.loc[shared]

    print(dfA_subset.shape)
    print(dfB_subset.shape)
    df_comp = dfA_subset.compare(dfB_subset, align_axis=0,
                                 keep_equal=False, keep_shape=True)
    df_comp.reset_index(inplace=True)
    # Assign the result back: an inplace replace on a selected column is a
    # chained assignment and does not update the frame under copy-on-write.
    df_comp['level_1'] = df_comp['level_1'].replace(
        {'self': cycleA, 'other': cycleB})
    return df_comp

def compareDiffOnly(dfA, dfB, cycleA, cycleB):
    """Return only the cells that differ between the rows two tables share.

    Parameters:
        dfA, dfB: DataFrames with the same columns; rows are matched on index.
        cycleA, cycleB: labels for the values coming from dfA and dfB.

    Returns:
        A DataFrame indexed by the shared row labels that have at least one
        difference. Columns are two-level: the original column name, then
        ``cycleA`` or ``cycleB``. Rows and columns without differences are
        dropped.
    """
    # Use one shared label set for both sides: DataFrame.compare only accepts
    # identically labelled frames, so both subsets need the same row order.
    shared = dfA.index.intersection(dfB.index)
    df_comp = dfA.loc[shared].compare(dfB.loc[shared], align_axis=1,
                                      keep_equal=False, keep_shape=False)
    # Relabel by name rather than by position. MultiIndex.set_levels lost its
    # inplace argument in pandas 2.0, and renaming does not depend on the
    # order in which the 'self' and 'other' levels are stored.
    return df_comp.rename(columns={'self': cycleA, 'other': cycleB}, level=1)

def diffSummary(df):
    """Summarise per column how many entries of a stacked comparison differ.

    Parameters:
        df: a comparison table in which every compared entry occupies two
            rows, as returned by compareDataFrames.

    Returns:
        A DataFrame indexed by the column names of ``df`` with two columns:
        ``count``, the number of non-null cells divided by two, and ``%``,
        that count as a percentage of the number of entries, rounded to two
        decimals. A percentage of exactly 100 is shown as ``'-'``.
    """
    # Both counts are halved because each entry contributes two rows.
    df_diff = pd.DataFrame(df.count(axis=0) / 2)
    df_diff.columns = ["count"]
    df_diff['%'] = (df_diff["count"] / (len(df) / 2)) * 100
    df_diff = df_diff.round(2)
    # Assign the result back: an inplace replace on a selected column is a
    # chained assignment and does not update the frame under copy-on-write.
    df_diff['%'] = df_diff['%'].replace(100.00, '-')
    return df_diff
    

## Compare data tables from previous upload cycle to current upload cycle

In [None]:
# read in AnVIL tables from RC U02 upload workspaces
upload_workspaces_U02 = gregorUploadWorkspaces('U02')

# one combined DataFrame per data table, stacked across all U02 upload workspaces
participant_U02 = combineDataTable('participant', upload_workspaces_U02)
family_U02 = combineDataTable('family', upload_workspaces_U02)
phenotype_U02 = combineDataTable('phenotype', upload_workspaces_U02)
analyte_U02 = combineDataTable('analyte', upload_workspaces_U02)
experiment_dna_short_read_U02 = combineDataTable('experiment_dna_short_read', upload_workspaces_U02)
# NOTE(review): the RNA tables are not loaded for U02 -- presumably they were not part of that cycle; confirm
# experiment_rna_short_read_U02= combineDataTable('experiment_rna_short_read', upload_workspaces_U02)
aligned_dna_short_read_U02 = combineDataTable('aligned_dna_short_read', upload_workspaces_U02)
# aligned_rna_short_read_U02 = combineDataTable('aligned_rna_short_read', upload_workspaces_U02)

In [None]:
# read in AnVIL tables from RC U03 upload workspaces
# NOTE(review): 'U03' is hard-coded here although current_upload_cycle holds the same value -- confirm whether they should be linked
upload_workspaces_U03 = gregorUploadWorkspaces('U03')

# one combined DataFrame per data table, stacked across all U03 upload workspaces
participant_U03 = combineDataTable('participant', upload_workspaces_U03)
family_U03 = combineDataTable('family', upload_workspaces_U03)
phenotype_U03 = combineDataTable('phenotype', upload_workspaces_U03)
analyte_U03 = combineDataTable('analyte', upload_workspaces_U03)
experiment_dna_short_read_U03 = combineDataTable('experiment_dna_short_read', upload_workspaces_U03)
experiment_rna_short_read_U03= combineDataTable('experiment_rna_short_read', upload_workspaces_U03)
aligned_dna_short_read_U03 = combineDataTable('aligned_dna_short_read', upload_workspaces_U03)
aligned_rna_short_read_U03 = combineDataTable('aligned_rna_short_read', upload_workspaces_U03)

### Compare participant tables

__How many row entries per table?__

In [None]:
print('participant_table_U02: ' + str(participant_U02.shape[0]))
print('participant_table_U03: ' + str(participant_U03.shape[0]))

In [None]:
# outer merge of participant tables
participants_merged = participant_U02.merge(participant_U03, left_index= True, right_index= True, how = 'outer', indicator = True)

__Are there any row entries dropped from the previous upload cycle?__ <br>

In [None]:
merge_counts = pd.DataFrame(participants_merged['_merge'].value_counts())
merge_counts

_right_only = U03_only <br>
left_only = U02_only_ 

__How many differences per columns in the data table?__

In [None]:
participant_U03.head()
participant_comp = compareDataFrames(participant_U02, participant_U03, 'U02', 'U03')
diff_counts = diffSummary(participant_comp)
diff_nonzero_counts = diff_counts[diff_counts['count'] > 0.0 ]
diff_nonzero_counts

__Write out differences to a csv file__

In [None]:
participant_diff = compareDiffOnly(participant_U02, participant_U03, 'U02', 'U03')
# Write the index too: compareDiffOnly leaves the row labels in the index, and
# without them the file cannot show which entries differ.
participant_diff.to_csv('participant_data_table_comparison.csv', sep=",", index=True)

### Family table comparison

__How many row entries per table?__

In [None]:
print('family_table_U02: ' + str(family_U02.shape[0]))
print('family_table_U03: ' + str(family_U03.shape[0]))

__Are there any rows dropped from the previous upload cycle?__ 

In [None]:
# outer merge of the family tables
family_merged = family_U02.merge(family_U03, left_index= True, right_index= True, how = 'outer', indicator = True)

In [None]:
merged_counts = pd.DataFrame(family_merged['_merge'].value_counts())
merged_counts

__How many differences per columns in the data table?__

In [None]:
family_comp = compareDataFrames(family_U02, family_U03, 'U02', 'U03')
diff_counts = diffSummary(family_comp)
diff_nonzero_counts = diff_counts[diff_counts['count'] > 0.0 ]  
diff_nonzero_counts

__Write out differences to csv__

In [None]:
family_diff = compareDiffOnly(family_U02, family_U03, 'U02', 'U03')
# Write the index too: compareDiffOnly leaves the row labels in the index, and
# without them the file cannot show which entries differ.
family_diff.to_csv('family_data_table_comparison.csv', sep=",", index=True)

In [None]:
#family_diff['pedigree_file'].value_counts()

### Phenotype table comparison

__How many row entries per table?__

In [None]:
# Row counts per upload cycle; each label names the table it reports.
print('phenotype_table_U02: ' + str(phenotype_U02.shape[0]))
print('phenotype_table_U03: ' + str(phenotype_U03.shape[0]))

__Are there any row entries dropped from the previous cycle?__

In [None]:
# are there any phenotypes dropped from previous upload cycle ? 
phenotype_merged = phenotype_U02.merge(phenotype_U03, left_index= True, right_index= True, how = 'outer', indicator = True)
phenotype_merged['_merge'].value_counts()

__How many differences per columns in the data table?__

In [None]:
phenotype_comp = compareDataFrames(phenotype_U02, phenotype_U03, 'U02', 'U03')
diff_counts = diffSummary(phenotype_comp)
diff_nonzero_counts = diff_counts[diff_counts['count'] > 0.0 ]  
diff_nonzero_counts

__Write out differences to csv__

In [None]:
phenotype_diff = compareDiffOnly(phenotype_U02, phenotype_U03, 'U02', 'U03')
# Write the index too: compareDiffOnly leaves the row labels in the index, and
# without them the file cannot show which entries differ.
phenotype_diff.to_csv('phenotype_data_table_comparison.csv', sep=",", index=True)

### Analyte table comparison

__How many row entries per table?__

In [None]:
print('analyte_table_U02: ' + str(analyte_U02.shape[0]))
print('analyte_table_U03: ' + str(analyte_U03.shape[0]))

__Are there any analytes dropped from previous upload cycle?__

In [None]:
analyte_merged = analyte_U02.merge(analyte_U03, left_index= True, right_index= True, how = 'outer', indicator = True)
analyte_merged['_merge'].value_counts()

__How many differences per columns in the data table?__

In [None]:
# Remove 'quality_issues' from the U03 table before the comparison below
# (presumably because U02 has no such column -- confirm).
# errors='ignore' keeps this in-place drop safe when the cell is re-run.
analyte_U03.drop(labels=['quality_issues'], inplace=True, axis=1, errors='ignore')

In [None]:
analyte_comp = compareDataFrames(analyte_U02, analyte_U03, 'U02', 'U03')
diff_counts = diffSummary(analyte_comp)
diff_nonzero_counts = diff_counts[diff_counts['count'] > 0.0 ]  
diff_nonzero_counts

__Write out differences to a csv__

In [None]:
# write out differences only
analyte_diff = compareDiffOnly(analyte_U02, analyte_U03, 'U02', 'U03')
# Write the index too: compareDiffOnly leaves the row labels in the index, and
# without them the file cannot show which entries differ.
analyte_diff.to_csv('analyte_data_table_comparison.csv', sep=",", index=True)

In [None]:
pd.DataFrame(analyte_diff['primary_biosample'].value_counts())

### Experiment_dna_short_read comparison

__How many row entries per table?__

In [None]:
print('experiment_dna_table_U02: ' + str(experiment_dna_short_read_U02.shape[0]))
print('experiment_dna_table_U03: ' + str(experiment_dna_short_read_U03.shape[0]))

__Are there any row entries dropped from the previous cycle?__

In [None]:
# are there any experiment_dna_short_reads dropped from previous upload cycle ? 
experiment_dna_short_read_merged = experiment_dna_short_read_U02.merge(experiment_dna_short_read_U03, left_index= True, right_index= True, how = 'outer', indicator = True)
experiment_dna_short_read_merged['_merge'].value_counts()

In [None]:
experiment_dna_short_read_U02_only = experiment_dna_short_read_merged[experiment_dna_short_read_merged['_merge'] == 'left_only']

In [None]:
experiment_dna_short_read_U02_only

In [None]:
experiment_dna_comp = compareDataFrames(experiment_dna_short_read_U02, experiment_dna_short_read_U03, 'U02', 'U03')
diff_counts = diffSummary(experiment_dna_comp)
diff_nonzero_counts = diff_counts[diff_counts['count'] > 0.0 ]  
diff_nonzero_counts

__Write out differences only__

In [None]:
experiment_DNA_diff = compareDiffOnly(experiment_dna_short_read_U02, experiment_dna_short_read_U03, 'U02', 'U03')
# Write the index too: compareDiffOnly leaves the row labels in the index, and
# without them the file cannot show which entries differ.
experiment_DNA_diff.to_csv('experiment_DNA_data_table_comparison.csv', sep=",", index=True)

In [None]:
pd.DataFrame(experiment_DNA_diff['analyte_id'].value_counts())

### Aligned_dna_short_read comparison

__How many row entries per table?__

In [None]:
print('aligned_dna_table_U02: ' + str(aligned_dna_short_read_U02.shape[0]))
print('aligned_dna_table_U03: ' + str(aligned_dna_short_read_U03.shape[0]))

In [None]:
aligned_dna_short_read_U02.sort_index(axis=1, inplace = True)

In [None]:
# Remove these columns from the U03 table and sort the remaining columns
# before the comparison below (presumably because U02 has no such columns --
# confirm). The U02 table gets the same column sort.
# errors='ignore' keeps this in-place drop safe when the cell is re-run.
aligned_dna_short_read_U03.drop(
    labels=['quality_issues', 'reference_assembly_details', 'reference_assembly_uri'],
    axis=1, inplace=True, errors='ignore')
aligned_dna_short_read_U03.sort_index(axis=1, inplace=True)

__Are there any aligned_dna_short_reads dropped from previous upload cycle ?__ 

In [None]:
aligned_dna_short_read_merged = aligned_dna_short_read_U02.merge(aligned_dna_short_read_U03, left_index= True, right_index= True, how = 'outer', indicator = True)
aligned_dna_short_read_merged['_merge'].value_counts()

In [None]:
aligned_dna_short_read_merged[aligned_dna_short_read_merged['_merge'] == 'left_only']

In [None]:
aligned_dna_comp = compareDataFrames(aligned_dna_short_read_U02, aligned_dna_short_read_U03, 'U02', 'U03')
aligned_dna_comp.to_csv("aligned_dna_short_read_U02_U03_compare.csv")
diff_counts = diffSummary(aligned_dna_comp)
diff_counts_nonzero = diff_counts[diff_counts['count'] > 0]
diff_counts_nonzero

__Write out differences only__

In [None]:
# write out differences only
aligned_DNA_diff = compareDiffOnly(aligned_dna_short_read_U02, aligned_dna_short_read_U03, 'U02', 'U03')
# Write the index too: compareDiffOnly leaves the row labels in the index, and
# without them the file cannot show which entries differ.
aligned_DNA_diff.to_csv('aligned_DNA_data_table_comparison.csv', sep=",", index=True)

__Are there any duplicate file paths in the latest upload cycle?__

In [None]:
pd.DataFrame(aligned_DNA_diff['aligned_dna_short_read_file'].value_counts())

__GSS follow-up__

__Get GSS duplicates__

In [None]:
# after merging the df, get the overlapping samples that start with GSS
# NOTE(review): the filter below keeps ids that start with 'AnVIL_GREGoR', not 'GSS' -- confirm which prefix is intended
GSS_dups = aligned_dna_comp[aligned_dna_comp['aligned_dna_short_read_id'].str.startswith('AnVIL_GREGoR')]
#if (GSS_dups['aligned_dna_short_read_id'].duplicated().any() == True) and (GSS_dups['aligned_dna_short_read_file'].duplicated().any() != True) : 
#    print(GSS_dups['aligned_dna_short_read_id'])
GSS_dups

In [None]:
GSS_dups.to_csv('GSS_dups.csv')

In [None]:
aligned_dna_short_read_U03[aligned_dna_short_read_U03.index == 'GSS115014-01-010-SG-2']

In [None]:
pd.DataFrame(aligned_DNA_diff['md5sum'].value_counts())

In [None]:
aligned_dna_short_read_U02[aligned_dna_short_read_U02.index == 'GSS115014-01-010-SG-2']

In [None]:
BCM_U3 = readDatatable('aligned_dna_short_read', 'anvil-datastorage', 'AnVIL_GREGoR_BCM_U3_GRU')
BCM_U03 = readDatatable('aligned_dna_short_read', 'anvil-datastorage', 'AnVIL_GREGoR_BCM_U03_GRU')

In [None]:
BCM_U3.head()

In [None]:
aligned_dna_short_read_U03[aligned_dna_short_read_U03['aligned_dna_short_read_file'].duplicated()]

In [None]:
aligned_dna_short_read_U03[aligned_dna_short_read_U03['aligned_dna_short_read_index_file'].duplicated()]

In [None]:
aligned_rna_short_read_U03[aligned_rna_short_read_U03['aligned_rna_short_read_file'].duplicated()]

In [None]:
aligned_rna_short_read_U03[aligned_rna_short_read_U03['aligned_rna_short_read_index_file'].duplicated()]

__What are the differences in the aligned_dna_short_read_file column?__

In [None]:
aligned_dna_comp['aligned_dna_short_read_file'].value_counts()

In [None]:
!gsutil ls -lh 'gs://fc-secure-8ce36ffb-ad87-4942-abdc-2c0c6ce28483/BCM_2022-04-29T114240_ILWGS_GREGOR_BH10935-1_289890_1-FLOWCELL-HMNNKDSX3-HNF5NDSX3-HNGK2DSX3.hgv.cram'

In [None]:
!gsutil ls -lh 'gs://fc-secure-8ce36ffb-ad87-4942-abdc-2c0c6ce28483/BCM_2023-02-10T160013_ILWGS_BDCAPL_BH15575-1_285852_2-FLOWCELL-HLFMLDSX5-HLHGMDSX5.hgv.cram'

In [None]:
!gsutil ls -lh 'gs://fc-secure-8ce36ffb-ad87-4942-abdc-2c0c6ce28483/BCM_2023-01-24T080013_ILWGS_BDCAPL_BH15502-1_285811_2-FLOWCELL-HHNKFDSX5-HHNN3DSX5.hgv.cram'

In [None]:
deleted_crams.to_csv('bcm_deleted_aligned_sr_dna_rows.csv')

In [None]:
deleted_crams = BCM_merge[BCM_merge['_merge'] == 'left_only']

In [None]:
BCM_merge['_merge'].value_counts()

In [None]:
BCM_merge = BCM_U3.merge(BCM_U03, left_index = True, right_index = True, indicator = True, how = 'outer')

In [None]:
for i in BCM_U03['aligned_dna_short_read_file']: 
    print(i)
    break

In [None]:
participant_U03.head()

In [None]:
family_U03.head()

In [None]:
participant_family_merge = participant_U03.merge(family_U03, on = 'family_id', how = 'outer', indicator = True)

In [None]:
participant_family_merge['_merge'].value_counts()

In [None]:
participant_family_merge[participant_family_merge['_merge'] == 'right_only']

In [None]:
family_U03.index[family_U03.index.str.contains('BCM_Fam_BH14631')]

In [None]:
family_U02.index[family_U02.index.str.contains('BCM_Fam_BH14631')]

In [None]:
participant_U02.index[participant_U02['family_id'].str.contains('BCM_Fam_BH14631')]

In [None]:
participant_U03.index[participant_U03['family_id'].str.contains('BCM_Fam_BH14631')]