<a href="https://colab.research.google.com/github/VJalili/denovo/blob/main/denovo_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !gcloud init

In [16]:
import os
import pandas as pd
import gzip

In [2]:
from google.cloud import storage
storage_client = storage.Client(project="broad-dsde-methods")

# This method uses the Google cloud storage library which performs faster
# calling gsutil using `check_output`.
def download_blob(bucket_name, blob_name, local_filename):
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(blob_name)
  blob.download_to_filename(local_filename)

def copy_object_to_bucket(bucket_name, blob_name, source_filename):
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(blob_name)
  blob.upload_from_filename(source_filename)

In [8]:
working_dir = os.path.join(".", "denovo")
if not os.path.exists(working_dir):
  os.makedirs(working_dir)

In [14]:
def resolve_filename(filename):
  local_filename = os.path.join(working_dir, filename)
  if not os.path.isfile(local_filename):
    download_blob("broad-dsde-methods-vj", f"denovo-sv/{filename}", local_filename)
    print(f"Localized {filename}")
  else:
    print(f"File is already localized, filename: {local_filename}")
  return local_filename


pre_denovo_vcf_filename = resolve_filename("OFC.shard_000000.vcf.gz")
pre_denovo_bed_filename = resolve_filename("OFC.shard_000000.bed.gz")
post_denovo_annotations_filename = resolve_filename("OFC.shard_000000.annotation.bed.gz")
post_denovo_denovo_only_vars_filename = resolve_filename("OFC.shard_000000.denovo.bed.gz")
flipbook_input_txt_filename = resolve_filename("flipbook-input-example.txt")
flipbook_output_tsv_filename = resolve_filename("flipbook-output-example.tsv")

File is already localized, filename: ./denovo/OFC.shard_000000.vcf.gz
File is already localized, filename: ./denovo/OFC.shard_000000.bed.gz
File is already localized, filename: ./denovo/OFC.shard_000000.annotation.bed.gz
File is already localized, filename: ./denovo/OFC.shard_000000.denovo.bed.gz
File is already localized, filename: ./denovo/flipbook-input-example.txt
File is already localized, filename: ./denovo/flipbook-output-example.tsv


In [23]:
with gzip.open(pre_denovo_bed_filename, "rb") as f:
  pre_denovo_df = pd.read_csv(f, sep="\t")
pre_denovo_df.shape

(4000, 90)

In [24]:
with gzip.open(post_denovo_annotations_filename, "rb") as f:
  post_denovo_annotated_df = pd.read_csv(f, sep="\t")
post_denovo_annotated_df.shape

  post_denovo_annotated_df = pd.read_csv(f, sep="\t")


(134817, 129)

In [25]:
with gzip.open(post_denovo_denovo_only_vars_filename, "rb") as f:
  post_denovo_filtered_df = pd.read_csv(f, sep="\t")
post_denovo_filtered_df.shape

(271, 129)

In [28]:
added_column_names = set(post_denovo_filtered_df.columns) - set(pre_denovo_df.columns)
print(len(added_column_names))
print(added_column_names)

40
{'GT', 'paternal_srgq', 'PE_GT', 'PE_GQ', 'in_gd', 'maternal_srgq', 'keep_gq', 'maternal_gq', 'is_cpx', 'num_parents_family', 'chrom', 'EVIDENCE_FIX', 'paternal_rdcn', 'GQ', 'RD_GQ', 'chrom_type_family', 'name_famid', 'is_depth_only', 'RD_CN', 'paternal_pegq', 'maternal_pegq', 'filter_flag', 'AF_parents', 'is_small_cnv', 'EV', 'paternal_gq', 'SR_GT', 'family_id', 'chrom_type_sample', 'num_children', 'maternal_rdcn', 'SR_GQ', 'overlap_parent', 'contains_RD', 'is_de_novo', 'is_depth_only_small', 'is_duplicated', 'sample', 'num_parents', 'is_large_cnv'}


In [30]:
flipbook_input_df = pd.read_csv(flipbook_input_txt_filename, sep="\t")
flipbook_input_df.shape

(58, 145)

In [31]:
flipbook_output_df = pd.read_csv(flipbook_output_tsv_filename, sep="\t")
flipbook_output_df.shape

(2134, 4)

In [34]:
flipbook_input_df.columns

Index(['Path', 'chrom', 'start', 'end', 'name', 'svtype', 'sample', 'samples',
       'AC', 'ALGORITHMS',
       ...
       'result', 'Reason_unsure_follow_up', 'Notes', 'reviewers',
       'tiebreaker_reviewed', 'num_reviewers', 'tiebreaker_reviewer',
       'tiebreaker_result', 'SV_plots_reviewed', 'result_final'],
      dtype='object', length=145)


In [35]:
x = set(flipbook_input_df.columns) - set(post_denovo_filtered_df.columns)
print(len(x))

16


In [36]:
print(x)

{'tiebreaker_reviewed', 'Notes', 'Reason_unsure_follow_up', 'reviewers', 'batch', 'num_reviewers', 'sample_name', 'tiebreaker_reviewer', 'tiebreaker_result', 'ID', 'SV_plots_reviewed', 'result_final', 'result', 'cohort', 'Path', 'median_coverage'}


In [37]:
flipbook_output_df.columns

Index(['Path', 'Is de novo', 'Reason Unsure - Follow up', 'Notes'], dtype='object')

In [38]:
print(set(post_denovo_annotated_df.columns))

{'end', 'CHR2', 'MALE_FREQ_HEMIALT', 'PE_GT', 'CN_FREQ', 'EVIDENCE', 'PREDICTED_MSV_EXON_OVERLAP', 'maternal_gq', 'MALE_FREQ_HEMIREF', 'CPX_INTERVALS', 'MALE_CN_FREQ', 'FEMALE_AF', 'paternal_rdcn', 'GQ', 'MALE_FREQ_HET', 'name_famid', 'RD_GQ', 'PREDICTED_TSS_DUP', 'FREQ_HOMREF', 'FEMALE_N_HOMALT', 'FEMALE_CN_NUMBER', 'PREDICTED_BREAKEND_EXONIC', 'is_small_cnv', 'SR_GT', 'MALE_CN_NONREF_COUNT', 'overlap_parent', 'FEMALE_CN_NONREF_COUNT', 'PREDICTED_NEAREST_TSS', 'PREDICTED_PROMOTER', 'MALE_CN_NONREF_FREQ', 'FEMALE_FREQ_HOMALT', 'is_large_cnv', 'GT', 'MALE_N_HOMALT', 'FEMALE_CN_COUNT', 'PREDICTED_INV_SPAN', 'UNRESOLVED_TYPE', 'MALE_CN_COUNT', 'PE_GQ', 'in_gd', 'PREDICTED_DUP_PARTIAL', 'gnomad_v2.1_sv_EUR_AF', 'END2', 'N_HET', 'PREDICTED_INTRAGENIC_EXON_DUP', 'N_HOMALT', 'FEMALE_N_HOMREF', 'num_parents_family', 'CN_COUNT', 'MALE_CN_NUMBER', 'MALE_AC', 'AF', 'chrom_type_family', 'RD_CN', 'maternal_pegq', 'FEMALE_N_BI_GENOS', 'gnomad_v2.1_sv_AFR_AF', 'AN', 'filter_flag', 'AF_parents', 'PRED

In [43]:
# Q: how do you match between the input and output of flipbook? the following should not have 0 rows, instead the number of the rows should be equal to the flipbook input and output.
labeled_df = pd.merge(
    flipbook_input_df, flipbook_output_df, on="Path",
    how="inner")  # Will contain rows that have matching values in both dataframes.
labeled_df.shape

(0, 148)

In [44]:
labeled_df.columns

Index(['Path', 'chrom', 'start', 'end', 'name', 'svtype', 'sample', 'samples',
       'AC', 'ALGORITHMS',
       ...
       'reviewers', 'tiebreaker_reviewed', 'num_reviewers',
       'tiebreaker_reviewer', 'tiebreaker_result', 'SV_plots_reviewed',
       'result_final', 'Is de novo', 'Reason Unsure - Follow up', 'Notes_y'],
      dtype='object', length=148)

In [47]:
final_df = pd.merge(post_denovo_annotated_df, labeled_df, on=["chrom", "start", "end"])
print(final_df.shape)

(0, 274)
