<a href="https://colab.research.google.com/github/VJalili/denovo/blob/main/denovo_filtering_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !gcloud init

In [2]:
# !gcloud auth application-default login

In [3]:
# !pip install pysam

In [4]:
import math
import os
import numpy as np
import pandas as pd
import gzip

from dataclasses import dataclass, field
from google.cloud import storage
from pathlib import Path
from typing import Callable, Any, Dict, Tuple, List
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.impute import SimpleImputer

from pysam import VariantFile
from tqdm import tqdm
import copy

In [5]:
storage_client = storage.Client(project="broad-dsde-methods")

# This method uses the Google cloud storage library which performs faster
# calling gsutil using `check_output`.
def download_blob(bucket_name, blob_name, local_filename):
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(blob_name)
  blob.download_to_filename(local_filename)

def copy_object_to_bucket(bucket_name, blob_name, source_filename):
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(blob_name)
  blob.upload_from_filename(source_filename)

In [6]:
working_dir = os.path.join(".", "denovo")
if not os.path.exists(working_dir):
  os.makedirs(working_dir)

In [7]:
skip_retrieving_missing_measurements = False

In [8]:
bucket_name = "broad-dsde-methods-vj"
base_blob_name = "denovo-sv"
def resolve_filename(filename):
  local_filename = os.path.join(working_dir, filename)
  if not os.path.isfile(local_filename):
    download_blob(bucket_name, f"{base_blob_name}/{filename}", local_filename)
    print(f"Localized {filename}")
  else:
    print(f"File is already localized, filename: {local_filename}")
  return local_filename

filename = "GMKF-OFC-GREGoR-denovo-SV-Master-20240918.txt"
denovo_svs_filename = resolve_filename(filename)

File is already localized, filename: ./denovo/GMKF-OFC-GREGoR-denovo-SV-Master-20240918.txt


In [9]:
vcf_files = ["mg_batch20.annotated.vcf.gz",
             "m4_batch03.annotated.vcf.gz",
             "m2_batch11.annotated.vcf.gz",
             "m2_batch10.annotated.vcf.gz",
             "mg_batch13.annotated.vcf.gz",
             "mg_batch03.annotated.vcf.gz",
             "mg_batch05.annotated.vcf.gz",
             "mg_batch17.annotated.vcf.gz",
             "mg_batch08.annotated.vcf.gz",
             "mg_batch07.annotated.vcf.gz",
             "second_run_cp_cohort.annotated.vcf.gz",
             "phase4_all_batches.annotated.vcf.gz"]
vcf_filenames = []
for vcf_file in vcf_files:
  vcf_filenames.append(resolve_filename(vcf_file))
  resolve_filename(vcf_file + ".tbi")

File is already localized, filename: ./denovo/mg_batch20.annotated.vcf.gz
File is already localized, filename: ./denovo/mg_batch20.annotated.vcf.gz.tbi
File is already localized, filename: ./denovo/m4_batch03.annotated.vcf.gz
File is already localized, filename: ./denovo/m4_batch03.annotated.vcf.gz.tbi
File is already localized, filename: ./denovo/m2_batch11.annotated.vcf.gz
File is already localized, filename: ./denovo/m2_batch11.annotated.vcf.gz.tbi
File is already localized, filename: ./denovo/m2_batch10.annotated.vcf.gz
File is already localized, filename: ./denovo/m2_batch10.annotated.vcf.gz.tbi
File is already localized, filename: ./denovo/mg_batch13.annotated.vcf.gz
File is already localized, filename: ./denovo/mg_batch13.annotated.vcf.gz.tbi
File is already localized, filename: ./denovo/mg_batch03.annotated.vcf.gz
File is already localized, filename: ./denovo/mg_batch03.annotated.vcf.gz.tbi
File is already localized, filename: ./denovo/mg_batch05.annotated.vcf.gz
File is alread

In [10]:
df = pd.read_csv(denovo_svs_filename, sep="\t")
df.shape

  df = pd.read_csv(denovo_svs_filename, sep="\t")


(29456, 160)

In [11]:
vcfs = []
for vcf_filename in vcf_filenames:
  vcfs.append(VariantFile(vcf_filename))

In [12]:
# cols = df.columns
# for c in cols:
#   print(c)

In [13]:
# Suggested by Alba
print(f"Before AF & any_error filter:\t{df.shape}")
df = df[df["AF"] <= 0.01]
df = df[df["any_error"] == False]
print(f"After AF & any_error filter:\t{df.shape}")

Before AF & any_error filter:	(29456, 160)
After AF & any_error filter:	(13773, 160)


In [14]:
columns_to_drop = [
    "Path",
    # "sample",
    "samples",
    "CHR2",
    "END",
    "END2",
    "SOURCE",
    "name",
    "STRANDS",
    "CPX_INTERVALS",
    "PREDICTED_INTERGENIC",
    "PREDICTED_INTRONIC",
    "PREDICTED_INV_SPAN",
    "PREDICTED_LOF",
    "PREDICTED_MSV_EXON_OVERLAP",
    "PREDICTED_NEAREST_TSS",
    "PREDICTED_PARTIAL_EXON_DUP",
    "PREDICTED_PROMOTER",
    "PREDICTED_TSS_DUP",
    "PREDICTED_UTR",
    "PREDICTED_NONCODING_SPAN",
    "SVTYPE",
    "UNRESOLVED_TYPE",
    "EVIDENCE_FIX",
    "gnomad_v2.1_sv_SVID",
    "is_small_cnv",
    "family_id",
    "name_famid",
    "chrom_type_sample",
    "chrom_type_family",
    "is_cpx",
    "is_duplicated",
    "batch",
    "cohort",
    "ID",
    "sample_name",
    "result",
    "Reason_unsure_follow_up",
    "Notes",
    "reviewers",
    "tiebreaker_reviewed",
    "num_reviewers",
    "tiebreaker_reviewer",
    "tiebreaker_result",
    "SV_plots_reviewed",
    "PREDICTED_COPY_GAIN",
    "PREDICTED_BREAKEND_EXONIC",
    "PREDICTED_DUP_PARTIAL",
    "PREDICTED_INTRAGENIC_EXON_DUP",
    "keep_gq",
    "filter_flag",
    "CN_NUMBER",
    "CN_COUNT",
    "CN_FREQ",
    "CN_NONREF_COUNT",
    "CN_NONREF_FREQ",
    "MALE_CN_NUMBER",
    "MALE_CN_COUNT",
    "MALE_CN_FREQ",
    "MALE_CN_NONREF_COUNT",
    "MALE_CN_NONREF_FREQ",
    "MALE_N_HEMIREF",
    "MALE_N_HEMIALT",
    "MALE_FREQ_HEMIREF",
    "MALE_FREQ_HEMIALT",
    "FEMALE_CN_NUMBER",
    "FEMALE_CN_COUNT",
    "FEMALE_CN_FREQ",
    "FEMALE_CN_NONREF_COUNT",
    "FEMALE_CN_NONREF_FREQ",
    "is_de_novo",
    "gnomAD_V2_SVID",
    "gnomAD_V2_AF",
    "gnomAD_V2_AC_AF",
    "gnomAD_V2_AN_AF",
    "gnomAD_V2_AFR_AF",
    "gnomAD_V2_AMR_AF",
    "gnomAD_V2_EAS_AF",
    "gnomAD_V2_EUR_AF",
    "any_error",
    "median_coverage"
]
df = df.drop(columns=columns_to_drop)

In [15]:
df["RD_CN"] = pd.to_numeric(df["RD_CN"], errors="coerce")
df["RD_GQ"] = pd.to_numeric(df["RD_GQ"], errors="coerce")
df["paternal_rdcn"] = pd.to_numeric(df["paternal_rdcn"], errors="coerce")
df["maternal_rdcn"] = pd.to_numeric(df["maternal_rdcn"], errors="coerce")

In [16]:
df["result_final"] = df["result_final"].str.strip().str.lower().map({"no_unsure": False, "yes": True})

In [17]:
df["result_final"].value_counts(dropna=False)

result_final
False    11603
True      2170
Name: count, dtype: int64

In [18]:
def get_features_missing_info_dataframe(info_dict):
  tuples = [(key, value[0], value[1] if len(value) == 2 else 0, (value[1] if len(value) == 2 else 0) - value[0]) for key, value in info_dict.items()]
  return pd.DataFrame(tuples, columns=["Feature", "Before", "After", "Diff"])

In [19]:
# Fetch data for missing site-level information from the VCF files.
features_missing_info = {}
for column in df.columns:
  nan_count = df[column].isna().sum()
  if nan_count > 0:
    features_missing_info[column] = ([nan_count])

missing_features_df = get_features_missing_info_dataframe(features_missing_info)
# df = df.drop(columns=features_missing_info)

In [20]:
# df["svtype"].value_counts(dropna=False)

In [21]:
# This piece is highly inefficient.

def update_retrieving_missing_info(row):
  start = row["start"]
  stop = row["end"]
  sample_id = row["sample"]

  cols_with_missing_features = row[row.isna()].index.tolist()
  if len(cols_with_missing_features) == 0:
    return row

  d=False

  for vcf in vcfs:
    samples_in_vcf = list(vcf.header.samples)  # need to use this cos `sample_id in variant.samples` will through key error if sample does not exist in the vcf.
    iterator = vcf.fetch(row["chrom"], row["start"], row["end"])
    for variant in iterator:
      if variant.start == start and variant.stop == stop:
        for c in cols_with_missing_features:
          if variant.info is not None and c in variant.info and not pd.isna(variant.info[c]):
            if row[c] is None or pd.isna(row[c]):
              row[c] = variant.info[c]
          elif sample_id in samples_in_vcf and c in variant.samples[sample_id]:
            v = variant.samples[sample_id][c]
            if v is not None and not pd.isna(v):
              if row[c] is None or pd.isna(row[c]):
                row[c] = v
        break
  return row

#df = df.apply(update_retrieving_missing_info, axis=1)
new_rows = []
if not skip_retrieving_missing_measurements:
  for _, row in tqdm(df.iterrows(), total=df.shape[0]):
      new_row = update_retrieving_missing_info(row)
      new_rows.append(new_row)
  df = pd.DataFrame(new_rows)

100%|██████████| 13773/13773 [38:24<00:00,  5.98it/s]


In [22]:
df["gnomad_v2.1_sv_AF"] = df["gnomad_v2.1_sv_AF"].fillna(0)
df["gnomad_v2.1_sv_AFR_AF"] = df["gnomad_v2.1_sv_AFR_AF"].fillna(0)
df["gnomad_v2.1_sv_AMR_AF"] = df["gnomad_v2.1_sv_AMR_AF"].fillna(0)
df["gnomad_v2.1_sv_EAS_AF"] = df["gnomad_v2.1_sv_EAS_AF"].fillna(0)
df["gnomad_v2.1_sv_EUR_AF"] = df["gnomad_v2.1_sv_EUR_AF"].fillna(0)
df["BOTHSIDES_SUPPORT"] = df["BOTHSIDES_SUPPORT"].fillna(False)
df["HIGH_SR_BACKGROUND"] = df["HIGH_SR_BACKGROUND"].fillna(False)
df["PESR_GT_OVERDISPERSION"] = df["PESR_GT_OVERDISPERSION"].fillna(False)

  df["BOTHSIDES_SUPPORT"] = df["BOTHSIDES_SUPPORT"].fillna(False)
  df["HIGH_SR_BACKGROUND"] = df["HIGH_SR_BACKGROUND"].fillna(False)
  df["PESR_GT_OVERDISPERSION"] = df["PESR_GT_OVERDISPERSION"].fillna(False)


In [23]:
features_missing_info_after = copy.deepcopy(features_missing_info)
for column in df.columns:
  nan_count = df[column].isna().sum()
  if nan_count > 0:
    features_missing_info_after[column].append(nan_count)

missing_features_df = get_features_missing_info_dataframe(features_missing_info_after)

In [24]:
missing_features_df

Unnamed: 0,Feature,Before,After,Diff
0,CPX_TYPE,12804,12804,0
1,PREDICTED_NONCODING_BREAKPOINT,2453,2453,0
2,gnomad_v2.1_sv_AF,11866,0,-11866
3,gnomad_v2.1_sv_AFR_AF,11866,0,-11866
4,gnomad_v2.1_sv_AMR_AF,11866,0,-11866
5,gnomad_v2.1_sv_EAS_AF,11866,0,-11866
6,gnomad_v2.1_sv_EUR_AF,11866,0,-11866
7,RD_CN,4496,4496,0
8,RD_GQ,4496,4496,0
9,PE_GQ,3621,3621,0


In [25]:
def get_obj_type_columns(dataframe):
  obj_cols = []
  dtypes = dataframe.dtypes
  for column, dtype in dtypes.items():
    # print(f"{dtype}\t{column}")
    if dtype == object:
      obj_cols.append(column)
  return obj_cols

In [26]:
object_type_columns = get_obj_type_columns(df)

In [27]:
def get_numerical_fixed_columns(dataframe):
  dataframe["PE_GQ"] = pd.to_numeric(dataframe["PE_GQ"], errors="coerce")
  dataframe["PE_GT"] = pd.to_numeric(dataframe["PE_GT"], errors="coerce")
  dataframe["SR_GQ"] = pd.to_numeric(dataframe["SR_GQ"], errors="coerce")
  dataframe["SR_GT"] = pd.to_numeric(dataframe["SR_GT"], errors="coerce")
  dataframe["paternal_pegq"] = pd.to_numeric(dataframe["paternal_pegq"], errors="coerce")
  dataframe["maternal_pegq"] = pd.to_numeric(dataframe["maternal_pegq"], errors="coerce")
  dataframe["paternal_srgq"] = pd.to_numeric(dataframe["paternal_srgq"], errors="coerce")
  dataframe["maternal_srgq"] = pd.to_numeric(dataframe["maternal_srgq"], errors="coerce")
  return dataframe

df = get_numerical_fixed_columns(df)

In [28]:
df["result_final"] = df["result_final"].replace({"yes": 1, "no_unsure": 0})

In [29]:
def get_nan_resolved_columns(dataframe):
  dataframe["gnomad_v2.1_sv_AF"] = dataframe["gnomad_v2.1_sv_AF"].fillna(0)
  dataframe["gnomad_v2.1_sv_AFR_AF"] = dataframe["gnomad_v2.1_sv_AFR_AF"].fillna(0)
  dataframe["gnomad_v2.1_sv_AMR_AF"] = dataframe["gnomad_v2.1_sv_AMR_AF"].fillna(0)
  dataframe["gnomad_v2.1_sv_EAS_AF"] = dataframe["gnomad_v2.1_sv_EAS_AF"].fillna(0)
  dataframe["gnomad_v2.1_sv_EUR_AF"] = dataframe["gnomad_v2.1_sv_EUR_AF"].fillna(0)
  return dataframe

df = get_nan_resolved_columns(df)

In [30]:
def impute_column(dataframe, column):
  imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
  dataframe[column] = imputer.fit_transform(dataframe[[column]])
  return dataframe

# should this for CNV only?
df = impute_column(df, "paternal_pegq")
df = impute_column(df, "maternal_pegq")
df = impute_column(df, "paternal_srgq")
df = impute_column(df, "maternal_srgq")

In [31]:
def get_cols_containing_nan_values(dataframe, exclude_cols = ["result_final"]):
  cols_with_nan = []
  for column in dataframe.columns:
    nan_count = dataframe[column].isna().sum()
    if nan_count > 0 and column not in exclude_cols:
      cols_with_nan.append([column, nan_count])
  return cols_with_nan

In [32]:
def print_missing_cols_summary(dataframe):
  cols_with_nan = get_cols_containing_nan_values(dataframe)
  if len(cols_with_nan) == 0:
    print("No columns with missing values!")
  else:
    for x in cols_with_nan:
      print(x)

In [33]:
print_missing_cols_summary(df)
print(f"Columns with obj type: {get_obj_type_columns(df)}")

['CPX_TYPE', 12804]
['PREDICTED_NONCODING_BREAKPOINT', 2453]
['RD_CN', 4496]
['RD_GQ', 4496]
['PE_GQ', 3953]
['PE_GT', 3953]
['SR_GQ', 3953]
['SR_GT', 3953]
['paternal_rdcn', 4496]
['maternal_rdcn', 4496]
Columns with obj type: ['chrom', 'svtype', 'sample', 'ALGORITHMS', 'CPX_TYPE', 'EVIDENCE', 'PREDICTED_NONCODING_BREAKPOINT', 'FILTER', 'GT', 'EV']


In [34]:
cnv_only_condition = (df["svtype"] == "DEL") | (df["svtype"] == "DUP")
depth_only_condition = (df["EVIDENCE"] == "RD") | (df[["PE_GQ", "PE_GT", "SR_GQ", "SR_GT"]].isna().any(axis=1))
cpx_condition = df["svtype"] == "CPX"
cnv_only_df = df[(cnv_only_condition) & ~(depth_only_condition)].copy()
depth_only_cnv_df = df[(cnv_only_condition) & (depth_only_condition)].copy()
non_cnv_only_df = df[(~cnv_only_condition) & (~cpx_condition)].copy()
cpx_only_df = df[(~cnv_only_condition) & cpx_condition].copy()

Here we first split the dataframe to three dataframes containing,
cnv-only, depth-only-cnv-only, and non-cnv variants.
Then perform all the formatting operations outlined in the following on
each of the dataframes separately. An alternative to this approach is
to perform all the required formatting on the input dataframe,
and only split the variants in the three groups after all
the formatting is complete.

However, the drawback of the latter is that dataframes may end up having
categories/features that does not belong to that group, whose value is
the same across all the variants in that group.
Developed models may converge to ignore such features,
but it is generally a better practice to only include the features
that are expected to have a functional impact on the outcome.

Hence, we first split the variants in three groups, then
format each group/dataframe separately.

In [35]:
print(f"All input:                    {df.shape}")
print(f"Non-CNVs:                     {non_cnv_only_df.shape}")
print(f"Only CNVs without depth-only: {cnv_only_df.shape}")
print(f"Only CNVs only    depth-only: {depth_only_cnv_df.shape}")
print(f"Only CPX:                     {cpx_only_df.shape}")

assert cnv_only_df.shape[0] + depth_only_cnv_df.shape[0] + non_cnv_only_df.shape[0] + cpx_only_df.shape[0] == df.shape[0]
assert cnv_only_df.shape[1] == depth_only_cnv_df.shape[1] == non_cnv_only_df.shape[1] == df.shape[1] == cpx_only_df.shape[1]

All input:                    (13773, 80)
Non-CNVs:                     (3684, 80)
Only CNVs without depth-only: (5202, 80)
Only CNVs only    depth-only: (3924, 80)
Only CPX:                     (963, 80)


In [36]:
def get_multi_class_binarized(dataframe, column_label, delimiter=",", new_column_label_prefix=None):
  if new_column_label_prefix is None:
    new_column_label_prefix = column_label

  dataframe[column_label] = dataframe[column_label].apply(
      lambda x: [] if pd.isna(x) else x.split(delimiter)).to_list()

  binarizer = MultiLabelBinarizer()
  binarized = binarizer.fit_transform(dataframe[column_label])
  binarized_df = pd.DataFrame(
      binarized,
      columns=[f"{new_column_label_prefix}_{class_label}" for class_label in binarizer.classes_])

  dataframe = dataframe.drop([column_label], axis=1)
  dataframe.reset_index(drop=True, inplace=True)
  return pd.concat([dataframe, binarized_df], axis=1)

# Convert columns with categorical values.
def get_multi_class_binarized_columns(dataframe):
  dataframe = get_multi_class_binarized(dataframe, "svtype", ":")
  dataframe = get_multi_class_binarized(dataframe, "ALGORITHMS")
  dataframe = get_multi_class_binarized(dataframe, "CPX_TYPE")
  dataframe = get_multi_class_binarized(dataframe, "PREDICTED_NONCODING_BREAKPOINT")
  dataframe = get_multi_class_binarized(dataframe, "FILTER")
  dataframe = get_multi_class_binarized(dataframe, "GT")
  dataframe = get_multi_class_binarized(dataframe, "EVIDENCE")
  dataframe = get_multi_class_binarized(dataframe, "EV")
  return dataframe

cnv_only_df = get_multi_class_binarized_columns(cnv_only_df)
non_cnv_only_df = get_multi_class_binarized_columns(non_cnv_only_df)
depth_only_cnv_df = get_multi_class_binarized_columns(depth_only_cnv_df)
cpx_only_df = get_multi_class_binarized_columns(cpx_only_df)

In [37]:
cpx_only_df = impute_column(cpx_only_df, "PE_GQ")
cpx_only_df = impute_column(cpx_only_df, "PE_GT")
cpx_only_df = impute_column(cpx_only_df, "SR_GQ")
cpx_only_df = impute_column(cpx_only_df, "SR_GT")
cpx_only_df = impute_column(cpx_only_df, "paternal_pegq")
cpx_only_df = impute_column(cpx_only_df, "maternal_pegq")
cpx_only_df = impute_column(cpx_only_df, "paternal_srgq")
cpx_only_df = impute_column(cpx_only_df, "maternal_srgq")

In [38]:
non_cnv_only_df = non_cnv_only_df.drop(columns=["RD_CN", "RD_GQ", "paternal_rdcn", "maternal_rdcn"])
cpx_only_df = cpx_only_df.drop(columns=["RD_CN", "RD_GQ", "paternal_rdcn", "maternal_rdcn"])
depth_only_cnv_df = depth_only_cnv_df.drop(columns=["PE_GQ", "PE_GT", "SR_GQ", "SR_GT", "paternal_pegq", "maternal_pegq", "paternal_srgq", "maternal_srgq"])

In [39]:
# cnv_only_df = get_numerical_fixed_columns(cnv_only_df)
# depth_only_cnv_df = get_numerical_fixed_columns(depth_only_cnv_df)
# non_cnv_only_df = get_numerical_fixed_columns(non_cnv_only_df)
# cpx_only_df = get_numerical_fixed_columns(cpx_only_df)

In [40]:
# def get_start_end_replaced_with_length(dataframe):
#   dataframe.insert(1, "length", dataframe["end"] - dataframe["start"])
#   dataframe = dataframe.drop(["start", "end"], axis=1)
#   return dataframe

# cnv_only_df = get_start_end_replaced_with_length(cnv_only_df)
# depth_only_cnv_df = get_start_end_replaced_with_length(depth_only_cnv_df)
# non_cnv_only_df = get_start_end_replaced_with_length(non_cnv_only_df)
# cpx_only_df = get_start_end_replaced_with_length(cpx_only_df)

In [41]:
# As per discussion with Harrison, we decided to drop all the columns
# that still contain NaN after all the previous adjustments.
def drop_nan_cols(dataframe):
  cols_with_nan = get_cols_containing_nan_values(dataframe)
  cols = [x[0] for x in cols_with_nan]

  rows_with_nan = dataframe[dataframe[cols].isna().any(axis=1)]

  dataframe = dataframe.drop(columns=cols)

  print("Dropping the following columns:")
  for x in cols_with_nan:
    print(x)
  return dataframe, rows_with_nan

In [42]:
# print("Dropping cols from CNV only:")
# cnv_only_df, cnv_only_dropped_rows_df = drop_nan_cols(cnv_only_df)

# print("\nDropping cols from depth-only CNV only:")
# depth_only_cnv_df, depth_only_cnv_dropped_rows_df = drop_nan_cols(depth_only_cnv_df)

# print("\nDropping cols from non-CNV only:")
# non_cnv_only_df, non_cnv_only_dropped_rows_df = drop_nan_cols(non_cnv_only_df)

# print("\nDropping cols from CPX only:")
# cpx_only_df, cpx_only_dropped_rows_df = drop_nan_cols(cpx_only_df)

In [43]:
print_missing_cols_summary(cnv_only_df)
print(f"Columns with obj type: {get_obj_type_columns(cnv_only_df)}")

No columns with missing values!
Columns with obj type: ['chrom', 'sample']


In [44]:
print_missing_cols_summary(depth_only_cnv_df)
print(f"Columns with obj type: {get_obj_type_columns(depth_only_cnv_df)}")

No columns with missing values!
Columns with obj type: ['chrom', 'sample']


In [45]:
print_missing_cols_summary(non_cnv_only_df)
print(f"Columns with obj type: {get_obj_type_columns(non_cnv_only_df)}")

No columns with missing values!
Columns with obj type: ['chrom', 'sample']


In [46]:
print_missing_cols_summary(cpx_only_df)
print(f"Columns with obj type: {get_obj_type_columns(cpx_only_df)}")

No columns with missing values!
Columns with obj type: ['chrom', 'sample']


In [47]:
# _ = to_csv(non_cnv_only_dropped_rows_df, "non_cnv_only_dropped_rows", False)
# _ = to_csv(cnv_only_dropped_rows_df, "cnv_only_dropped_rows", False)
# _ = to_csv(depth_only_cnv_dropped_rows_df, "depth_only_cnv_only_dropped_rows", False)
# _ = to_csv(cpx_only_dropped_rows_df, "cpx_only_dropped_rows", False)

In [48]:
print(f"All input:                    {df.shape}")
print(f"Non-CNVs:                     {non_cnv_only_df.shape}")
print(f"Only CNVs without depth-only: {cnv_only_df.shape}")
print(f"Only CNVs only    depth-only: {depth_only_cnv_df.shape}")
print(f"CPX only:                     {cpx_only_df.shape}")

assert cnv_only_df.shape[0] + depth_only_cnv_df.shape[0] + non_cnv_only_df.shape[0] + cpx_only_df.shape[0] == df.shape[0]
# assert cnv_only_df.shape[1] == depth_only_cnv_df.shape[1] == non_cnv_only_df.shape[1] == df.shape[1]

All input:                    (13773, 80)
Non-CNVs:                     (3684, 93)
Only CNVs without depth-only: (5202, 95)
Only CNVs only    depth-only: (3924, 86)
CPX only:                     (963, 100)


In [49]:
def to_csv(dataframe, filename_postfix, push_to_bucket=True):
  output_filename = f"{Path(filename).stem}_{filename_postfix}.csv"
  output_filename_local = os.path.join(working_dir, output_filename)
  dataframe.to_csv(output_filename_local, index=False, sep=",")

  if push_to_bucket:
    copy_object_to_bucket(bucket_name, f"{base_blob_name}/{output_filename}", output_filename_local)

  return output_filename_local

In [50]:
non_cnv_only_filename = to_csv(non_cnv_only_df, "non_cnv_only")
cnv_only_filename = to_csv(cnv_only_df, "cnv_only")
depth_only_cnv_filename = to_csv(depth_only_cnv_df, "depth_only_cnv_only")
cpx_only_filename = to_csv(cpx_only_df, "cpx_only")