<a href="https://colab.research.google.com/github/VJalili/denovo/blob/main/denovo_filtering_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [158]:
# !gcloud init

In [159]:
import os
import pandas as pd
import gzip

from dataclasses import dataclass, field
from google.cloud import storage
from typing import Callable, Any, Dict, Tuple, List

In [160]:
storage_client = storage.Client(project="broad-dsde-methods")

# This method uses the Google cloud storage library which performs faster
# calling gsutil using `check_output`.
def download_blob(bucket_name, blob_name, local_filename):
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(blob_name)
  blob.download_to_filename(local_filename)

def copy_object_to_bucket(bucket_name, blob_name, source_filename):
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(blob_name)
  blob.upload_from_filename(source_filename)

In [161]:
working_dir = os.path.join(".", "denovo")
if not os.path.exists(working_dir):
  os.makedirs(working_dir)

In [162]:
def resolve_filename(filename):
  local_filename = os.path.join(working_dir, filename)
  if not os.path.isfile(local_filename):
    download_blob("broad-dsde-methods-vj", f"denovo-sv/{filename}", local_filename)
    print(f"Localized {filename}")
  else:
    print(f"File is already localized, filename: {local_filename}")
  return local_filename

denovo_svs_filename = resolve_filename("GMKF-OFC-denovo-SV-Master-20240314.txt")

File is already localized, filename: ./denovo/GMKF-OFC-denovo-SV-Master-20240314.txt


In [163]:
df = pd.read_csv(denovo_svs_filename, sep="\t")
df.shape

  df = pd.read_csv(denovo_svs_filename, sep="\t")


(20235, 145)

In [164]:
# Columns that need casting, use the above print for the list of columns.
cols = df.columns
print(f"{cols[106]}\t{cols[107]}\t{cols[114]}\t{cols[115]}\t{cols[126]}\t{cols[130]}")

RD_CN	RD_GQ	paternal_rdcn	maternal_rdcn	median_coverage	is_de_novo


In [165]:
columns_to_drop = [
    "Path", "sample", "samples", "CHR2", "END", "END2", "SOURCE",
    "name", "STRANDS",
    "PREDICTED_COPY_GAIN",
    "PREDICTED_BREAKEND_EXONIC",
    "PREDICTED_DUP_PARTIAL",
    "PREDICTED_INTRAGENIC_EXON_DUP",
    "PREDICTED_INV_SPAN",
    "PREDICTED_LOF",
    "PREDICTED_MSV_EXON_OVERLAP",
    "PREDICTED_NEAREST_TSS",
    "PREDICTED_PARTIAL_EXON_DUP",
    "PREDICTED_PROMOTER",
    "PREDICTED_TSS_DUP",
    "PREDICTED_UTR",
    "UNRESOLVED_TYPE"]
df = df.drop(columns=columns_to_drop)

In [166]:
df["RD_CN"] = pd.to_numeric(df["RD_CN"], errors="coerce")
df["RD_GQ"] = pd.to_numeric(df["RD_GQ"], errors="coerce")
df["paternal_rdcn"] = pd.to_numeric(df["paternal_rdcn"], errors="coerce")
df["maternal_rdcn"] = pd.to_numeric(df["maternal_rdcn"], errors="coerce")
df["median_coverage"] = pd.to_numeric(df["median_coverage"], errors="coerce")
df["is_de_novo"] = df["is_de_novo"].astype("bool")

In [167]:
dtypes = df.dtypes
for i in range(len(dtypes)):
  print(f"{i}\t{dtypes[i]}\t{df.columns[i]}")

0	object	chrom
1	int64	start
2	int64	end
3	object	svtype
4	int64	AC
5	object	ALGORITHMS
6	int64	AN
7	object	CPX_INTERVALS
8	object	CPX_TYPE
9	object	EVIDENCE
10	bool	PREDICTED_INTERGENIC
11	object	PREDICTED_INTRONIC
12	object	PREDICTED_NONCODING_BREAKPOINT
13	object	PREDICTED_NONCODING_SPAN
14	int64	SVLEN
15	object	SVTYPE
16	float64	AF
17	int64	N_BI_GENOS
18	int64	N_HOMREF
19	int64	N_HET
20	int64	N_HOMALT
21	float64	FREQ_HOMREF
22	float64	FREQ_HET
23	float64	FREQ_HOMALT
24	float64	CN_NUMBER
25	float64	CN_COUNT
26	float64	CN_FREQ
27	float64	CN_NONREF_COUNT
28	float64	CN_NONREF_FREQ
29	int64	MALE_AN
30	int64	MALE_AC
31	float64	MALE_AF
32	int64	MALE_N_BI_GENOS
33	int64	MALE_N_HOMREF
34	int64	MALE_N_HET
35	int64	MALE_N_HOMALT
36	float64	MALE_FREQ_HOMREF
37	float64	MALE_FREQ_HET
38	float64	MALE_FREQ_HOMALT
39	float64	MALE_CN_NUMBER
40	float64	MALE_CN_COUNT
41	float64	MALE_CN_FREQ
42	float64	MALE_CN_NONREF_COUNT
43	float64	MALE_CN_NONREF_FREQ
44	float64	MALE_N_HEMIREF
45	float64	MALE_N_HEMIA

In [168]:
# df["PREDICTED_COPY_GAIN"].dtype

In [169]:
df["UNRESOLVED_TYPE"].value_counts()

KeyError: 'UNRESOLVED_TYPE'

In [None]:
df["UNRESOLVED_TYPE"]

In [None]:
df["RD_CN"].max()

In [None]:
df["RD_CN"].hist()

In [None]:
rd_cn_df = df[df["RD_CN"] > 100]
rd_cn_df.shape

In [None]:
rd_cn_df