In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
pd.options.display.max_columns = None
%matplotlib inline
pd.options.display.max_rows = 1000

from venn import venn

In [51]:
# TelNet list of telomere-related genes.
tel_genes_list = pd.read_csv(
    filepath_or_buffer="../immortal_simon/annovar/TelNet_2093GeneList.csv"
)

In [244]:
# Show the TelNet entry whose gene symbol is exactly "TERC".
tel_genes_list.loc[tel_genes_list["Gene symbol"].eq("TERC")]

Unnamed: 0,Gene symbol,Gene ID,Specie,HGNC ID,Ensemble ID,Refseq ID,Uniprot ID,Cell functunction,TM functions,Predictied TMM,Predictied TMM_2,TMM significance,TelNet Score
2076,TERC,7012,H. sapiens (human),NR_001566;,ENSG00000277925,HGNC:11727,n.a.,Telomere biology,Telomerase component; Telomere length; Telomer...,ALT: ambiguous; Telomerase-mediated: enhancing,Telomerase-mediated,validated,10


In [53]:
# Top telomere-related genes: keep entries with TelNet Score >= 3, ordered by gene symbol.
# (Author's note: 267 genes pass this cut-off; the maximum score is 10.)
top_tel_genes_list = (
    tel_genes_list
    .loc[tel_genes_list["TelNet Score"].ge(3)]
    .sort_values(by=["Gene symbol"])
)

In [259]:
# Tab-separated table of annotated MuTect2 / GRIDSS calls.
# Author's notes on the contents:
#   - it contains all somatic mutations and SVs, as well as the germline exonic SVs
#     of the JFCF-6 mortal cell;
#   - only the SVs were filtered with Bedtools to focus on exonic data; at this step
#     all feature_fusion SVs were kept;
#   - it still contains all types of somatic short mutations (5'/3' UTR, down/up-stream, ...);
#   - "exonic" in ANNOVAR just means the coding region of an exon (excluding UTR5/3);
#   - "ncRNA_exonic" in ANNOVAR indicates the "coding" region of a ncRNA exon (excluding UTR5/3);
#   - notably, both protein and ncRNA exonic short mutations and SVs are present.
mg_df = pd.read_csv(
    "../immortal_simon/annovar/annovar_mutect_gridss.tsv",
    sep="\t",
)

In [260]:
# (rows, columns) of the loaded variant table.
mg_df.shape

(50495, 70)

In [289]:
# Look for TERT among the variant keys; the empty result means no TERT mutation.
# Note: this is a substring match on the "Key" column, not an exact match on "Gene".
mg_df.loc[mg_df["Key"].str.contains("TERT")]

Unnamed: 0,Key,Name,Source,dbSNP,Type,Chr1,Pos1,Chr2,Pos2,Qual1,Qual2,AF1,AF2,Read1,Read2,Gene,Consequence,Impact,Ref1,Alt1,Ref2,Alt2,exon_count,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,avsnp147,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,fathmm-MKL_coding_pred,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds


In [None]:
# Distinct values of the "Type" column.
mg_df["Type"].unique()
# ncRNA_exonic indicates the exon region of the ncRNA genes
# ncRNA_splicing indicates the splicing region of the ncRNA genes

In [345]:
# Count the distinct "Gene" values among UTR3 rows that do not belong to JFCF_6.
# Author's counts: 871 genes with a somatic 3'UTR mutation (this cell) and 125 genes
# with a somatic 5'UTR mutation (presumably from re-running with the 5'UTR Type value).
mg_df.loc[
    mg_df["Type"].eq("UTR3") & mg_df["Name"].ne("JFCF_6"),
    "Gene",
].unique().shape

(871,)

In [118]:
# Distinct values of the "ExonicFunc.refGene" column.
mg_df["ExonicFunc.refGene"].unique()
# "Type" and "ExonicFunc.refGene": these two columns indicate mutation types from GRIDSS and MuTect2 separately

array([nan, '.', 'nonsynonymous SNV', 'synonymous SNV', 'stopgain',
       'frameshift deletion', 'nonframeshift substitution', 'unknown',
       'nonframeshift insertion', 'nonframeshift deletion',
       'frameshift insertion', 'stoploss'], dtype=object)

In [119]:
# Distinct values of the "Impact" column.
mg_df["Impact"].unique()

array(['HIGH', 'LOW', nan], dtype=object)

In [237]:
# Variant categories kept for the somatic analysis: the SV classes plus
# exonic/splicing short mutations.
# NOTE: the ncRNA_exonic and ncRNA_splicing categories ARE part of this list,
# so ncRNA variants are retained by any filter that uses Type_list.
Type_list = [
    'Translocations',
    'Insertion',
    'Deletion',
    'Duplication',
    'Inversion',
    'exonic',
    'splicing',
    'ncRNA_exonic',
    'ncRNA_splicing',
]

In [238]:
# Somatic working set: exonic/splicing short mutations plus exonic SVs.
# Rows are dropped when they are:
#   - JFCF-6 germline SVs (Name == "JFCF_6"),
#   - duplicated somatic results (Key contains "Immortal"),
#   - of a Type outside Type_list (non-exonic/splicing items),
#   - feature_fusion SVs,
#   - synonymous SNVs.
# What remains is exonic/splicing somatic mutations and high-impact somatic SVs;
# notably it still holds both protein and ncRNA exonic short mutations and SVs.
sm_mg_df = mg_df.loc[
    mg_df['Name'].ne("JFCF_6")
    & ~mg_df['Key'].str.contains("Immortal")
    & mg_df['Type'].isin(Type_list)
    & mg_df['Consequence'].ne("feature_fusion")
    & mg_df['ExonicFunc.refGene'].ne("synonymous SNV")
]
sm_mg_df.shape

(2094, 70)

In [263]:
# Somatic TCOF1 SVs and short mutations: same sample/duplicate/feature_fusion
# exclusions as the somatic set, but without the Type restriction; "Gene" is
# matched by substring.
mg_df.loc[
    mg_df['Name'].ne("JFCF_6")
    & ~mg_df['Key'].str.contains("Immortal")
    & mg_df['Consequence'].ne("feature_fusion")
    & mg_df['Gene'].str.contains("TCOF1")
]

Unnamed: 0,Key,Name,Source,dbSNP,Type,Chr1,Pos1,Chr2,Pos2,Qual1,Qual2,AF1,AF2,Read1,Read2,Gene,Consequence,Impact,Ref1,Alt1,Ref2,Alt2,exon_count,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,avsnp147,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,fathmm-MKL_coding_pred,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds
21939,SV_GRIDSS_HIGH_TCOF1_Somatic_chr5_149759805_14...,JFCF_6_T_1_M,GRIDSS,False,Duplication,chr5,149759805-149759808,chr5,149945464-149945467,NotSplit:74.22;Split:108.66,NotSplit:74.22;Split:108.66,,,NotSplit:4;Split:4,NotSplit:4;Split:4,TCOF1,transcript_ablation,HIGH,A,]CHR5:149945466]A,G,G[CHR5:149759807[,150.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [249]:
# Somatic variants whose "Gene" label contains "ATRX"; being a substring match,
# this also returns compound labels such as "ATRX&SSPO".
sm_mg_df.loc[sm_mg_df['Gene'].str.contains("ATRX")]

Unnamed: 0,Key,Name,Source,dbSNP,Type,Chr1,Pos1,Chr2,Pos2,Qual1,Qual2,AF1,AF2,Read1,Read2,Gene,Consequence,Impact,Ref1,Alt1,Ref2,Alt2,exon_count,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,avsnp147,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,fathmm-MKL_coding_pred,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds
12951,SV_GRIDSS_HIGH_ATRX&KRBA1_Somatic_chr7_1494188...,JFCF_6_T_5K,GRIDSS,False,Translocations,chr7,149418856-149418857,chrX,76917250-76917251,NotSplit:0;Split:92.74,NotSplit:0;Split:92.74,,,NotSplit:0;Split:3,NotSplit:0;Split:3,ATRX&KRBA1,bidirectional_gene_fusion,HIGH,G,]CHRX:76917251]GAAAGG,A,AGAAAG[CHR7:149418857[,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
12953,SV_GRIDSS_HIGH_ATRX&SSPO_Somatic_chr7_14947961...,JFCF_6_T_5K,GRIDSS,False,Translocations,chr7,149479619-149479624,chrX,76783762-76783767,NotSplit:91.63;Split:132.7,NotSplit:91.63;Split:132.7,,,NotSplit:5;Split:5,NotSplit:5;Split:5,ATRX&SSPO,gene_fusion&frameshift_variant,HIGH,A,A]CHRX:76783765],C,C]CHR7:149479622],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
12955,SV_GRIDSS_HIGH_ATRX&SSPO_Somatic_chr7_14947963...,JFCF_6_T_5K,GRIDSS,False,Translocations,chr7,149479631-149479633,chrX,76917220-76917222,NotSplit:0;Split:88.27,NotSplit:0;Split:88.27,,,NotSplit:0;Split:3,NotSplit:0;Split:3,ATRX&SSPO,gene_fusion,HIGH,A,[CHRX:76917222[A,A,[CHR7:149479632[A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
12957,SV_GRIDSS_HIGH_ATRX&SSPO_Somatic_chr7_14947995...,JFCF_6_T_5K,GRIDSS,False,Translocations,chr7,149479955-149479956,chrX,76782881-76782882,NotSplit:109.96;Split:133.03,NotSplit:109.96;Split:133.03,,,NotSplit:6;Split:5,NotSplit:6;Split:5,ATRX&SSPO,bidirectional_gene_fusion,HIGH,A,AAG[CHRX:76782882[,C,]CHR7:149479956]AGC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
39233,SV_GRIDSS_HIGH_ATRX_Somatic_chrX_76885052_7688...,JFCF_6_T_1_M,GRIDSS,False,Duplication,chrX,76885052-76885055,chrX,76923999-76924002,NotSplit:204.12;Split:318.66,NotSplit:204.12;Split:318.66,,,NotSplit:11;Split:12,NotSplit:11;Split:12,ATRX,gene_fusion&frameshift_variant,HIGH,G,]CHRX:76924001]G,A,A[CHRX:76885054[,18.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
41378,SV_GRIDSS_HIGH_ATRX_Somatic_chrX_76808603_7680...,JFCF_6_T_1_Q,GRIDSS,False,Deletion,chrX,76808603-76808606,chrX,76875533-76875536,NotSplit:141.33;Split:96.28,NotSplit:141.33;Split:96.28,,,NotSplit:8;Split:4,NotSplit:8;Split:4,ATRX,gene_fusion&frameshift_variant,HIGH,C,C[CHRX:76875535[,C,]CHRX:76808605]C,20.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
43279,SV_GRIDSS_HIGH_ATRX_Somatic_chrX_76862268_7686...,JFCF_6_T_1_D,GRIDSS,False,Deletion,chrX,76862268-76862269,chrX,77034579-77034580,NotSplit:249.27;Split:0,NotSplit:249.27;Split:0,,,NotSplit:14;Split:0,NotSplit:14;Split:0,ATRX,gene_fusion&frameshift_variant,HIGH,A,AGGTA[CHRX:77034580[,T,]CHRX:76862269]GGTAT,41.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
44757,SV_GRIDSS_HIGH_ATRX_Somatic_chrX_76853017_7685...,JFCF_6_T_1J_11E,GRIDSS,False,Deletion,chrX,76853017-76853018,chrX,77014863-77014864,NotSplit:239.33;Split:302.85,NotSplit:239.33;Split:302.85,,,NotSplit:13;Split:12,NotSplit:13;Split:12,ATRX,gene_fusion&frameshift_variant,HIGH,A,A[CHRX:77014864[,A,]CHRX:76853018]A,47.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
44758,SV_GRIDSS_HIGH_ATRX_Somatic_chrX_76853017_7685...,JFCF_6_T_1J_1_3C,GRIDSS,False,Deletion,chrX,76853017-76853018,chrX,77014863-77014864,NotSplit:343.72;Split:285.82,NotSplit:343.72;Split:285.82,,,NotSplit:19;Split:11,NotSplit:19;Split:11,ATRX,gene_fusion&frameshift_variant,HIGH,A,A[CHRX:77014864[,A,]CHRX:76853018]A,47.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
45132,SV_GRIDSS_HIGH_ATRX_Somatic_chrX_76795540_7679...,JFCF_6_T_1_L,GRIDSS,False,Deletion,chrX,76795540-76795542,chrX,76940486-76940488,NotSplit:237.43;Split:194.13,NotSplit:237.43;Split:194.13,,,NotSplit:13;Split:7,NotSplit:13;Split:7,ATRX,gene_fusion,HIGH,T,T[CHRX:76940487[,C,]CHRX:76795541]C,46.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [310]:
# Count the distinct "Gene" labels among the filtered somatic variants.
# NOTE(review): labels are counted verbatim, so a compound SV label such as
# "ATRX&SSPO" is counted as its own entry rather than as ATRX and SSPO.
sm_mg_df["Gene"].unique().shape
# 1342 mutated genes in total

(1342,)

In [285]:
# Distinct "Gene" labels among the MuTect2 rows of the somatic set.
# Author's count: 847 genes with exonic/splicing short mutations (no UTR3/5 mutations).
sm_mg_df.loc[sm_mg_df['Source'].eq('MuTect2'), "Gene"].unique().shape

(847,)

In [337]:
# Distinct "Gene" labels among the GRIDSS rows of the somatic set.
# Author's count: 528 genes with exonic SVs.
sm_mg_df.loc[sm_mg_df['Source'].eq('GRIDSS'), "Gene"].unique().shape

(528,)

In [309]:
# Distinct "Gene" labels among rows whose Type contains "splicing"
# (substring match, so both 'splicing' and 'ncRNA_splicing' qualify).
# Author's count: 27 genes with somatic splicing short mutations.
sm_mg_df.loc[sm_mg_df['Type'].str.contains("splicing"), "Gene"].unique().shape

(27,)

In [300]:
# Somatic splicing variants whose gene is listed in TelNet.
# Author's note: for CDK13 (telomere-related gene), 3'-AG loss at the splicing site.
sm_mg_df.loc[
    sm_mg_df['Type'].str.contains("splicing")
    & sm_mg_df['Gene'].isin(tel_genes_list["Gene symbol"])
]

Unnamed: 0,Key,Name,Source,dbSNP,Type,Chr1,Pos1,Chr2,Pos2,Qual1,Qual2,AF1,AF2,Read1,Read2,Gene,Consequence,Impact,Ref1,Alt1,Ref2,Alt2,exon_count,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,avsnp147,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,fathmm-MKL_coding_pred,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds
50256,MuTect2_splicing_CDK13_JFCF_61Q_chr7_40127722_...,JFCF_6_T_1_Q,MuTect2,,splicing,chr7,40127722,,,,,,,,,CDK13,,,CAG,TTT,,,,.,.,.,7p14.1,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.
50409,MuTect2_splicing_ATAD5_JFCF_61Q_chr17_29192720...,JFCF_6_T_1_Q,MuTect2,,splicing,chr17,29192720,,,,,,,,,ATAD5,,,A,T,,,,NM_024857:exon11:c.3137-2A>T,.,.,17q11.2,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1,D,.,.,.,.,.,.,.,4.327,24.0,0.987,0.981,D,.,.,.,.,0.164,0,5.38,1.062,1.178,0.986,0.987,15.48


In [311]:
# Distinct "Type" values remaining in the filtered somatic set.
sm_mg_df["Type"].unique()

array(['Translocations', 'Deletion', 'Duplication', 'Inversion',
       'Insertion', 'ncRNA_exonic', 'ncRNA_splicing', 'exonic',
       'splicing'], dtype=object)

In [190]:
# Restrict the somatic variants to genes listed in TelNet.
# NOTE(review): isin() matches the whole "Gene" label exactly, so a row whose label
# joins several genes (e.g. "ATRX&SSPO") is only selected if that exact label is a
# TelNet gene symbol — confirm this is intended.
sm_mg_tel = sm_mg_df.loc[sm_mg_df['Gene'].isin(tel_genes_list["Gene symbol"])]

In [194]:
# Distinct "Gene" labels in the telomere-related somatic set.
# Author's count: 87 telomere-related genes (exonic SVs + exonic/splicing short mutations).
sm_mg_tel["Gene"].sort_values().unique().shape

(87,)

In [195]:
# Distinct "Gene" labels among the MuTect2 rows of the telomere-related somatic set.
# Author's count: 55 genes with exonic/splicing short mutations.
sm_mg_tel.loc[sm_mg_tel["Source"].eq("MuTect2"), "Gene"].sort_values().unique().shape

(55,)

In [326]:
# GRIDSS (SV) rows of the telomere-related somatic set; this displays the rows themselves.
# Author's count: 40 genes with exonic SVs.
sm_mg_tel.loc[sm_mg_tel["Source"].eq("GRIDSS")]

Unnamed: 0,Key,Name,Source,dbSNP,Type,Chr1,Pos1,Chr2,Pos2,Qual1,Qual2,AF1,AF2,Read1,Read2,Gene,Consequence,Impact,Ref1,Alt1,Ref2,Alt2,exon_count,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,avsnp147,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,fathmm-MKL_coding_pred,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds
423,SV_GRIDSS_HIGH_DMD_Somatic_chr7_118228129_1182...,JFCF_6_T_1_M,GRIDSS,False,Translocations,chr7,118228129-118228131,chrX,32056726-32056728,NotSplit:259.78;Split:137.5,NotSplit:259.78;Split:137.5,,,NotSplit:14;Split:5,NotSplit:14;Split:5,DMD,transcript_ablation,HIGH,G,G]CHRX:32056728],C,C]CHR7:118228130],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2455,SV_GRIDSS_HIGH_DMD_Somatic_chr4_191033031_1910...,JFCF_6_T_1_D,GRIDSS,False,Translocations,chr4,191033031-191033033,chrX,32765190-32765192,NotSplit:124.64;Split:79.03,NotSplit:124.64;Split:79.03,,,NotSplit:7;Split:3,NotSplit:7;Split:3,DMD,transcript_ablation,HIGH,C,C[CHRX:32765191[,T,]CHR4:191033032]T,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3125,SV_GRIDSS_HIGH_PLCH1_Somatic_chr3_155272695_15...,JFCF_6_T_1_F,GRIDSS,False,Translocations,chr3,155272695-155272698,chr8,111210121-111210124,NotSplit:238.3;Split:262.37,NotSplit:238.3;Split:262.37,,,NotSplit:13;Split:10,NotSplit:13;Split:10,PLCH1,transcript_ablation,HIGH,T,T[CHR8:111210123[,A,]CHR3:155272697]A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3195,SV_GRIDSS_HIGH_RB1CC1_Somatic_chr3_179269280_1...,JFCF_6_T_1_F,GRIDSS,False,Translocations,chr3,179269280-179269284,chr8,53557381-53557385,NotSplit:238.3;Split:77.95,NotSplit:238.3;Split:77.95,,,NotSplit:13;Split:3,NotSplit:13;Split:3,RB1CC1,transcript_ablation,HIGH,T,[CHR8:53557384[T,C,[CHR3:179269282[C,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3866,SV_GRIDSS_HIGH_EIF5B_Somatic_chr2_99984926_999...,JFCF_6_T_1_G,GRIDSS,False,Translocations,chr2,99984926-99984927,chr8,142588189-142588190,NotSplit:276.91;Split:193.16,NotSplit:276.91;Split:193.16,,,NotSplit:15;Split:7,NotSplit:15;Split:7,EIF5B,transcript_ablation,HIGH,G,]CHR8:142588190]TG,G,GT[CHR2:99984927[,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3878,SV_GRIDSS_HIGH_RHPN1_Somatic_chr2_102006805_10...,JFCF_6_T_1_G,GRIDSS,False,Translocations,chr2,102006805-102006806,chr8,144461255-144461256,NotSplit:166.15;Split:221.89,NotSplit:166.15;Split:221.89,,,NotSplit:9;Split:8,NotSplit:9;Split:8,RHPN1,transcript_ablation,HIGH,T,TGAAG]CHR8:144461256],C,CCTTC]CHR2:102006806],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3947,SV_GRIDSS_HIGH_EPHA4_Somatic_chr2_222383211_22...,JFCF_6_T_1_G,GRIDSS,False,Translocations,chr2,222383211-222383214,chr13,34639773-34639776,NotSplit:258.45;Split:153.39,NotSplit:258.45;Split:153.39,,,NotSplit:14;Split:6,NotSplit:14;Split:6,EPHA4,transcript_ablation,HIGH,G,G[CHR13:34639775[,T,]CHR2:222383213]T,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4341,SV_GRIDSS_HIGH_SYNE2_Somatic_chr10_2591323_259...,JFCF_6_T_1_G,GRIDSS,False,Translocations,chr10,2591323-2591326,chr14,64569986-64569989,NotSplit:258.45;Split:167.9,NotSplit:258.45;Split:167.9,,,NotSplit:14;Split:6,NotSplit:14;Split:6,SYNE2,transcript_ablation,HIGH,G,]CHR14:64569988]G,A,A[CHR10:2591325[,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10750,SV_GRIDSS_HIGH_DLG2_Somatic_chr11_84236652_842...,JFCF_6_T_1_P_TEL,GRIDSS,False,Translocations,chr11,84236652-84236657,chr18,14575202-14575207,NotSplit:183.5;Split:0,NotSplit:183.5;Split:0,,,NotSplit:10;Split:0,NotSplit:10;Split:0,DLG2,transcript_ablation,HIGH,T,[CHR18:14575205[T,A,[CHR11:84236655[A,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11009,SV_GRIDSS_HIGH_DHX35_Somatic_chr1_215008483_21...,JFCF_6_T_1_R,GRIDSS,False,Translocations,chr1,215008483-215008487,chr20,37645265-37645269,NotSplit:205.46;Split:152.34,NotSplit:205.46;Split:152.34,,,NotSplit:11;Split:6,NotSplit:11;Split:6,DHX35,transcript_ablation,HIGH,T,]CHR20:37645267]T,G,G[CHR1:215008485[,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
###############################################################

In [304]:
# Rows of the JFCF_6 (mortal cell) sample, excluding feature_fusion consequences.
# Further filters left disabled by the author:
#   (mg_df['Gene'].isin(tel_genes_list["Gene symbol"]))&(mg_df['Impact']=="HIGH")
mortal_sv = mg_df.loc[
    mg_df['Name'].eq("JFCF_6")
    & mg_df['Consequence'].ne("feature_fusion")
]
mortal_sv.shape

(9569, 70)

In [213]:
# Example lookup by breakpoint position: the same variant may be annotated with
# different consequence types in different rows.
mortal_sv.loc[mortal_sv["Key"].str.contains("chr11_65211733")]

Unnamed: 0,Key,Name,Source,dbSNP,Type,Chr1,Pos1,Chr2,Pos2,Qual1,Qual2,AF1,AF2,Read1,Read2,Gene,Consequence,Impact,Ref1,Alt1,Ref2,Alt2,exon_count,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,avsnp147,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,fathmm-MKL_coding_pred,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds
45681,SV_GRIDSS_HIGH_NEAT1_Mortal_chr11_65211733_652...,JFCF_6,GRIDSS,False,Deletion,chr11,65211733-65211766,chr11,65211744-65211777,NotSplit:0;Split:0,NotSplit:0;Split:0,,,NotSplit:0;Split:0,NotSplit:0;Split:0,NEAT1,gene_fusion&frameshift_variant,HIGH,G,G[CHR11:65211761[,T,]CHR11:65211750]T,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
45682,SV_GRIDSS_HIGH_NEAT1_Mortal_chr11_65211733_652...,JFCF_6,GRIDSS,False,Deletion,chr11,65211733-65211766,chr11,65211744-65211777,NotSplit:0;Split:0,NotSplit:0;Split:0,,,NotSplit:0;Split:0,NotSplit:0;Split:0,NEAT1,gene_fusion,HIGH,G,G[CHR11:65211761[,T,]CHR11:65211750]T,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [214]:
# Number of distinct "Key" values among the mortal-cell rows (compare with the row count of mortal_sv).
mortal_sv["Key"].unique().shape

(1847,)

In [305]:
# Number of distinct "Gene" labels among the mortal-cell rows.
mortal_sv["Gene"].unique().shape
# 1480 genes with exonic germline SVs in the mortal cell

(1480,)

In [307]:
# Mortal-cell rows whose "Gene" label is exactly a TelNet gene symbol.
mortal_sv_tel = mortal_sv.loc[mortal_sv['Gene'].isin(tel_genes_list["Gene symbol"])]
mortal_sv_tel.shape

(602, 70)

In [308]:
# Number of distinct "Gene" labels among the telomere-related mortal-cell rows.
mortal_sv_tel["Gene"].unique().shape
# 108 telomere-related genes with exonic germline SVs in the mortal cell

(108,)

In [312]:
# List the distinct "Gene" labels among the telomere-related mortal-cell rows.
mortal_sv_tel["Gene"].unique()

array(['RNF2', 'PRIM2', 'LPIN1', 'TSG101', 'MED12L', 'TENM3', 'ETHE1',
       'MACROD2', 'KLF12', 'THEMIS2', 'ARHGAP15', 'SP100', 'CALD1',
       'BCL7B', 'HNF4A', 'YY1', 'CDK11B', 'PIK3CD', 'PCBP1', 'TGFBRAP1',
       'GIGYF2', 'SCLY', 'PPARG', 'PSMD6', 'HCLS1', 'TRIM23', 'DHFR',
       'MSH3', 'REEP5', 'CTNNA1', 'RASGEF1C', 'IP6K3', 'LEMD2', 'AARS2',
       'CREB5', 'RABGEF1', 'YWHAG', 'NAT16', 'REPIN1', 'RBM33', 'MCPH1',
       'NUDT18', 'GRHL2', 'GNE', 'GAPVD1', 'SET', 'CACNA1B', 'PITRM1',
       'PRKCQ', 'CAMK1D', 'ZMIZ1', 'PLCE1', 'BET1L', 'TALDO1', 'TMPRSS13',
       'ATN1', 'PEX5', 'APPL2', 'HSPH1', 'ARL11', 'DNAJC3', 'TOX4',
       'SYNE2', 'IVD', 'CREBBP', 'PRKCB', 'PLCG2', 'RPL13', 'GAS8',
       'MYO1C', 'DHX33', 'DLG4', 'PRPSAP2', 'USP22', 'MAP2K3', 'LASP1',
       'KRT10', 'PIAS2', 'HDGFRP2', 'LONP1', 'EIF3G', 'JUND', 'ERCC1',
       'DMPK', 'NSFL1C', 'CDC25B', 'RBBP9', 'AHCY', 'RPN2', 'GATA5',
       'NOL12', 'EP300', 'LDOC1L', 'CERK', 'LRCH2', 'FLI1', 'SRSF6',
       'M

In [316]:
# Telomere-related mortal-cell rows restricted to the top-scoring TelNet genes.
# Author's notes:
#   - the germline SV found in JFCF-6 can also be observed in the IIICF mortal cell,
#     so it is possibly a technical artifact;
#   - how to deal with germline SVs is still an open question, as there are many
#     "false positive" discoveries.
mortal_sv_tel.loc[mortal_sv_tel["Gene"].isin(top_tel_genes_list["Gene symbol"])]

Unnamed: 0,Key,Name,Source,dbSNP,Type,Chr1,Pos1,Chr2,Pos2,Qual1,Qual2,AF1,AF2,Read1,Read2,Gene,Consequence,Impact,Ref1,Alt1,Ref2,Alt2,exon_count,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand,ExAC_ALL,ExAC_AFR,ExAC_AMR,ExAC_EAS,ExAC_FIN,ExAC_NFE,ExAC_OTH,ExAC_SAS,avsnp147,SIFT_score,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_pred,Polyphen2_HVAR_score,Polyphen2_HVAR_pred,LRT_score,LRT_pred,MutationTaster_score,MutationTaster_pred,MutationAssessor_score,MutationAssessor_pred,FATHMM_score,FATHMM_pred,PROVEAN_score,PROVEAN_pred,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,fathmm-MKL_coding_pred,MetaSVM_score,MetaSVM_pred,MetaLR_score,MetaLR_pred,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds
2330,SV_GRIDSS_HIGH_SP100_Mortal_chr2_231364227_231...,JFCF_6,GRIDSS,False,Translocations,chr2,231364227-231364228,chrX,54937256-54937257,NotSplit:152.64;Split:0,NotSplit:152.64;Split:0,,,NotSplit:9;Split:0,NotSplit:9;Split:0,SP100,transcript_ablation,HIGH,T,]CHRX:54937257]CTATATATTTATTTCCTTTGGCTATGAATTC...,T,TCTATATATTTATTTCCTTTGGCTATGAATTCCCCCACTAGAATGG...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3053,SV_GRIDSS_HIGH_SP100_Mortal_chr2_231364227_231...,JFCF_6,GRIDSS,False,Translocations,chr2,231364227-231364228,chrX,54937256-54937257,NotSplit:118.08;Split:0,NotSplit:118.08;Split:0,,,NotSplit:7;Split:0,NotSplit:7;Split:0,SP100,transcript_ablation,HIGH,T,]CHRX:54937257]CTATATATTTATTTCCTTTGGCTATGAATTC...,T,TCTATATATTTATTTCCTTTGGCTATGAATTCCCCCACTAGAATGG...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6399,SV_GRIDSS_HIGH_SP100_Mortal_chr2_231364227_231...,JFCF_6,GRIDSS,False,Translocations,chr2,231364227-231364228,chrX,54937256-54937257,NotSplit:155.06;Split:0,NotSplit:155.06;Split:0,,,NotSplit:9;Split:0,NotSplit:9;Split:0,SP100,transcript_ablation,HIGH,T,]CHRX:54937257]CTATATATTTATTTCCTTTGGCTATGAATTC...,T,TCTATATATTTATTTCCTTTGGCTATGAATTCCCCCACTAGAATGG...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9510,SV_GRIDSS_HIGH_SP100_Mortal_chr2_231364227_231...,JFCF_6,GRIDSS,False,Translocations,chr2,231364227-231364228,chrX,54937256-54937257,NotSplit:172.38;Split:0,NotSplit:172.38;Split:0,,,NotSplit:10;Split:0,NotSplit:10;Split:0,SP100,transcript_ablation,HIGH,T,]CHRX:54937257]CTATATATTTATTTCCTTTGGCTATGAATTC...,T,TCTATATATTTATTTCCTTTGGCTATGAATTCCCCCACTAGAATGG...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11108,SV_GRIDSS_HIGH_SP100_Mortal_chr2_231364227_231...,JFCF_6,GRIDSS,False,Translocations,chr2,231364227-231364228,chrX,54937256-54937257,NotSplit:100.1;Split:0,NotSplit:100.1;Split:0,,,NotSplit:6;Split:0,NotSplit:6;Split:0,SP100,transcript_ablation,HIGH,T,]CHRX:54937257]CTATATATTTATTTCCTTTGGCTATGAATTC...,T,TCTATATATTTATTTCCTTTGGCTATGAATTCCCCCACTAGAATGG...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24371,SV_GRIDSS_HIGH_MCPH1_Mortal_chr8_6304520_63045...,JFCF_6,GRIDSS,False,Deletion,chr8,6304520-6304530,chr8,6304535-6304545,NotSplit:0;Split:0,NotSplit:0;Split:0,,,NotSplit:0;Split:0,NotSplit:0;Split:0,MCPH1,gene_fusion,HIGH,G,G[CHR8:6304540[,C,]CHR8:6304525]C,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24372,SV_GRIDSS_HIGH_MCPH1_Mortal_chr8_6304520_63045...,JFCF_6,GRIDSS,False,Deletion,chr8,6304520-6304530,chr8,6304535-6304545,NotSplit:0;Split:0,NotSplit:0;Split:0,,,NotSplit:0;Split:0,NotSplit:0;Split:0,MCPH1,gene_fusion&frameshift_variant,HIGH,G,G[CHR8:6304540[,C,]CHR8:6304525]C,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24823,SV_GRIDSS_HIGH_GRHL2_Mortal_chr8_102681481_102...,JFCF_6,GRIDSS,False,Insertion,chr8,102681481-102681482,chr8,102681482-102681483,NotSplit:0;Split:0,NotSplit:0;Split:0,,,NotSplit:0;Split:0,NotSplit:0;Split:0,GRHL2,gene_fusion&frameshift_variant,HIGH,C,CTTTTTTTTTTTTT[CHR8:102681483[,T,]CHR8:102681482]TTTTTTTTTTTTTT,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
36970,SV_GRIDSS_HIGH_ERCC1_Mortal_chr19_45913822_459...,JFCF_6,GRIDSS,False,Insertion,chr19,45913822-45913823,chr19,45913823-45913824,NotSplit:0;Split:0,NotSplit:0;Split:0,,,NotSplit:0;Split:0,NotSplit:0;Split:0,ERCC1,gene_fusion&frameshift_variant,HIGH,T,TTGTGTGTGTGTGTG[CHR19:45913824[,T,]CHR19:45913823]TGTGTGTGTGTGTGT,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
36971,SV_GRIDSS_HIGH_ERCC1_Mortal_chr19_45913822_459...,JFCF_6,GRIDSS,False,Insertion,chr19,45913822-45913823,chr19,45913823-45913824,NotSplit:0;Split:0,NotSplit:0;Split:0,,,NotSplit:0;Split:0,NotSplit:0;Split:0,ERCC1,gene_fusion,HIGH,T,TTGTGTGTGTGTGTG[CHR19:45913824[,T,]CHR19:45913823]TGTGTGTGTGTGTGT,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [323]:
# Distinct "Key" values of the JFCF_6 rows annotated with the SP100 gene.
mg_df.loc[
    mg_df['Name'].eq("JFCF_6") & mg_df['Gene'].eq("SP100"),
    "Key",
].unique()

array(['SV_GRIDSS_HIGH_SP100_Mortal_chr2_231364227_231364228_chrX_54937256_54937257'],
      dtype=object)