# Merging Functional Annotations into the CADD Chr22 File using Python

Set Working Directory and Import Files / Pandas:

In [1]:
# Check Working Directory.

import os
import sys
print("wd"
     , os.getcwd())

wd /Users/patrickhallaert


In [2]:
# Set Working Directory to Hard Drive, which contains all the needed files.

os.chdir('/Volumes/HZU/CADD/hg19')
print("wd"
     , os.getcwd())

wd /Volumes/HZU/CADD/hg19


In [3]:
# Import Pandas and NumPy

import pandas as pd
import numpy as np

Import and View Chr22 CADD File

In [4]:
# We run the separator as ‘\t’ because in this file the tab character separates the fields.

ch22 = pd.read_csv("chr22F1-43m.txt",sep='\t', dtype = 'str')

In [5]:
# We can quickly view the data to make sure there are no issues and everything looks normal.

ch22.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,"## CADD GRCh37-v1.6 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved."
#Chrom,Pos,Ref,Alt
22,16050001,G,A
22,16050001,G,C
22,16050001,G,T
22,16050002,A,C


In [6]:
print(ch22.columns.tolist())

# This first row will cause a KeyError as left_join will be unable to recognize the correct headers.

['## CADD GRCh37-v1.6 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.']


In [7]:
# Thus, we remove the "Title" Row (through the command line) to Prevent a future KeyError while Left_Joining CADD chr22 with a functional annotation dataset.

ch22nh = pd.read_csv("/Volumes/HZU/CADD/hg19/chr22F1-43mnh.txt",sep='\t', dtype = 'str')# iterator=True, chunksize=1000)

Import and View ClinVar Functional Annotations

In [8]:
# Now, we can import the ClinVar data.

clinvar = pd.read_csv('clinvar_20220528.txt', comment='#', sep='\t' , dtype='str')

# Additional bits (comment='#', sep='\t') needed to make the vcf file "readable" to Pandas left_join.

In [9]:
# Let's take a look:

clinvar.head()

Unnamed: 0,1,861332,1019397,G,A,.,..1,"ALLELEID=1003021;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.10:g.861332G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=1640863258"
0,1,861336,1543320,C,T,.,.,ALLELEID=1632777;CLNDISDB=MedGen:CN517202;CLND...
1,1,861349,1648427,C,T,.,.,ALLELEID=1600580;CLNDISDB=MedGen:CN517202;CLND...
2,1,861356,1362713,T,C,.,.,ALLELEID=1396033;CLNDISDB=MedGen:CN517202;CLND...
3,1,861366,1568423,C,T,.,.,ALLELEID=1570515;CLNDISDB=MedGen:CN517202;CLND...
4,1,861383,1365270,C,T,.,.,ALLELEID=1502313;CLNDISDB=MedGen:CN517202;CLND...


In [10]:
# Let's add headers.

clinvar.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]

In [11]:
# Let's take another look:

clinvar.head()

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,1,861336,1543320,C,T,.,.,ALLELEID=1632777;CLNDISDB=MedGen:CN517202;CLND...
1,1,861349,1648427,C,T,.,.,ALLELEID=1600580;CLNDISDB=MedGen:CN517202;CLND...
2,1,861356,1362713,T,C,.,.,ALLELEID=1396033;CLNDISDB=MedGen:CN517202;CLND...
3,1,861366,1568423,C,T,.,.,ALLELEID=1570515;CLNDISDB=MedGen:CN517202;CLND...
4,1,861383,1365270,C,T,.,.,ALLELEID=1502313;CLNDISDB=MedGen:CN517202;CLND...


Import and View Eigen Functional Annotations

In [12]:
# Now, we repeat the process for the Eigen dataset.

eigen = pd.read_csv("/Volumes/HZU/everyheaders_hg19_Eigen22.txt",sep='\t', on_bad_lines='skip', dtype='str')
eigen.head()

Unnamed: 0,#chr,position,position.1,ref,alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,...,PhyloPla,PhyloVer,PhastPri,PhastPla,PhastVe,Consequence,Eigen-raw,Eigen-phred,Eigen-PC-raw,Eigen-PC-phred
0,22,16157306,16157306,T,A,.,.,.,.,0.511,...,0.476,0.153,0.015,0.019,0.043,"intron_variant,non_coding_transcript_variant",-0.723507420492939,0.7735336,-0.95435275448397,0.5475722
1,22,16157306,16157306,T,C,.,.,.,.,0.511,...,0.476,0.153,0.015,0.019,0.043,"intron_variant,non_coding_transcript_variant",-0.723507420492939,0.7735336,-0.95435275448397,0.5475722
2,22,16157306,16157306,T,G,.,.,.,.,0.511,...,0.476,0.153,0.015,0.019,0.043,"intron_variant,non_coding_transcript_variant",-0.723507420492939,0.7735336,-0.95435275448397,0.5475722
3,22,16157307,16157307,C,A,.,.,.,.,0.511,...,0.569,0.68,0.016,0.022,0.049,"intron_variant,non_coding_transcript_variant",-0.690256467806581,0.829126,-0.919775085203436,0.5935479
4,22,16157307,16157307,C,G,.,.,.,.,0.511,...,0.569,0.68,0.016,0.022,0.049,"intron_variant,non_coding_transcript_variant",-0.690256467806581,0.829126,-0.919775085203436,0.5935479


Import and View dbnsfp33a Functional Annotations

In [13]:
# Finally, the dbnsfp33a annotations:
    
dbnsfp33a = pd.read_csv("/Volumes/HZU/humandb/everyheader_hg19_dbnsfp33a22.txt",sep='\t', on_bad_lines='skip' , dtype='str')
dbnsfp33a.head()

Unnamed: 0,#chr,start,end,ref,alt,SIFT_score,SIFT_converted_rankscore,SIFT_pred,Polyphen2_HDIV_score,Polyphen2_HDIV_rankscore,...,phyloP20way_mammalian_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons20way_mammalian,phastCons20way_mammalian_rankscore,SiPhy_29way_logOdds,SiPhy_29way_logOdds_rankscore,Interpro_domain,GTEx_V6_gene,GTEx_V6_tissue
0,22,16287549,16287549,G,A,.,.,.,.,.,...,0.247,0.001,0.137,0.0,0.016,.,.,.,.,.
1,22,16287549,16287549,G,C,.,.,.,.,.,...,0.247,0.001,0.137,0.0,0.016,.,.,.,.,.
2,22,16287549,16287549,G,T,.,.,.,.,.,...,0.247,0.001,0.137,0.0,0.016,.,.,.,.,.
3,22,16287550,16287550,C,A,.,.,.,.,.,...,0.001,0.0,0.063,0.0,0.016,.,.,.,.,.
4,22,16287550,16287550,C,G,.,.,.,.,.,...,0.001,0.0,0.063,0.0,0.016,.,.,.,.,.


Merge the CADD Ch22 File with Eigen First to create a "Ch22Eigen" File

In [14]:
ch22eigen = pd.merge(ch22nh, eigen, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'position', 'ref', 'alt'], how="left")
ch22eigen = ch22eigen.drop(columns="position.1")
ch22eigen = ch22eigen.drop(columns="ref")
ch22eigen = ch22eigen.drop(columns="alt")
ch22eigen = ch22eigen.drop(columns="position")
ch22eigen = ch22eigen.drop(columns="#chr")
ch22eigen

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,PhyloPla,PhyloVer,PhastPri,PhastPla,PhastVe,Consequence,Eigen-raw,Eigen-phred,Eigen-PC-raw,Eigen-PC-phred
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


Now incorporate the dbnsfp33a data into this "Ch22Eigen" data:

In [15]:
ch22_eigen_dbn = pd.merge(ch22eigen, dbnsfp33a, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,phyloP20way_mammalian_rankscore,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons20way_mammalian,phastCons20way_mammalian_rankscore,SiPhy_29way_logOdds,SiPhy_29way_logOdds_rankscore,Interpro_domain,GTEx_V6_gene,GTEx_V6_tissue
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


Merge in the ClinVar data to the file already containing CADD, Eigen, and dbnsfp33a data.

In [16]:
import gc
gc.collect()

0

In [17]:
ch22_eigen_dbn_clinvar = pd.merge(ch22_eigen_dbn, clinvar, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#CHROM', 'POS', 'REF', 'ALT'], how="left")
ch22_eigen_dbn_clinvar
ch22_eigen_dbn_clinvar = ch22_eigen_dbn_clinvar.drop(columns="#CHROM")
ch22_eigen_dbn_clinvar = ch22_eigen_dbn_clinvar.drop(columns="POS")
ch22_eigen_dbn_clinvar = ch22_eigen_dbn_clinvar.drop(columns="ID")
ch22_eigen_dbn_clinvar = ch22_eigen_dbn_clinvar.drop(columns="REF")
ch22_eigen_dbn_clinvar = ch22_eigen_dbn_clinvar.drop(columns="ALT")
ch22_eigen_dbn_clinvar = ch22_eigen_dbn_clinvar.drop(columns="QUAL")
ch22_eigen_dbn_clinvar = ch22_eigen_dbn_clinvar.drop(columns="FILTER")
ch22_eigen_dbn_clinvar = ch22_eigen_dbn_clinvar.drop(columns="INFO")

In [18]:
c_fathmm = pd.read_csv("/Volumes/HZU/humandb/hg19_fathmm_xf_codingch22headers.txt",sep='\t', dtype = 'str')
c_fathmm = c_fathmm.drop(columns="position")
c_fathmm

Unnamed: 0,#Chrom,Pos,Ref,Alt,FATHMM_XF_coding
0,22,16258189,G,A,0.006646
1,22,16258189,G,C,0.066183
2,22,16258189,G,T,0.078170
3,22,16258190,A,C,0.039113
4,22,16258190,A,G,0.040414
...,...,...,...,...,...
2232697,22,51220721,A,G,0.023020
2232698,22,51220721,A,T,0.034400
2232699,22,51220722,T,A,0.034870
2232700,22,51220722,T,C,0.033790


In [19]:
ch22_eigen_dbn_clinvar_cfathmm = pd.merge(ch22_eigen_dbn_clinvar, c_fathmm, on =['#Chrom', 'Pos', 'Ref', 'Alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons20way_mammalian,phastCons20way_mammalian_rankscore,SiPhy_29way_logOdds,SiPhy_29way_logOdds_rankscore,Interpro_domain,GTEx_V6_gene,GTEx_V6_tissue,FATHMM_XF_coding
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [20]:
exomegnomad = pd.read_csv("/Volumes/HZU/humandb/headers_hg19_gnomad211_exomech22.txt",sep='\t', dtype = 'str')
exomegnomad.head()

Unnamed: 0,#chr,start,end,ref,alt,AF,AF_popmax,AF_male,AF_female,AF_raw,...,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax
0,22,16157263,16157263,C,T,0.0054,0.0132,0.0043,0.0068,0.0043,...,0.0,0.0081,0.0,0.0,0,0.0294,0.0132,0.0132,0.0132,0.0172
1,22,16157264,16157264,G,A,0.0703,0.1233,0.0721,0.068,0.0242,...,0.1233,0.0,0.1042,0.0714,0,0.1154,0.1244,0.1293,0.1233,0.1272
2,22,16157277,16157277,G,A,0.0014,0.0035,0.0019,0.0008,0.0006,...,0.0035,0.0026,0.0,0.0,0,0.0,0.0035,0.0037,0.0035,0.0024
3,22,16157293,16157293,G,C,0.0014,0.0041,0.0017,0.001,0.0009,...,0.0041,0.0,0.0008,0.0,0,0.0,0.0042,0.0045,0.0041,0.0031
4,22,16157302,16157302,T,C,0.0,.,0.0,0.0,2.028e-05,...,0.0,0.0,0.0,0.0,0,0.0,.,.,.,.


In [21]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad = pd.merge(ch22_eigen_dbn_clinvar_cfathmm, exomegnomad, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad


Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,AF_amr,AF_eas,AF_nfe,AF_fin,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


Splice the non-coding Fathmm file and isolate the chr22 section.


In [22]:
gc.collect()

0

In [23]:
abraom = pd.read_csv("/Volumes/HZU/humandb/headers_hg19_abraom_ch22.txt",sep='\t', dtype = 'str')
abraom

Unnamed: 0,#chr,start,end,ref,alt,abraom_freq,abraom_filter,abraom_cegh_filter
0,22,16256078,16256078,G,A,0.018072,LowQual,FAB
1,22,16256352,16256352,T,C,0.404762,VQSRTrancheSNP99.90to100.00,WK-LowCall
2,22,16256430,16256430,A,G,0.325581,VQSRTrancheSNP99.90to100.00,WK-LowCall
3,22,16256484,16256484,T,C,0.078431,VQSRTrancheSNP99.90to100.00,FAB
4,22,16256512,16256512,T,C,0.391081,VQSRTrancheSNP99.00to99.90,WK-LowCall
...,...,...,...,...,...,...,...,...
60421,22,51237712,51237712,G,A,0.131250,VQSRTrancheSNP99.00to99.90,FAB
60422,22,51237766,51237766,T,C,0.016393,VQSRTrancheSNP99.00to99.90,FAB
60423,22,51238130,51238130,G,A,0.272727,VQSRTrancheSNP99.00to99.90,FDP
60424,22,51238249,51238249,A,C,0.063492,VQSRTrancheSNP99.00to99.90,FDP


In [24]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad, abraom, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,non_cancer_AF_popmax,controls_AF_popmax,#chr,start,end,ref,alt,abraom_freq,abraom_filter,abraom_cegh_filter
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [25]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom.drop(columns="alt_x")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom.drop(columns="start_x")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom.drop(columns="ref_x")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom.drop(columns="#chr")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom.drop(columns="alt")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom.drop(columns="start")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom.drop(columns="ref")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom.drop(columns="#chr_x")

In [26]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,AF_asj,AF_oth,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,end,abraom_freq,abraom_filter,abraom_cegh_filter
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [27]:
gc.collect()
AFR = pd.read_csv('/Volumes/HZU/humandb/hg19_AFR.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
ALL = pd.read_csv('/Volumes/HZU/humandb/hg19_ALL.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
AMR = pd.read_csv('/Volumes/HZU/humandb/hg19_AMR.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
AMR = pd.read_csv('/Volumes/HZU/humandb/hg19_AMR.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
EAS = pd.read_csv('/Volumes/HZU/humandb/hg19_EAS.sites.2015_08_ch22.txt', comment='#', sep='\t' , dtype='str')
AFR.columns = ["#chr", "start", "ref", "alt", "AFR_exome_allele_frequency", "rsID"]
ALL.columns = ["#chr", "start", "ref", "alt", "ALL_exome_allele_frequency", "rsID"]
AMR.columns = ["#chr", "start", "ref", "alt", "AMR_exome_allele_frequency", "rsID"]
EAS.columns = ["#chr", "start", "ref", "alt", "EAS_exome_allele_frequency", "rsID"]

In [28]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom, ALL, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,end,abraom_freq,abraom_filter,abraom_cegh_filter,#chr,start,ref,alt,ALL_exome_allele_frequency,rsID
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [29]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL.drop(columns="#chr")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL.drop(columns="alt")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL.drop(columns="start")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL.drop(columns="ref")

In [30]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL.drop(columns="rsID")


In [31]:
gc.collect()
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL, AFR, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,abraom_freq,abraom_filter,abraom_cegh_filter,ALL_exome_allele_frequency,#chr,start,ref,alt,AFR_exome_allele_frequency,rsID
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [32]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR.drop(columns=['#chr', "start", "ref", "alt", "rsID"])
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,non_topmed_AF_popmax,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,end,abraom_freq,abraom_filter,abraom_cegh_filter,ALL_exome_allele_frequency,AFR_exome_allele_frequency
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [33]:
gc.collect()
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR, EAS, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,abraom_filter,abraom_cegh_filter,ALL_exome_allele_frequency,AFR_exome_allele_frequency,#chr,start,ref,alt,EAS_exome_allele_frequency,rsID
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [34]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS.drop(columns=['#chr', "start", "ref", "alt", "rsID"])
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,non_neuro_AF_popmax,non_cancer_AF_popmax,controls_AF_popmax,end,abraom_freq,abraom_filter,abraom_cegh_filter,ALL_exome_allele_frequency,AFR_exome_allele_frequency,EAS_exome_allele_frequency
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [35]:
gc.collect()

0

In [36]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR = pd.merge(ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS, AMR, left_on=['#Chrom', 'Pos', 'Ref', 'Alt'], right_on=['#chr', 'start', 'ref', 'alt'], how="left")

In [37]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR

Unnamed: 0,#Chrom,Pos,Ref,Alt,SIFT,PolyPhenDIV,PolyPhenVar,MA,GERP_NR,GERP_RS,...,abraom_cegh_filter,ALL_exome_allele_frequency,AFR_exome_allele_frequency,EAS_exome_allele_frequency,#chr,start,ref,alt,AMR_exome_allele_frequency,rsID
0,22,16050001,G,A,,,,,,,...,,,,,,,,,,
1,22,16050001,G,C,,,,,,,...,,,,,,,,,,
2,22,16050001,G,T,,,,,,,...,,,,,,,,,,
3,22,16050002,A,C,,,,,,,...,,,,,,,,,,
4,22,16050002,A,G,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2999993,22,16929617,A,T,,,,,,,...,,,,,,,,,,
2999994,22,16929618,T,A,,,,,,,...,,,,,,,,,,
2999995,22,16929618,T,C,,,,,,,...,,,,,,,,,,
2999996,22,16929618,T,G,,,,,,,...,,,,,,,,,,


In [None]:
ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR = ch22_eigen_dbn_clinvar_cfathmm_exomegnomad_abraom_ALL_AFR_EAS_AMR.drop(columns=['#chr', "start", "ref", "alt", "rsID"])