In [2]:
import pandas as pd
import gzip

def _mc_consequence_names(mc_value):
    """Reduce an ``MC`` INFO value to its comma-joined consequence names.

    ``mc_value`` is a comma-separated list whose entries are pipe-delimited;
    the second pipe-delimited piece of each entry is kept. An entry that
    contains no ``|`` is kept unchanged instead of raising ``IndexError``.
    """
    names = []
    for entry in mc_value.split(","):
        parts = entry.split("|")
        names.append(parts[1] if len(parts) > 1 else entry)
    return ",".join(names)


def clinvar_vcf_to_pd(vcf_path):
    """Parse a gzip-compressed VCF file into a list of per-variant dicts.

    Despite the name, this returns a plain list (one dict per data line);
    pass it to ``pd.DataFrame`` to obtain a table.

    Parameters
    ----------
    vcf_path : str or path-like
        Path to a gzip-compressed, tab-separated VCF file.

    Returns
    -------
    list of dict
        Each dict holds the fixed columns under the keys ``Chrom``, ``Pos``,
        ``Id``, ``Ref``, ``Alt``, ``Qual`` and ``Filter`` (all as strings),
        plus one key per INFO entry of that line:

        * ``KEY=value`` entries are stored as ``{KEY: "value"}``; only the
          first ``=`` separates key from value.
        * Entries without ``=`` (flags) are stored as ``{KEY: True}``.
        * ``MC`` is reduced to the comma-joined second pipe-delimited piece
          of each of its entries.
        * An empty INFO column (``.``) adds no keys.

    Raises
    ------
    IndexError
        If a non-blank data line has fewer than eight tab-separated columns.
    """
    # one dict of parameters per variant
    variants_params = []
    with gzip.open(vcf_path, "rt") as file:
        for line in file:
            # meta-information and header lines carry no variant data
            if line.startswith("#"):
                continue
            # Strip the line terminator first: INFO is the last column read,
            # so otherwise its final value would keep a trailing "\n".
            line = line.rstrip("\r\n")
            if not line:
                continue
            fields = line.split("\t")
            dict_params = {
                "Chrom": fields[0],
                "Pos": fields[1],
                "Id": fields[2],
                "Ref": fields[3],
                "Alt": fields[4],
                "Qual": fields[5],
                "Filter": fields[6],
            }

            # INFO is a ';'-separated list of KEY=value pairs and bare flags
            info = fields[7]
            for clnv_param in info.split(";"):
                if clnv_param in ("", "."):
                    continue
                # partition keeps any further '=' inside the value intact
                key, has_value, value = clnv_param.partition("=")
                if not has_value:
                    # flag entry: its presence is the information
                    dict_params[key] = True
                    continue
                if key == "MC":
                    value = _mc_consequence_names(value)
                dict_params[key] = value

            variants_params.append(dict_params)
    return variants_params


# Parse the ClinVar release into per-variant records, then tabulate them.
# NOTE(review): the path is absolute and machine-specific; adjust before re-running elsewhere.
variants_params = clinvar_vcf_to_pd(
    "/home/ocanal/ANN_DIR/clinvar/hg38/clinvar_20231104.vcf.gz"
)
df = pd.DataFrame(data=variants_params)

In [5]:
# Show the DataFrame built from the parsed VCF records.
df

Unnamed: 0,Chrom,Pos,Id,Ref,Alt,Qual,Filter,ALLELEID,CLNDISDB,CLNDN,...,RS,AF_EXAC,AF_ESP,CLNSIGCONF,AF_TGP,CLNVI,CLNDISDBINCL,CLNDNINCL,CLNSIGINCL,DBVARID
0,1,69134,2205837,A,G,.,.,2193183,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,,,,,,,,,,
1,1,69581,2252161,C,G,.,.,2238986,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,,,,,,,,,,
2,1,69682,2396347,G,A,.,.,2386655,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,,,,,,,,,,
3,1,69769,2288999,T,C,.,.,2278803,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,,,,,,,,,,
4,1,69995,2351346,G,C,.,.,2333177,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2300616,NT_187693.1,273806,2219599,G,A,.,.,2206917,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,,,,,,,,,,
2300617,NT_187693.1,273866,2237818,A,C,.,.,2232003,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,,,,,,,,,,
2300618,NT_187693.1,274366,2206666,G,C,.,.,2200058,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,,,,,,,,,,
2300619,NT_187693.1,275068,2241971,T,C,.,.,2226217,"MeSH:D030342,MedGen:C0950123",Inborn_genetic_diseases,...,,,,,,,,,,


In [3]:
# Count how many rows carry each distinct CLNSIG value.
df["CLNSIG"].value_counts()


CLNSIG
Uncertain_significance                                            1119929
Likely_benign                                                      606654
Benign                                                             193415
Pathogenic                                                         135246
Conflicting_interpretations_of_pathogenicity                       103898
                                                                   ...   
Conflicting_interpretations_of_pathogenicity|association|other          1
Uncertain_risk_allele|protective                                        1
Affects|association                                                     1
other|risk_factor                                                       1
Likely_benign|risk_factor                                               1
Name: count, Length: 88, dtype: int64

In [4]:
# List the distinct CLNSIG values; rows whose INFO had no CLNSIG key appear as NaN.
df["CLNSIG"].unique()

array(['Likely_benign', 'Uncertain_significance', 'Benign',
       'Conflicting_interpretations_of_pathogenicity', 'Pathogenic',
       'Likely_pathogenic', 'Benign/Likely_benign', 'not_provided',
       'Pathogenic/Likely_pathogenic', nan, 'risk_factor', 'Affects',
       'association', 'Benign|other',
       'Conflicting_interpretations_of_pathogenicity|other',
       'drug_response',
       'Conflicting_interpretations_of_pathogenicity|association',
       'Uncertain_risk_allele', 'other',
       'Uncertain_significance|risk_factor',
       'Likely_pathogenic|risk_factor', 'Likely_benign|association',
       'Likely_risk_allele', 'Pathogenic/Likely_pathogenic|other',
       'Pathogenic|other',
       'Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance',
       'Pathogenic/Likely_pathogenic/Pathogenic,_low_penetrance|other',
       'protective', 'Pathogenic|risk_factor',
       'Pathogenic/Likely_pathogenic|risk_factor',
       'Benign/Likely_benign|risk_factor',
       'Uncerta