In [22]:
import os
import pandas as pd

def parse_vep_vcf(vcf_file, log_file="vcf_analysis_log.txt"):
    variants = []

    with open(vcf_file, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue  
            cols = line.strip().split("\t")
            if len(cols) < 8:
                continue 

            chrom, pos, snp_id, ref, alt, qual, filt, info = cols[:8]

           
            if ":" in pos:
                chrom, pos = pos.split(":")
            elif ":" in chrom:
                chrom, pos = chrom.split(":")
            

            try:
                pos = int(pos)
            except ValueError:
                continue 

            variants.append({
                "Chrom": chrom,
                "Pos": pos,
                "SNP_ID": snp_id,
                "Ref": ref,
                "Alt": alt,
                "INFO": info
            })


    df = pd.DataFrame(variants)

    
    def extract_info_field(info, key):
        for item in info.split(";"):
            if item.startswith(key + "="):
                return item.split("=", 1)[1]
        return None

    df["Gene"] = df["INFO"].apply(lambda x: extract_info_field(x, "GENE") or "")
    df["ClinSig"] = df["INFO"].apply(lambda x: extract_info_field(x, "CLNSIG") or "")
    df["Trait"] = df["INFO"].apply(lambda x: extract_info_field(x, "CLNDN") or "")

  
    total_variants = len(df)
    unique_genes = set(g for g in df["Gene"].tolist() if g)
    unique_traits = set(t for t in df["Trait"].tolist() if t)

    
    pathogenic_variants = df[df["ClinSig"].str.contains("pathogenic", case=False, na=False)]

    
    per_chrom = (
        df.groupby("Chrom")
        .agg(
            total_variants=("Pos", "count"),
            snp_ids=("SNP_ID", lambda x: (x != ".").sum()),
            no_snp_ids=("SNP_ID", lambda x: (x == ".").sum()),
        )
        .reset_index()
    )

    
    with open(log_file, "w") as log:
        log.write(f"File: {vcf_file}\n")
        log.write(f"Total Variants: {total_variants}\n")
        log.write(f"Unique Genes: {len(unique_genes)}\n")
        log.write(f"Unique Traits: {len(unique_traits)}\n\n")

        log.write("=== Per Chromosome Summary ===\n")
        log.write(per_chrom.to_string(index=False))
        log.write("\n\n")

        log.write("=== Pathogenic Variants (first 20 shown) ===\n")
        log.write(pathogenic_variants.head(20).to_string(index=False))
        log.write("\n")

    return {
        "total_variants": total_variants,
        "unique_genes": unique_genes,
        "unique_traits": unique_traits,
        "pathogenic_variants": pathogenic_variants,
        "per_chrom": per_chrom,
    }


In [24]:
results = parse_vep_vcf(r"C:\Users\other user\Downloads\test1_data_annotated.vcf")

print("Total Variants:", results["total_variants"])
print("Unique Genes:", len(results["unique_genes"]))
print("Unique Traits:", len(results["unique_traits"]))

display(results["per_chrom"].head())

display(results["pathogenic_variants"].head())


Total Variants: 29248
Unique Genes: 0
Unique Traits: 0


Unnamed: 0,Chrom,total_variants,snp_ids,no_snp_ids
0,1,2986,2986,0
1,10,1301,1301,0
2,11,1994,1994,0
3,12,1464,1464,0
4,13,525,525,0


Unnamed: 0,Chrom,Pos,SNP_ID,Ref,Alt,INFO,Gene,ClinSig,Trait


In [25]:
results = parse_vep_vcf(r"C:\Users\other user\Downloads\test2_data_annotated.vcf")

print("Total Variants:", results["total_variants"])
print("Unique Genes:", len(results["unique_genes"]))
print("Unique Traits:", len(results["unique_traits"]))

display(results["per_chrom"].head())

display(results["pathogenic_variants"].head())


Total Variants: 644831
Unique Genes: 0
Unique Traits: 0


Unnamed: 0,Chrom,total_variants,snp_ids,no_snp_ids
0,1,49146,49146,0
1,10,30649,30649,0
2,11,31283,31283,0
3,12,29375,29375,0
4,13,23244,23244,0


Unnamed: 0,Chrom,Pos,SNP_ID,Ref,Alt,INFO,Gene,ClinSig,Trait


In [26]:
import os
print(os.getcwd())


C:\Users\other user
