# Explore SNP VCF Files

Parses the SNP VCF files for the WBA (healthy) and T1_1A (tumor) samples and compares the two.

In [55]:
import allel
import pandas

# Both per-sample call sets live under the same sequencing-run directory.
_SAMPLE_ROOT = "datasets/rarekidneycancer_patient_0/F18FTSUSAT0015_HUMaasR"

# WBA is the healthy sample, T1_1A the tumor sample.
HEALTHY_SNP_PATH = _SAMPLE_ROOT + "/WBA/result_variation/snp/WBA.snp.vcf"
TUMOR_SNP_PATH = _SAMPLE_ROOT + "/T1_1A/result_variation/snp/T1_1A.snp.vcf"

# Somatic SNV calls (file name suggests SnpEff-annotated output; verify upstream).
SOMATIC_VCF = "datasets/somatic/results/variants/somatic.snvs.snpeff.vcf"

# Helper Functions

In [84]:
def load_vcf(path, include_filter=False):
    """Load a VCF file into a DataFrame indexed by ``CHROM_POS``.

    Args:
        path: Path of the VCF file; passed straight to ``allel.read_vcf``.
        include_filter: When true, add a ``FILTER_PASS`` column taken from
            the callset's ``variants/FILTER_PASS`` array.

    Returns:
        A ``(snp, callset)`` tuple. ``snp`` is a ``pandas.DataFrame`` with
        the columns ``CHROM``, ``POS``, ``REF`` and ``ALT`` (plus
        ``FILTER_PASS`` when requested), indexed by ``key``
        (``"<CHROM>_<POS>"``). ``callset`` is the raw dict returned by
        ``allel.read_vcf``.

    Raises:
        ValueError: If no variant records could be read from ``path``.
    """
    callset = allel.read_vcf(path)
    # read_vcf yields None (not an empty dict) when the file holds no
    # variant records; fail with a clear message instead of a TypeError
    # on the first subscript below.
    if callset is None:
        raise ValueError("no variants found in VCF file: {}".format(path))

    chroms = callset['variants/CHROM']
    positions = callset['variants/POS']

    columns = {
        "key": ["{}_{}".format(chrom, pos)
                for chrom, pos in zip(chroms, positions)],
        "CHROM": chroms,
        "POS": positions,
        "REF": callset['variants/REF'],
        # Each ALT row is a fixed-width array padded with empty strings.
        # Join the real alleles with "," (VCF convention) so a multi-allelic
        # site such as A,T is not fused into the bogus allele "AT".
        "ALT": [",".join(allele for allele in alts if allele)
                for alts in callset['variants/ALT']],
    }
    if include_filter:
        columns["FILTER_PASS"] = callset['variants/FILTER_PASS']

    # Echo the column names that end up in the table.
    print(columns.keys())
    snp = pandas.DataFrame(columns).set_index('key')
    return snp, callset

# Healthy VCF

In [43]:
# load_vcf returns a (DataFrame, callset) pair; unpack it so healthy_snp is
# the table itself rather than the whole tuple.
healthy_snp, healthy_callset = load_vcf(HEALTHY_SNP_PATH)
healthy_snp

Unnamed: 0_level_0,ALT,CHROM,POS,REF
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chrM_410,T,chrM,410,A
chrM_2354,T,chrM,2354,C
chrM_3916,A,chrM,3916,G
chrM_4728,G,chrM,4728,A
chrM_5581,T,chrM,5581,C
chrM_7029,C,chrM,7029,T
chrM_8702,A,chrM,8702,G
chrM_9378,A,chrM,9378,G
chrM_9381,A,chrM,9381,G
chrM_9541,T,chrM,9541,C


# Tumor VCF

In [44]:
# load_vcf returns a (DataFrame, callset) pair; unpack it so tumor_snp is
# the table itself rather than the whole tuple.
tumor_snp, tumor_callset = load_vcf(TUMOR_SNP_PATH)
tumor_snp

Unnamed: 0_level_0,ALT,CHROM,POS,REF
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
chrM_150,C,chrM,150,T
chrM_195,T,chrM,195,C
chrM_204,C,chrM,204,T
chrM_239,C,chrM,239,T
chrM_2354,T,chrM,2354,C
chrM_2485,T,chrM,2485,C
chrM_2708,A,chrM,2708,G
chrM_3916,A,chrM,3916,G
chrM_4728,G,chrM,4728,A
chrM_5581,T,chrM,5581,C


# Somatic Variants

In [85]:
# Load the somatic calls with the FILTER_PASS column included; the raw callset
# is kept alongside the table for ad-hoc inspection.
somatic, callset = load_vcf(SOMATIC_VCF, include_filter=True)
# callset.keys()

dict_keys(['key', 'CHROM', 'POS', 'REF', 'ALT', 'FILTER_PASS'])


In [87]:
# Display the somatic variant table loaded above.
somatic

Unnamed: 0_level_0,ALT,CHROM,FILTER_PASS,POS,REF
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr10_65979,C,chr10,False,65979,T
chr10_66274,A,chr10,False,66274,G
chr10_66397,T,chr10,False,66397,C
chr10_66481,T,chr10,False,66481,C
chr10_66830,C,chr10,False,66830,G
chr10_66877,C,chr10,False,66877,T
chr10_66880,C,chr10,False,66880,T
chr10_67377,A,chr10,False,67377,C
chr10_67396,T,chr10,False,67396,G
chr10_67402,A,chr10,False,67402,C
