# Variants Filtering, Analysis

This notebook will introduce how variants are filtered and analyzed.

## Introduction

## Preparation

### Import Python Modules

In [1]:
import re, glob, sys, os, subprocess, getopt, time
import matplotlib.pyplot as plt

### Functions

In [2]:
def mkdir(*folders):
    for folder in folders:
        subprocessRun('Make folder',folder ,'mkdir -p {0}'.format(folder))

In [3]:
def cp(file, folder):
    subprocessRun('Copy files',file +" > "+ folder,'cp {0} {1}'.format(file,folder))

In [4]:
def filePath2ID(file):
    searchObj = re.search('.*\/([^\.\n\t\r\f]+)\..*',file)
    ID = searchObj.group(1)
    return(ID)

In [5]:
def subprocessRun(title,name,cmd):

    title1 = title + " start"
    print(time.asctime(time.localtime(time.time()))+" "+title1.center(30,' ').center(70,'*'))
    print(time.asctime(time.localtime(time.time()))+" "+name.center(30,' ').center(70,'*'))
    sys.stdout.flush()

    p = subprocess.Popen([cmd],shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    p.wait()

    print("stdout".center(20,' ').center(40,'='))
    sys.stdout.flush()
    for i in p.stdout.readlines():
        print(str(i).strip('\n'))
        sys.stdout.flush()
    print("stderr".center(20,' ').center(40,'='))
    sys.stdout.flush()
    for i in p.stderr.readlines():
        print(str(i).strip('\n'))
        sys.stdout.flush()

    title2 = title + " end"
    print(time.asctime(time.localtime(time.time()))+" "+title2.center(30,' ').center(70,'*'))
    print(time.asctime(time.localtime(time.time()))+" "+name.center(30,' ').center(70,'*'))
    print("\n")
    sys.stdout.flush()

In [6]:
def specific(variants):
    IDs = list(variants.keys())
    variants_samples = {}
    for i in IDs:
        for j in variants[i]:
            if j in variants_samples.keys():
                variants_samples[j].append(i)
            else:
                variants_samples[j] = []
                variants_samples[j].append(i)

    specificVariants = {}
    allVariants = list(variants_samples.keys())
    for i in allVariants:
        if len(variants_samples[i]) == 1:
            ID = variants_samples[i][0]
            if ID in specificVariants.keys():
                specificVariants[ID].append(i)
            else:
                specificVariants[ID] = []
                specificVariants[ID].append(i)
    return(specificVariants)
def double(variants):
    IDs = list(variants.keys())
    variants_samples = {}
    for i in IDs:
        for j in variants[i]:
            if j in variants_samples.keys():
                variants_samples[j].append(i)
            else:
                variants_samples[j] = []
                variants_samples[j].append(i)

    specificVariants = {}
    n = 2
    allVariants = list(variants_samples.keys())
    for i in allVariants:
        if len(variants_samples[i]) == n:
            for j in range(n):
                
                ID = variants_samples[i][n]
                if ID in specificVariants.keys():
                    specificVariants[ID].append(i)
                else:
                    specificVariants[ID] = []
                    specificVariants[ID].append(i)
    return(specificVariants)

In [7]:
class vcf:

    file = ''
    header = []
    title = ''
    sites = []
    variants = []
    annotations = {}
    ID = ''

    def __init__(self,filePath):

        self.file = ''
        self.header = []
        self.title = ''
        self.sites = []
        self.variants = []
        self.annotations = {}
        self.ID = ''

        self.file = filePath

        inputs = open(self.file,'r')
        lines = list(inputs.readlines())
        inputs.close()

        searchObj = re.search('.*\/([^\.\n\t\r\f]+)\..*',self.file)
        self.ID = searchObj.group(1)

        for i in lines:

            i = i.strip('\n')
            if i[0] == "#" and i[1] == "#":

                self.header.append(i)

            elif i[0] == "#":

                self.title = i

            else:

                cells = re.split('\t',i)

                Chr = cells[0]
                Posi = cells[1]
                Ref = cells[3]
                Alt = cells[4]
                Qual = cells[5]
                Type = ''

                if len(Ref) == len(Alt) and len(Ref) == 1:
                    Type = 'SNP'
                elif len(Ref) > len(Alt):
                    Type = "Deletion"
                elif len(Ref) < len(Alt):
                    Type = "Insertion"

                site = Chr + "\t" + Posi
                self.sites.append(site)

                variant = Chr + "\t" + Posi + "\t" + Ref + "\t" + Alt
                self.variants.append(variant)
                self.annotations[variant] = {}
                self.annotations[variant]['Quality'] = Qual
                self.annotations[variant]['Type'] = Type

                info = re.split(';',cells[7])

                for j in info:

                    KeyValue = re.search('(\S+)=(\S+)',j)
                    if KeyValue != None:
                        Key = KeyValue.group(1)
                        Value = KeyValue.group(2)
                        self.annotations[variant][Key] = Value

                Formats = re.split(':',cells[8])
                FormatsValue = re.split(':',cells[9])

                for j in range(len(Formats)):
                    
                    Key = Formats[j]
                    Value = FormatsValue[j]

                    if Key == "AD":
                        AD = re.split(',',Value)
                        Value = (float(AD[0])+1)/(float(AD[1])+1)

                    self.annotations[variant][Key] = Value

### Project

In [8]:
project = "/data/yangyusheng/20190927_V6"
callers = ['Samtools','GATK','Deepvariant']

## Procedure

### SpecificVariants

In [9]:
variants = {}
specificVariants = {}
for i in callers:
    variants[i] = {}
    specificVariants[i] = {}
    samples = glob.glob("{0}/1_Files/4_Variants_{1}/*.Normalized.NoRefError.vcf".format(project,i))
    for j in samples:
        ID = filePath2ID(j)
        tmp = vcf(j)
        variants[i][ID] = tmp.variants
    specificVariants[i] = specific(variants[i])

In [10]:
file = open("{0}/1_Files/4_Variants_GATK_GVCF/output_SB.vcf".format(project))
lines = list(file.readlines())
file.close()

refErrorTxt = open('{0}/0_Preparation/refError.txt'.format(project),'r')
refErrorLines = list(refErrorTxt.readlines())
refErrorTxt.close()
refError = []
for i in refErrorLines:
    refError.append(i.strip('\n'))

In [11]:
variants['GATK_GVCF'] = {}

In [12]:
header = []
allVariants = {}
for i in lines:
    if i[0] == "#" and i[1] != "#":
        header = re.split('\t',i.strip('\n'))
    elif i[0] != "#" and i[1] != "#":
        info = re.split('\t',i.strip('\n'))
        tmpvariant = info[0]+"\t"+info[1]+"\t"+info[3]+"\t"+info[4]
        if ',' not in info[4] and tmpvariant not in refError:
            Chr = info[0]
            Posi = info[1]
            Ref = info[3]
            Alt = info[4]
            Qual = info[5]
            Info = info[7]
            Formats = re.split(':',info[8])
            Type = ""
            if len(Ref) == len(Alt) and len(Ref) == 1:
                Type = 'SNP'
            elif len(Ref) > len(Alt):
                Type = "Deletion"
            elif len(Ref) < len(Alt):
                Type = "Insertion"

            site = info[0] + "\t" + info[1] + "\t" + info[3] + "\t" + info[4]

            for j in range(9,len(header)):
                ID = header[j][1:]
                FormatsValue = re.split(':',info[j])
                if ID in allVariants.keys():
                    if FormatsValue[0] == "0/1" or FormatsValue[0] == "0|1":
                        allVariants[ID][site] = {}
                        allVariants[ID][site]['Quality'] = Qual
                        allVariants[ID][site]['Type'] = Type
#                     if len(Formats) != len(FormatsValue):
#                         print(i)
#                         print(Formats)
#                         print(FormatsValue)
                        for k in range(len(Formats)):

                            Key = Formats[k]
                            try:
                                Value = FormatsValue[k]
                            except IndexError:
                                Value = ""

                            if Key == "AD":
                                AD = re.split(',',Value)
                                Value = float(AD[1])
                            allVariants[ID][site][Key] = Value

                        notes = re.split(';',Info)
                        for n in notes:
                            KeyValue = re.search('(\S+)=(\S+)',n)
                            if KeyValue != None:
                                Key = KeyValue.group(1)
                                Value = KeyValue.group(2)
                                allVariants[ID][site][Key] = Value
                else:
                    if FormatsValue[0] == "0/1" or FormatsValue[0] == "0|1":
                        allVariants[ID] = {}
                        allVariants[ID][site] = {}
                        allVariants[ID][site]['Quality'] = Qual
                        allVariants[ID][site]['Type'] = Type
                        for k in range(len(Formats)):

                            Key = Formats[k]
                            try:
                                Value = FormatsValue[k]
                            except IndexError:
                                Value = ""

                            if Key == "AD":
                                AD = re.split(',',Value)
                                Value = float(AD[1])
                            allVariants[ID][site][Key] = Value

                        notes = re.split(';',Info)
                        for n in notes:
                            KeyValue = re.search('(\S+)=(\S+)',n)
                            if KeyValue != None:
                                Key = KeyValue.group(1)
                                Value = KeyValue.group(2)
                                allVariants[ID][site][Key] = Value
                    
        else:
            pass
#             print("Filtered: " + i)





In [13]:
IDs = [
 'NSK-0912-175_N702-N505',
 'NSK-0912-176_AK9183-N505',
 'NSK-0912-177_N704-N505',
 'NSK-0912-178_N705-N505',
 'NSK-0912-180_N707-N505',
 'NSK-0912-181_N708-N505',
 'NSK-0912-182_N709-N505' ]
callers = ['Samtools','GATK','GATK_GVCF','Deepvariant']
specificVariants['GATK_GVCF'] = {}
for i in IDs:
    variants['GATK_GVCF'][i] = list(allVariants[i].keys())
specificVariants['GATK_GVCF'] = specific(variants['GATK_GVCF'])

### SNP and Indel

In [14]:
SNP = {}
Indel = {}
# callers = ['GATK','GATK_GVCF']
for i in callers:
    SNP[i] = {}
    Indel[i] = {}
    for j in IDs:
        for k in specificVariants[i][j]:
            info = re.split('\t',k)
            if len(info[2]) == len(info[3]):
                if j in SNP[i].keys():
                    SNP[i][j].append(k)
                else:
                    SNP[i][j] = []
                    SNP[i][j].append(k)
            else:
                if j in Indel[i].keys():
                    Indel[i][j].append(k)
                else:
                    Indel[i][j] = []
                    Indel[i][j].append(k)

In [15]:
for j in IDs:
    print(j,end="\n")
    for i in callers:
        print(i.ljust(10),end="\t")
        total = len(SNP[i][j]) + len(Indel[i][j])
        print(str(total),end="\t")
        print(str(len(SNP[i][j])),end="\t")
        print(str(len(Indel[i][j])),end="\n")
    print("\n")

NSK-0912-175_N702-N505
Samtools  	1387	1331	56
GATK      	1467	1357	110
GATK_GVCF 	1418	1343	75
Deepvariant	1320	1256	64


NSK-0912-176_AK9183-N505
Samtools  	88	52	36
GATK      	128	60	68
GATK_GVCF 	67	45	22
Deepvariant	43	31	12


NSK-0912-177_N704-N505
Samtools  	1533	1472	61
GATK      	1619	1517	102
GATK_GVCF 	1570	1501	69
Deepvariant	1514	1448	66


NSK-0912-178_N705-N505
Samtools  	1895	1822	73
GATK      	2165	2036	129
GATK_GVCF 	2106	2027	79
Deepvariant	1537	1472	65


NSK-0912-180_N707-N505
Samtools  	853	826	27
GATK      	958	888	70
GATK_GVCF 	905	878	27
Deepvariant	793	770	23


NSK-0912-181_N708-N505
Samtools  	993	946	47
GATK      	1082	977	105
GATK_GVCF 	1019	972	47
Deepvariant	908	862	46


NSK-0912-182_N709-N505
Samtools  	1724	1662	62
GATK      	1870	1725	145
GATK_GVCF 	1789	1702	87
Deepvariant	1522	1456	66




### Indel Analysis

In [16]:
indels = {}
for i in IDs:
    indels[i] = {}
    for j in callers:
        indels[i][j] = Indel[j][i] 

In [17]:
specificIndels = {}
for i in IDs:
    specificIndels[i] = specific(indels[i])
    print(i)
    for j in callers:
        if j in specificIndels[i].keys():
            print(j.ljust(30)+str(len(specificIndels[i][j])).ljust(10)+str(len(indels[i][j])))
        else:
            print(j.ljust(30)+str(0).ljust(10)+str(len(indels[i][j])))

NSK-0912-175_N702-N505
Samtools                      13        56
GATK                          30        110
GATK_GVCF                     0         75
Deepvariant                   9         64
NSK-0912-176_AK9183-N505
Samtools                      33        36
GATK                          48        68
GATK_GVCF                     4         22
Deepvariant                   8         12
NSK-0912-177_N704-N505
Samtools                      12        61
GATK                          29        102
GATK_GVCF                     0         69
Deepvariant                   9         66
NSK-0912-178_N705-N505
Samtools                      27        73
GATK                          45        129
GATK_GVCF                     1         79
Deepvariant                   10        65
NSK-0912-180_N707-N505
Samtools                      12        27
GATK                          40        70
GATK_GVCF                     2         27
Deepvariant                   3         23
NSK-0912-181_N708-N5

In [18]:
specificIndels = {}
for i in IDs:
    specificIndels[i] = specific(indels[i])
    print(i)
    for j in callers:
        if j in specificIndels[i].keys():
            print(j.ljust(30)+str(len(specificIndels[i][j])).ljust(10)+str(len(indels[i][j])))
        else:
            print(j.ljust(30)+str(0).ljust(10)+str(len(indels[i][j])))

NSK-0912-175_N702-N505
Samtools                      13        56
GATK                          30        110
GATK_GVCF                     0         75
Deepvariant                   9         64
NSK-0912-176_AK9183-N505
Samtools                      33        36
GATK                          48        68
GATK_GVCF                     4         22
Deepvariant                   8         12
NSK-0912-177_N704-N505
Samtools                      12        61
GATK                          29        102
GATK_GVCF                     0         69
Deepvariant                   9         66
NSK-0912-178_N705-N505
Samtools                      27        73
GATK                          45        129
GATK_GVCF                     1         79
Deepvariant                   10        65
NSK-0912-180_N707-N505
Samtools                      12        27
GATK                          40        70
GATK_GVCF                     2         27
Deepvariant                   3         23
NSK-0912-181_N708-N5

### IGV check

In [19]:
folders = glob.glob("/data/yangyusheng/igvImages/IGV_results/*")

In [20]:
igvResults = {}
for i in folders:
    igvResults[i] = glob.glob("{0}/*.png".format(i))

In [21]:
newIGVresults = {}
for i in folders:
    newIGVresults[i] = []
    for j in igvResults[i]:
        j = re.sub('/.*/','',j)
        newIGVresults[i].append(j)

In [22]:
check = {}
for i in IDs:
    check[i] = {}
    for j in specificIndels[i]['GATK_GVCF']:
        info = re.split('\t',j)
        va = i+"_"+info[0]+"_"+info[1]+"_"+info[2]+"_"+info[3]+".png"
        for k in folders:
            if va in newIGVresults[k]:
                if k in check[i].keys():
                    check[i][k].append(va)
                else:
                    check[i][k] = []
                    check[i][k].append(va)
            

KeyError: 'GATK_GVCF'

In [None]:
for i in IDs:
    print(i)
    for k in folders:
        print(k.ljust(70),end="")
        if k in check[i].keys():
            print(len(check[i][k]),end="\n")
        else:
            print("0",end="\n")

In [None]:
check['NSK-0912-182_N709-N505']['/data/yangyusheng/igvImages/IGV_results/TruePositive']

In [None]:
specificIndels['NSK-0912-180_N707-N505']['GATK_GVCF']