# Tri-allelic SNPs Illumina 

##1) Loading sequencing data

##2) Mapping of the reads against the reference
    Direct mapping using BWA-0.7.15

##3) Determine the SNP genotype at position 26 in the mapping


###1) Loading sequencing data

In [None]:
# Init

import os, glob

# Fixed input/output locations for this analysis
projectDir    = '/media/genomics/nanopore/projects/tri-allelic_SNPs/20180219_illumina_analysis_yg'
rawDataDir    = '/media/genomics/nanopore/run_data/20180219_Illumina_tri-allelic/PCRfree_Senne-65584528'
fastQDir      = os.path.join(projectDir, 'raw_data')
mappedDataDir = os.path.join(projectDir, 'mapped_data')
snpRefFile    = os.path.join(projectDir, 'triallelic_snp_regions.fasta')

# Sample names; the later cells build the per-sample fastq/bam/vcf file names from these
sampleList    = ['2800', '9947', '9948', 'Gednap50_Person_C', 'Gednap51_Person_C']

# External command-line tools, interpolated into the shell commands of the later cells
bwa           = '/opt/tools/bwa-0.7.15'
samtools      = '/opt/tools/samtools-1.3.1'
bcftools      = '/opt/tools/bcftools-1.3.1'

# Make sure the project, fastq and mapping directories are present
for directory in (projectDir, fastQDir, mappedDataDir):
  if os.path.exists(directory):
    print('Directory {} exists'.format(directory))
    continue
  print('Creating directory {}'.format(directory))
  os.makedirs(directory)

In [None]:
# Copy and rename fastq files (handle swithed samples)

fastqNameMap = {
  '2800_S1_L001_R1_001.fastq': '2800_R1.fastq',
  '2800_S1_L001_R2_001.fastq': '2800_R2.fastq',
  '9947_S3_L001_R1_001.fastq': '9948_R1.fastq',
  '9947_S3_L001_R2_001.fastq': '9948_R2.fastq',
  '9948_S2_L001_R1_001.fastq': '9947_R1.fastq',
  '9948_S2_L001_R2_001.fastq': '9947_R2.fastq',
  'G59_S5_L001_R1_001.fastq':  'G59_R1.fastq',
  'G59_S5_L001_R2_001.fastq':  'G59_R2.fastq',
  'G62_S4_L001_R1_001.fastq':  'G62_R1.fastq',
  'G62_S4_L001_R2_001.fastq':  'G62_R2.fastq'
}

for fastqOld, fastqNew in fastqNameMap.items():
  print('{} -> {}'.format(fastqOld, fastqNew))
  ! cp {os.path.join(rawDataDir, fastqOld)} {os.path.join(fastQDir, fastqNew)}
  
print('Done')

###2) Mapping of the reads against the reference

In [None]:
# Index reference
#
# Runs `bwa index` on the reference FASTA (snpRefFile). The mapping cell runs
# `bwa mem` against this same file, so this cell has to be executed first.

! {bwa} index {snpRefFile}

print('Done')

In [None]:
# Map reads to reference SNP region sequences (51 nt), call variants
#
# For every sample in sampleList this writes, in mappedDataDir:
#   <sample>_direct_mapping.bam (+ index) and <sample>_direct_mapping.vcf
#
for sampleName in sorted(sampleList):
  # Map amplicons
  read1File = os.path.join(fastQDir, '{}_R1.fastq'.format(sampleName))
  read2File = os.path.join(fastQDir, '{}_R2.fastq'.format(sampleName))
  bamFile   = os.path.join(mappedDataDir, '{}_direct_mapping.bam'.format(sampleName))
  # The VCF gets the same path as the BAM, with the extension swapped
  vcfFile   = bamFile.replace('.bam', '.vcf')
  
  # bwa mem (10 threads) on both read files -> convert SAM to BAM -> sort into bamFile
  ! {bwa} mem -t 10 {snpRefFile} {read1File}  {read2File} | {samtools} view -Sb - | {samtools} sort -o {bamFile} -
  ! {samtools} index {bamFile}
  
  # Variant calling
  # mpileup flags (per the samtools 1.3 manual): -d 100000 raises the per-file
  # depth cap, -B disables BAQ, -u gives uncompressed output for piping,
  # -f names the reference, -t AD adds per-allele depths to the sample column.
  # bcftools call: -m selects the multiallelic caller, -V indels skips indels.
  # No -v is given, so non-variant sites are written as well.
  !{samtools} mpileup -d 100000 -Buf {snpRefFile} -t AD {bamFile} | {bcftools} call -V indels -m - > {vcfFile}
  

print('Done')

###3) Determine the SNP genotype at position 26 in the mapping

In [4]:
def genotype(vcfFile, snpPos):
  """
  Extract genotype data at given position from a given vcf file.

  Every data line must hold exactly ten whitespace-separated columns
  (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT and one sample column).
  Header lines (starting with '#') and blank lines are skipped.

  Parameters:
    vcfFile: path of a plain-text VCF file.
    snpPos:  position (int) to report; compared with int(POS) of every record.

  Returns:
    dict keyed on the first column (CHROM) of each matching record. When several
    records share the same key, the last one wins. Each value is a dict with:
      'pos':      POS exactly as read from the file (a string)
      'alleles':  read depth per allele taken from FORMAT/AD; 'A', 'G', 'C' and
                  'T' are always present (0 when not reported), any other
                  REF/ALT allele is added under its full allele string
      'genotype': the called alleles joined by '/'; a missing call is shown
                  as '.', phased calls ('|') are reported with '/' as well
      'depth':    INFO/DP as int

  Raises:
    KeyError:   when a matching record lacks FORMAT/GT, FORMAT/AD or INFO/DP.
    ValueError: when a data line does not have exactly ten columns.
  """
  genotypeData = {}

  with open(vcfFile, 'rt') as inFile:
    for line in inFile:
      # Skip header lines and empty lines (e.g. a trailing newline at end of file)
      if line.startswith('#') or not line.strip():
        continue

      snpName, pos, _id, ref, alt, _qual, _filt, info, form, formValues = line.rstrip().split()

      if int(pos) != snpPos:
        continue

      # FORMAT keys and the sample values are two parallel ':'-separated lists
      formData = dict(zip(form.split(':'), formValues.split(':')))

      # INFO holds 'key=value' entries as well as bare flags (e.g. 'INDEL');
      # flags are stored as True instead of breaking the key/value unpacking
      infoData = {}
      for entry in info.split(';'):
        key, sep, value = entry.partition('=')
        infoData[key] = value if sep else True

      # Keep alleles as a list so that GT indices and AD values stay aligned
      # even when an allele is longer than one character
      alleles = [ref]
      if alt != '.':
        alleles.extend(alt.split(','))

      # AD lists one depth per allele, REF first; '.' means not available
      depths = [0 if x == '.' else int(x) for x in formData['AD'].split(',')]

      # GT holds allele indices separated by '/' (unphased) or '|' (phased);
      # '.' marks a missing call
      gtIndices = formData['GT'].replace('|', '/').split('/')
      gt        = ['.' if i == '.' else alleles[int(i)] for i in gtIndices]

      # Start from zero for the four bases, then fill in what AD reports
      alleleDepths = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
      alleleDepths.update(zip(alleles, depths))

      genotypeData[snpName] = {
        'pos':      pos,
        'alleles':  alleleDepths,
        'genotype': '/'.join(gt),
        'depth':    int(infoData['DP']),
      }

  return genotypeData


In [None]:
# Get all sample genotypes
#
# For every sample: read the record(s) at position 26 from the sample's VCF,
# print the read count and percentage per allele, and store the called genotype
# (allele letters, sorted) in genotypes[sampleName][snpName].
# A SNP without any reads supporting an allele is stored as '??' instead of
# being reported as a homozygous call.
#
excludeSnp = [] #['rs9274701'] # Amplicon is found, but content is not as expected
genotypes  = {}

for sampleName in sorted(sampleList):
  print("SNP's for sample {}".format(sampleName))
  genotypes[sampleName] = {}
  vcfFile               = os.path.join(mappedDataDir, '{}_direct_mapping.vcf'.format(sampleName))

  d = genotype(vcfFile, 26)

  for snpName in sorted(d):
    alleleDepths = d[snpName]['alleles']
    t            = sum(alleleDepths.values())

    # Likely genotype
    # gg: read count -> alleles having that count (in the order of alleleDepths)
    # vv: all read counts, highest first
    gg = {}
    for g, c in alleleDepths.items():
      gg.setdefault(c, []).append(g)
    vv = sorted([int(c) for c in alleleDepths.values()], reverse=True)

    if t == 0:
      # No read supports any allele, so there is nothing to call
      aa = '??'
    elif len(vv) > 1 and vv[1] > vv[0]/4:
      # Second allele must be more than 25% of main allele or it is ignored.
      if len(gg[vv[0]]) > 1:
        # Several alleles share the highest count: report all of them
        aa = ''.join(gg[vv[0]])
      else:
        aa = gg[vv[0]][0] + gg[vv[1]][0]
    else:
      # Only the main allele counts: homozygous
      aa = gg[vv[0]][0] + gg[vv[0]][0]

    called = ''.join(sorted(aa))

    # One report line: counts, percentages, allele reads / total depth, genotype
    o = '  {:<14}  '.format(snpName)

    for g in sorted(alleleDepths):
      o += '  {}:{:<5}'.format(g, alleleDepths[g])
    for g in sorted(alleleDepths):
      if t > 0:
        o += '  {}:{:>3}%'.format(g, round(100*alleleDepths[g]/t))
      else:
        o += '  {}:{:>3}%'.format(g, '???')
    o += '    {:>5}/{:<5} reads'.format(t, d[snpName]['depth'])
    o += '    {}'.format(called)
    genotypes[sampleName][snpName] = called
    print(o)

  print('')
  