## Tri-allelic SNPs Illumina (re-run)

* New libraries
* Original fastq files: the sample labels 9947 and 9948 are switched! (This is corrected when the files are copied and renamed below.)



In [2]:
# Init
#
import os, glob
import shutil

# Locations: all results live below projectDir, the raw Illumina run is read from rawDataDir
projectDir = '/media/genomics/nanopore/projects/tri-allelic_SNPs/20180219_illumina_analysis_yg'
rawDataDir = '/media/genomics/nanopore/run_data/20180219_Illumina_tri-allelic/PCRfree_Senne-65584528'
fastQDir = os.path.join(projectDir, 'raw_data')
mappedDataDir = os.path.join(projectDir, 'mapped_data')
snpRefFile = os.path.join(projectDir, 'triallelic_snp_regions.fasta')

# Samples processed by the cells below
sampleList = ['2800', '9947', '9948', 'G59', 'G62']

# External tool executables
bwa = '/opt/tools/bwa-0.7.15'
samtools = '/opt/tools/samtools-1.3.1'
bcftools = '/opt/tools/bcftools-1.3.1'

# Make sure the working directories are there, reporting what was done for each
for workDir in (projectDir, fastQDir, mappedDataDir):
  if os.path.exists(workDir):
    print('Directory {} exists'.format(workDir))
    continue

  print('Creating directory {}'.format(workDir))
  os.makedirs(workDir)


Directory /media/genomics/nanopore/tri-allelic_SNPS/20180219_illumina_analysis_yg exists
Directory /media/genomics/nanopore/tri-allelic_SNPS/20180219_illumina_analysis_yg/raw_data exists
Directory /media/genomics/nanopore/tri-allelic_SNPS/20180219_illumina_analysis_yg/mapped_data exists


In [10]:
# Copy and rename fastq files (handle switched samples)
#
# Keys are the file names as delivered in rawDataDir, values the names used in
# fastQDir. The 9947 and 9948 entries are deliberately crossed over because
# those two samples are switched in the original fastq files.
fastqNameMap = {
  '2800_S1_L001_R1_001.fastq': '2800_R1.fastq',
  '2800_S1_L001_R2_001.fastq': '2800_R2.fastq',
  '9947_S3_L001_R1_001.fastq': '9948_R1.fastq',
  '9947_S3_L001_R2_001.fastq': '9948_R2.fastq',
  '9948_S2_L001_R1_001.fastq': '9947_R1.fastq',
  '9948_S2_L001_R2_001.fastq': '9947_R2.fastq',
  'G59_S5_L001_R1_001.fastq':  'G59_R1.fastq',
  'G59_S5_L001_R2_001.fastq':  'G59_R2.fastq',
  'G62_S4_L001_R1_001.fastq':  'G62_R1.fastq',
  'G62_S4_L001_R2_001.fastq':  'G62_R2.fastq'
}

for fastqOld, fastqNew in fastqNameMap.items():
  print('{} -> {}'.format(fastqOld, fastqNew))
  # shutil.copy instead of `! cp`: the paths are not re-parsed by a shell, and a
  # missing or unreadable source file raises here instead of being ignored
  # (the shell version printed 'Done' even when a copy had failed).
  shutil.copy(os.path.join(rawDataDir, fastqOld), os.path.join(fastQDir, fastqNew))

print('Done')

2800_S1_L001_R1_001.fastq -> 2800_R1.fastq
2800_S1_L001_R2_001.fastq -> 2800_R2.fastq
9947_S3_L001_R1_001.fastq -> 9948_R1.fastq
9947_S3_L001_R2_001.fastq -> 9948_R2.fastq
9948_S2_L001_R1_001.fastq -> 9947_R1.fastq
9948_S2_L001_R2_001.fastq -> 9947_R2.fastq
G59_S5_L001_R1_001.fastq -> G59_R1.fastq
G59_S5_L001_R2_001.fastq -> G59_R2.fastq
G62_S4_L001_R1_001.fastq -> G62_R1.fastq
G62_S4_L001_R2_001.fastq -> G62_R2.fastq
Done


In [6]:
# Index reference
#
# Runs `bwa index` on the SNP reference FASTA (snpRefFile) through the IPython
# shell escape; the later `bwa mem` call is given the same snpRefFile.
# NOTE: 'Done' is printed unconditionally, so a failed bwa run is only visible
# in the tool's own log output.
! {bwa} index {snpRefFile}

print('Done')

[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.00 sec
[main] Version: 0.7.15-r1140
[main] CMD: /opt/tools/bwa-0.7.15 index /media/genomics/nanopore/tri-allelic_SNPS/20180219_illumina_analysis_yg/triallelic_snp_regions.fasta
[main] Real time: 0.026 sec; CPU: 0.009 sec
Done


In [3]:
# Map reads to reference SNP region sequences (51 nt), call variants
#
# For every sample this writes <sample>_direct_mapping.bam (plus its index) and
# <sample>_direct_mapping.vcf into mappedDataDir. The shell commands run through
# the IPython `!` escape; their exit status is not checked, so 'Done' is printed
# even if a tool failed.
for sampleName in sorted(sampleList):
  # Map amplicons
  read1File = os.path.join(fastQDir, '{}_R1.fastq'.format(sampleName))
  read2File = os.path.join(fastQDir, '{}_R2.fastq'.format(sampleName))
  bamFile   = os.path.join(mappedDataDir, '{}_direct_mapping.bam'.format(sampleName))
  # VCF gets the same path as the BAM with the extension swapped
  vcfFile   = bamFile.replace('.bam', '.vcf')
  
  # Paired-end alignment (bwa mem, 10 threads) -> BAM conversion -> sorted BAM written to bamFile
  ! {bwa} mem -t 10 {snpRefFile} {read1File}  {read2File} | {samtools} view -Sb - | {samtools} sort -o {bamFile} -
  ! {samtools} index {bamFile}
  
  # Variant calling
  # mpileup: -d 100000 raises the depth cap, -B disables BAQ, -u writes uncompressed
  #          output for the pipe, -f names the reference, -t AD adds the per-allele
  #          depth tag (AD) that genotype() reads from the VCF.
  # bcftools call: -m selects the multiallelic caller, -V indels skips indel records.
  !{samtools} mpileup -d 100000 -Buf {snpRefFile} -t AD {bamFile} | {bcftools} call -V indels -m - > {vcfFile}
  

print('Done')

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 381286 sequences (57574186 bp)...
[M::mem_pestat] # candidate unique pairs for (FF, FR, RF, RR): (0, 29066, 0, 0)
[M::mem_pestat] skip orientation FF as there are not enough pairs
[M::mem_pestat] analyzing insert size distribution for orientation FR...
[M::mem_pestat] (25, 50, 75) percentile: (50, 50, 50)
[M::mem_pestat] low and high boundaries for computing mean and std.dev: (50, 50)
[M::mem_pestat] mean and std.dev: (50.00, 0.00)
[M::mem_pestat] low and high boundaries for proper pairs: (50, 50)
[M::mem_pestat] skip orientation RF as there are not enough pairs
[M::mem_pestat] skip orientation RR as there are not enough pairs
[M::mem_process_seqs] Processed 381286 reads in 22.158 CPU sec, 2.437 real sec
[main] Version: 0.7.15-r1140
[main] CMD: /opt/tools/bwa-0.7.15 mem -t 10 /media/genomics/nanopore/tri-allelic_SNPS/20180219_illumina_analysis_yg/triallelic_snp_regions.fasta /media/genomics/nanopore/tri-allelic_SNPS/20180

In [4]:
def genotype(vcfFile, snpPos):
  """
  Extract genotype data at given position from a given vcf file.

  Parameters
  ----------
  vcfFile : str
    Path to a plain-text, single-sample VCF (exactly 10 columns per record).
    The FORMAT column must provide GT and AD, the INFO column must provide DP.
  snpPos : int
    Value of the POS column to report; records at other positions are skipped.

  Returns
  -------
  dict
    Maps the first column (CHROM, here the SNP name) of every matching record to
      'pos'      : POS as found in the file (str)
      'alleles'  : dict allele -> AD read count; 'A', 'G', 'C' and 'T' are always
                   present (0 when not in the record), any other allele listed in
                   REF/ALT is added under its own name
      'genotype' : called alleles joined with '/'; a missing call is kept as '.'
      'depth'    : INFO/DP as int

  Raises
  ------
  ValueError
    If a record does not have exactly 10 columns or a numeric field is malformed.
  KeyError
    If GT, AD or DP is absent from a matching record.
  """
  genotypeData = {}

  with open(vcfFile, 'rt') as inFile:
    for line in inFile:
      fields = line.rstrip().split()

      # Header lines and blank lines carry no record
      if not fields or line.startswith('#'):
        continue

      snpName, pos, _id, ref, alt, _qual, _filt, info, form, formValues = fields

      if int(pos) != snpPos:
        continue

      # FORMAT keys paired with the sample's values
      formData = dict(zip(form.split(':'), formValues.split(':')))

      # INFO entries are 'key=value' or bare flags (e.g. INDEL); flags get an empty value
      infoData = {}
      for entry in info.split(';'):
        key, _, value = entry.partition('=')
        infoData[key] = value

      # Keep alleles as a list so GT/AD indices stay valid for multi-character alleles
      alleles = [ref]
      if alt != '.':
        alleles.extend(alt.split(','))

      depths = [int(d) for d in formData['AD'].split(',')]

      # Accept both unphased ('/') and phased ('|') genotypes; '.' is a missing call
      gtIndices = formData['GT'].replace('|', '/').split('/')
      gt        = ['.' if i == '.' else alleles[int(i)] for i in gtIndices]

      alleleDepths = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
      for i, allele in enumerate(alleles):
        alleleDepths[allele] = depths[i]

      genotypeData[snpName] = {'pos': pos, 'alleles': alleleDepths, 'genotype': '/'.join(gt), 'depth': int(infoData['DP'])}

  return genotypeData


In [5]:
# Get all sample genotypes
#
# For every sample: read the per-allele read depths at the SNP position from the
# sample's VCF, print one summary line per SNP and store the likely genotype in
# genotypes[sampleName][snpName] (alleles sorted and concatenated, e.g. 'AG').
# NOTE: excludeSnp is defined here but not applied in this cell.
excludeSnp = [] #['rs9274701'] # Amplicon is found, but content is not as expected
genotypes  = {}

snpPosition             = 26    # position passed to genotype(); centre base of a 51 nt region
minSecondAlleleFraction = 0.25  # second allele is ignored unless it has more than this fraction of the main allele's reads
noCallGenotype          = '??'  # stored when no read supports any allele

for sampleName in sorted(sampleList):
  print("SNP's for sample {}".format(sampleName))
  genotypes[sampleName] = {}
  vcfFile               = os.path.join(mappedDataDir, '{}_direct_mapping.vcf'.format(sampleName))

  for snpName, snpData in sorted(genotype(vcfFile, snpPosition).items()):
    alleleDepths = snpData['alleles']
    totalReads   = sum(alleleDepths.values())

    # Alleles ranked by read depth, highest first. sorted() is stable, so alleles
    # with equal depth keep the order in which they appear in the dict.
    ranked   = sorted(alleleDepths, key=alleleDepths.get, reverse=True)
    topDepth = alleleDepths[ranked[0]]

    # Likely genotype
    if totalReads == 0:
      # No supporting reads: report a no-call instead of inventing a homozygous genotype
      likely = noCallGenotype
    elif len(ranked) > 1 and alleleDepths[ranked[1]] > topDepth * minSecondAlleleFraction:
      # Second allele is strong enough to count. Third and fourth alleles are
      # ignored unless they tie with the main allele.
      tiedForTop = [a for a in ranked if alleleDepths[a] == topDepth]
      likely     = ''.join(tiedForTop) if len(tiedForTop) > 1 else ranked[0] + ranked[1]
    else:
      # Second allele too weak: homozygous for the main allele
      likely = ranked[0] * 2

    likely = ''.join(sorted(likely))
    genotypes[sampleName][snpName] = likely

    # One report line: name, raw depths, percentages, reads used/total depth, genotype
    columns  = ['  {:<14}  '.format(snpName)]
    columns += ['  {}:{:<5}'.format(a, alleleDepths[a]) for a in sorted(alleleDepths)]
    for a in sorted(alleleDepths):
      percent = round(100*alleleDepths[a]/totalReads) if totalReads > 0 else '???'
      columns.append('  {}:{:>3}%'.format(a, percent))
    columns.append('    {:>5}/{:<5} reads'.format(totalReads, snpData['depth']))
    columns.append('    {}'.format(likely))
    print(''.join(columns))

  print('')
  

SNP's for sample 2800
  rs1008686         A:0      C:0      G:0      T:1928   A:  0%  C:  0%  G:  0%  T:100%     1928/3876  reads    TT
  rs1112534         A:0      C:1347   G:0      T:0      A:  0%  C:100%  G:  0%  T:  0%     1347/2700  reads    CC
  rs17287498        A:253    C:1      G:318    T:0      A: 44%  C:  0%  G: 56%  T:  0%      572/1144  reads    AG
  rs2032582         A:1879   C:0      G:8      T:1844   A: 50%  C:  0%  G:  0%  T: 49%     3731/7471  reads    AT
  rs2069945         A:0      C:0      G:423    T:0      A:  0%  C:  0%  G:100%  T:  0%      423/848   reads    GG
  rs2307223         A:2240   C:0      G:0      T:0      A:100%  C:  0%  G:  0%  T:  0%     2240/4492  reads    AA
  rs2853525         A:0      C:415    G:0      T:367    A:  0%  C: 53%  G:  0%  T: 47%      782/1564  reads    CT
  rs3091244         A:0      C:148    G:0      T:0      A:  0%  C:100%  G:  0%  T:  0%      148/298   reads    CC
  rs34741930        A:0      C:2      G:0      T:0      A:  0%  C: