### Context:

52 SNP locations were PCR amplified and the amplicons were ligated before nanopore sequencing. Here only the SNP regions (51 nt), not the larger amplicons, are identified using fuzzy regex matching. The primer sequences are therefore ignored: we only look up each SNP together with its flanking regions.


In [1]:
# Init
#
# Locations of the input data, the result directory and the external tools
# that the shell cells further down call.

resultDir     = '/media/sf_vm_shared/nanopore/results'
readFile      = resultDir + '/ligated_snp_amplicon_reads.fastq'   # reads live in the result directory
snpRegionFile = '/media/sf_vm_shared/nanopore/snp_data/SNP_sequence_orig.fasta'

bwa           = '/usr/bin/bwa'              # v0.7.12, on holmes we have v0.7.5 /opt/tools/bwa
samtools      = '/usr/local/bin/samtools'   # v1.3.1,  on holmes we have v1.2   /opt/samtools-1.2
bcftools      = '/usr/local/bin/bcftools'   # v1.3.1,  on holmes we have v1.2   /opt/bcftools-1.2

In [2]:
# Utility functions
#

def reverseComplement(seq):
  """Return the reverse complement of a nucleotide sequence.

  The four bases and the IUPAC ambiguity codes Y, R, W, S, K, M, D, V, H, B
  are complemented in both lower and upper case (case is preserved).
  Any other character (for example N) is copied unchanged.
  """
  bases       = 'agctyrwskmdvhb'
  complements = 'tcgarywsmkhbdv'
  complementOf = str.maketrans(bases + bases.upper(), complements + complements.upper())

  # Reversing first and complementing afterwards gives the same result as the other order
  backwards = seq[::-1]
  return backwards.translate(complementOf)


In [3]:
# Get SNP definitions
#
# Reads the SNP region FASTA file. Every header line ('>name') is followed by a
# sequence line in which the SNP alleles are given between square brackets:
#   AGCTAGCT[A/G]AGCTAGCT
# For each SNP the left flank, the right flank and the list of alleles are stored
# in snpDefs. A reference FASTA holding, per SNP, left flank + first allele +
# right flank is then written to snpRefFile.

snpDefs    = {}
snpRefFile = resultDir + '/snp_region_references.fasta'

with open(snpRegionFile) as fasta:
  snpName = None

  for line in fasta:
    line = line.strip()

    if not line:
      # Blank lines (e.g. trailing empty lines) carry no data; without this check
      # they would end up in the sequence branch and fail on the '[' lookup
      continue

    if line.startswith('>'):
      snpName = line[1:].strip()
      continue

    if snpName is None:
      # Do not silently attach the sequence to a name left over from an earlier run
      raise ValueError('Sequence line found before any header line in ' + snpRegionFile)

    # AGCTAGCT[A/G]AGCTAGCT
    i = line.index('[')
    j = line.index(']')

    snpDefs[snpName] = {
      'left':  line[:i].strip(),
      'right': line[j+1:].strip(),
      'variants': line[i+1:j].split('/')
    }

print('Got data for {} SNPs'.format(len(snpDefs)))

with open(snpRefFile, 'w') as fasta:
  for snpDef in sorted(snpDefs):
    # The first listed allele is used as the reference base
    fasta.write('>' + snpDef + '\n')
    fasta.write(snpDefs[snpDef]['left'] + snpDefs[snpDef]['variants'][0] + snpDefs[snpDef]['right'] + '\n')

    # DEBUG
    print(snpDef, snpDefs[snpDef])
    

Got data for 52 SNPs
rs1005533 {'right': 'AGTCGCCGCTGTTCAGGGGAGGCAT', 'left': 'AGCAAAAAGCAAGAGCCGTGGAATT', 'variants': ['A', 'G']}
rs1015250 {'right': 'CTGTCCCAGTGATTTTTCATAAGCA', 'left': 'AGGAAAAGAACCCAGGTGTTTTATT', 'variants': ['C', 'G']}
rs1024116 {'right': 'CACTTAATAAAGTATGCCTTTGTAT', 'left': 'TGTTCTAATAAAAAGGATTGCTCAT', 'variants': ['A', 'G']}
rs1028528 {'right': 'CTGTGTGCAGATCCGCGGAGGTCTG', 'left': 'AAAAGAAAGGTCCTTACTCGACATC', 'variants': ['A', 'G']}
rs1029047 {'right': 'AAAAAAAAACCTCATATCTTTTTTC', 'left': 'AAAGTAAGAATTCAAGATGGTATTT', 'variants': ['A', 'T']}
rs1031825 {'right': 'CCCCGAGCATACTTGAAAGCAGTGA', 'left': 'TTTAATGAGTATTTTATTTATCTAA', 'variants': ['A', 'C']}
rs10495407 {'right': 'TCCTATTCCATTCTGTTGTTTTTCC', 'left': 'TGGTTGCATTGGATTCTCATTGAAA', 'variants': ['A', 'G']}
rs1335873 {'right': 'ATACTGAGTACATAGCTAGGTACCT', 'left': 'TGCAGGTATGTATTGTTGGCCGTGG', 'variants': ['A', 'T']}
rs1355366 {'right': 'GGACTTGCCAAAGCCAGTTGTGGCC', 'left': 'AGAGCCACTGGAGGCCTCGAGGATG', 'variants': 

In [4]:
# Load sequencing data (ligated amplicons)
#
# The file is read as records of four lines each. Of every record only the
# first line (used as key, stripped but otherwise unchanged) and the second
# line (the sequence) are kept; the third and fourth lines are ignored.

readData = {}

with open(readFile) as fastq:
  header = None

  for lineNo, rawLine in enumerate(fastq):
    lineInRecord = lineNo % 4

    if lineInRecord == 0:
      header = rawLine.strip()
    elif lineInRecord == 1:
      # A header that occurs more than once overwrites the earlier sequence
      readData[header] = rawLine.strip()

print('Loaded {} sequences'.format(len(readData)))


Loaded 14324 sequences


In [5]:
# Lookup SNP regions
#
# For every SNP a fuzzy pattern "left flank + [alleles] + right flank" is built and
# searched in every read and in the reverse complement of every read.
# '{e<=N}' lets the regex module accept up to N errors within the group and '(?e)'
# asks it to improve the fit of each match it finds (see the regex module docs).
# Every hit is stored in matchData[snp] as
#   (read name, 'sense' | 'anti-sense', start, end, matched sequence)
# where start/end refer to the strand that was scanned.

import regex

maxMisMatch  = 3
matchData    = {}
notFoundList = []

# The reverse complement of a read does not depend on the SNP: compute it once per
# read here instead of once for every (SNP, read) combination inside the loops.
rcReadData = {seq: reverseComplement(readData[seq]) for seq in readData}

for snpDef in snpDefs:
  # Alternative: put '.' in place of the allele class to accept any base at the SNP position
  pattern           = regex.compile('(?e)({}[{}]{}){{e<={}}}'.format(snpDefs[snpDef]['left'], ''.join(snpDefs[snpDef]['variants']), snpDefs[snpDef]['right'], maxMisMatch))
  matchData[snpDef] = []

  # Print locus name to indicate progress
  print(snpDef, snpDefs[snpDef]['variants'], pattern)

  for seq in readData:
    # Scan the sense strand
    for match in pattern.finditer(readData[seq]):
      b, e     = match.span()
      seqMatch = readData[seq][b:e]
      matchData[snpDef].append((seq, 'sense', b, e, seqMatch))

    # Scan the anti-sense strand
    rcSeq = rcReadData[seq]
    for match in pattern.finditer(rcSeq):
      b, e     = match.span()
      seqMatch = rcSeq[b:e]
      matchData[snpDef].append((seq, 'anti-sense', b, e, seqMatch))

  print('  Identified {} regions allowing up to {} errors'.format(len(matchData[snpDef]), maxMisMatch))

  if len(matchData[snpDef]) == 0:
    notFoundList.append('{} {} {}'.format(snpDef, snpDefs[snpDef]['variants'], pattern))

# Print not found
print()
print('Amplicons not found:')
for a in notFoundList:
  print(a)

print('Done')

rs2076848 ['A', 'T'] regex.Regex('(?e)(TGGCCTCACCACCAGAAATCAGGGC[AT]TGATGGACCTGAAGCGGTCCCGGGC){e<=3}', flags=regex.V0)
  Identified 81 regions allowing up to 3 errors
rs1029047 ['A', 'T'] regex.Regex('(?e)(AAAGTAAGAATTCAAGATGGTATTT[AT]AAAAAAAAACCTCATATCTTTTTTC){e<=3}', flags=regex.V0)
  Identified 0 regions allowing up to 3 errors
rs917118 ['C', 'T'] regex.Regex('(?e)(AAGATGGAGTCAACATTTTACAAGA[CT]GCTCGTTGACCTCAGTCATCTCTTA){e<=3}', flags=regex.V0)
  Identified 446 regions allowing up to 3 errors
rs1490413 ['A', 'G'] regex.Regex('(?e)(ACTGGGCTGATGTGGGTTCTTTGCA[AG]AACTGGCTGGCCTCAGAGCAGGGAC){e<=3}', flags=regex.V0)
  Identified 275 regions allowing up to 3 errors
rs1335873 ['A', 'T'] regex.Regex('(?e)(TGCAGGTATGTATTGTTGGCCGTGG[AT]ATACTGAGTACATAGCTAGGTACCT){e<=3}', flags=regex.V0)
  Identified 235 regions allowing up to 3 errors
rs719366 ['C', 'T'] regex.Regex('(?e)(CCTGCTTTTCCTCCTCCCATTCTAG[CT]AGCTACTCCTCTGGGGGCCTGTCCT){e<=3}', flags=regex.V0)
  Identified 165 regions allowing up to 3 erro

In [6]:
# Save identified SNP regions
#
# Writes every region found by the lookup cell to one FASTA file. The header of
# each entry is '>running number, SNP name, strand, read name' (space separated),
# followed by the matched sequence.

# Derived from resultDir (instead of repeating the directory literally) so that
# all result files stay together when resultDir is changed
snpFile = resultDir + '/potential_snp_regions_{}mism.fasta'.format(maxMisMatch)
cnt     = 0

with open(snpFile, 'w') as f:
  for snp in sorted(matchData):
    # Each entry is (seq, 'sense' | 'anti-sense', b, e, seqMatch)
    for seq, strand, b, e, seqMatch in matchData[snp]:
      cnt += 1
      f.write('>{} {} {} {}\n'.format(cnt, snp, strand, seq))
      f.write(seqMatch + '\n')


print('Saved {} regions for {} SNPs'.format(cnt, len(matchData)))

Saved 10202 regions for 52 SNPs


In [9]:
# Map SNP regions to reference sequences
#
# Lines starting with '!' are run as shell commands by IPython; '{name}' is
# replaced by the value of the Python variable of that name.

# Base name (no directory, no extension) shared by the SAM, BAM, VCF and CSV
# result files written by this and the following cells
resultFileBase = 'ttt_{}mism_snpregions'.format(maxMisMatch)

# Build index of the references
!{bwa} index {snpRefFile}

# Map reads (use BWA's nanopore specific option)
# -t 2 sets the number of threads; the alignments printed on stdout are
# redirected to the SAM file in the result directory
!{bwa} mem -t 2 -x ont2d {snpRefFile} {snpFile} > {resultDir}/{resultFileBase}.sam


[bwa_index] Pack FASTA... 0.00 sec
[bwa_index] Construct BWT for the packed sequence...
[bwa_index] 0.00 seconds elapse.
[bwa_index] Update BWT... 0.00 sec
[bwa_index] Pack forward-only FASTA... 0.00 sec
[bwa_index] Construct SA from BWT and Occ... 0.01 sec
[main] Version: 0.7.12-r1039
[main] CMD: /usr/bin/bwa index /media/sf_vm_shared/nanopore/results/snp_region_references.fasta
[main] Real time: 0.089 sec; CPU: 0.014 sec
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 10202 sequences (516294 bp)...
[M::mem_process_seqs] Processed 10202 reads in 0.318 CPU sec, 0.198 real sec
[main] Version: 0.7.12-r1039
[main] CMD: /usr/bin/bwa mem -t 2 -x ont2d /media/sf_vm_shared/nanopore/results/snp_region_references.fasta /media/sf_vm_shared/nanopore/results/potential_snp_regions_3mism.fasta
[main] Real time: 0.289 sec; CPU: 0.354 sec


In [10]:
# Make sorted bam and index
#
# The SAM file is converted to uncompressed BAM (-b: BAM output, -u: uncompressed,
# -S: legacy "input is SAM" flag) and piped straight into 'samtools sort', which
# reads it from stdin ('-') and writes the sorted BAM given with -o.
# The second command writes the .bai index next to the sorted BAM.

!{samtools} view -Sbu {resultDir}/{resultFileBase}.sam | {samtools} sort -o {resultDir}/{resultFileBase}_sorted.bam -
!{samtools} index {resultDir}/{resultFileBase}_sorted.bam {resultDir}/{resultFileBase}_sorted.bam.bai


In [11]:
# Display some mapping stats
#
# Prints the flag based summary (total, mapped, secondary, ... read counts)
# of the sorted BAM file

!{samtools} flagstat {resultDir}/{resultFileBase}_sorted.bam


10202 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
9997 + 0 mapped (97.99% : N/A)
0 + 0 paired in sequencing
0 + 0 read1
0 + 0 read2
0 + 0 properly paired (N/A : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (N/A : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)


In [12]:
# Generate vcf file from bam file. Needs the reference and its index file 
#
# Note: the commands below are for samtools and bcftools v1.3.1 (will not work on v0.1.19!)
#
# Both commands pipe the pileup of the sorted BAM against the SNP region
# references (-f) into 'bcftools call' and redirect the resulting VCF to a file.
# -d 100000 is the per-position depth limit given to mpileup.
# The file ending in '_sorted.bam.vcf' (all positions) is the one read by the
# "Get SNP profile" cell.

# Reporting all positions (skip indels)
!{samtools} mpileup -d 100000 -uf {snpRefFile} {resultDir}/{resultFileBase}_sorted.bam | {bcftools} call -V indels -m - > {resultDir}/{resultFileBase}_sorted.bam.vcf

# Reporting variants only (excludes SNPs homozygous for reference allele) (skip indels)
!{samtools} mpileup -d 100000 -uf {snpRefFile} {resultDir}/{resultFileBase}_sorted.bam | {bcftools} call -V indels -mv - > {resultDir}/{resultFileBase}_no_indels_sorted.bam.vcf



[mpileup] 1 samples in 1 input files
Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid
Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid
[mpileup] 1 samples in 1 input files


In [13]:
# Get SNP profile
#
# Reads the all-positions VCF, keeps for every reference the record at the SNP
# position and writes one CSV row per SNP with coverage, allele percentages and
# an estimated diploid genotype. The same information is printed for inspection.

# Minor/major allele depth ratio below which the minor allele is ignored
# when estimating the genotype
minorAlleleRatio = 0.1

snpData = {}

with open(resultDir+'/' + resultFileBase + '_sorted.bam.vcf') as f:
    for l in f:
        if l.startswith('#') or not l.strip():
            continue

        # Only the eight fixed VCF columns are needed; FORMAT and sample columns are ignored.
        # Local names avoid shadowing the builtins id() and filter() in the notebook namespace.
        snp, pos, _vcfId, ref, alt, qual, vcfFilter, info = l.split()[:8]

        # The reference of a SNP is left flank + allele + right flank, so the SNP of
        # interest sits right after the left flank (position 26 for 25 nt flanks)
        snpPos = len(snpDefs[snp]['left']) + 1
        if int(pos) != snpPos:
            continue

        # INFO is a ';' separated list of key=value pairs; entries without '=' are
        # flags and are stored with an empty value instead of raising IndexError
        par = {}
        for p in info.split(';'):
            key, _sep, value = p.partition('=')
            par[key] = value

        snpData[snp] = {'pos': pos, 'ref': ref, 'alt': alt, 'qual': qual, 'filter': vcfFilter, 'info': par}

# DEBUG
print('Got data for {} SNPs:'.format(len(snpData)))

# Save/print results
with open(resultDir +'/' + resultFileBase + '_profile.csv', 'w') as f:
    # Table header
    f.write('snp, coverage, ref_allele, ref_percent, alt_allele, alt_percent, other_percent, genotype, real, comment, seq\n')

    # Table data
    for s in sorted(snpData):
        record     = snpData[s]
        refAllele  = record['ref']
        altAllele  = record['alt']
        totalDepth = int(record['info']['DP'])
        # DP4 holds four counts: the first two are summed as reference depth,
        # the last two as alternative depth
        depthList  = [int(n) for n in record['info']['DP4'].split(',')]
        refDepth   = sum(depthList[0:2])
        altDepth   = sum(depthList[2:4])

        if totalDepth == 0:
            # No percentages can be computed without coverage
            print('  {} skipped: depth at the SNP position is 0'.format(s))
            continue

        # Percentages are relative to DP, the allele counts come from DP4, so the
        # two do not have to add up to 100 %; the remainder is reported as "other"
        refPercent = 100*refDepth/totalDepth
        altPercent = 100*altDepth/totalDepth

        # Estimate the diploid genotype: when the minor allele is more than 10 times weaker than the major allele,
        # we should ignore it for a pure sample?
        if refDepth > altDepth and altDepth/refDepth < minorAlleleRatio:
            genotype = refAllele + refAllele
        elif altDepth > refDepth and refDepth/altDepth < minorAlleleRatio:
            genotype = altAllele + altAllele
        else:
            genotype = refAllele + altAllele

        seq = snpDefs[s]['left'] + '[' + '/'.join(snpDefs[s]['variants']) + ']' + snpDefs[s]['right']

        # NOTE(review): if ALT ever lists several comma separated alleles, that value
        # adds a column to the CSV row and ends up verbatim in the genotype - confirm
        # whether this can occur for this data.
        if altAllele == '.':
            # Only 1 allele was observed
            f.write(','.join([s, str(totalDepth), refAllele, '{:.1f}'.format(refPercent), '', '', '{:.1f}'.format(100.0-refPercent), refAllele+refAllele, '', '', seq + '\n']))
            # DEBUG
            print('  {} ({})  {} ({:.1f} %) [{}]'.format(s, totalDepth, refAllele, refPercent, record['qual']))
        else:
            # Two alleles were observed
            f.write(','.join([s, str(totalDepth), refAllele, '{:.1f}'.format(refPercent), altAllele, '{:.1f}'.format(altPercent), '{:.1f}'.format(100.0-refPercent-altPercent), genotype, '', '', seq + '\n']))
            # DEBUG
            print('  {} ({})  {} ({:.1f} %)  {} ({:.1f} %) [{}]'.format(s, totalDepth, refAllele, refPercent, altAllele, altPercent, record['qual']))


Got data for 50 SNPs:
  rs1005533 (135)  A (58.5 %)  G (41.5 %) [222]
  rs1015250 (144)  C (0.0 %)  G (100.0 %) [228]
  rs1024116 (173)  A (28.3 %)  G (71.7 %) [222]
  rs1028528 (198)  A (0.0 %)  G (100.0 %) [228]
  rs1031825 (233)  A (24.5 %)  C (75.5 %) [221]
  rs10495407 (282)  A (52.8 %)  G (47.2 %) [222]
  rs1335873 (214)  A (1.4 %)  T (98.6 %) [228]
  rs1355366 (268)  A (2.2 %)  G (97.8 %) [228]
  rs1357617 (218)  A (99.5 %) [0]
  rs1360288 (426)  C (48.1 %)  T (51.9 %) [222]
  rs1382387 (270)  G (1.1 %)  T (98.9 %) [228]
  rs1413212 (209)  A (52.2 %)  G (47.8 %) [222]
  rs1454361 (91)  A (100.0 %) [0]
  rs1463729 (202)  A (93.6 %) [0]
  rs1490413 (272)  A (98.5 %) [0]
  rs1493232 (63)  A (60.3 %)  C (39.7 %) [222]
  rs1528460 (178)  C (1.1 %)  T (98.9 %) [228]
  rs1886510 (89)  C (2.2 %)  T (97.8 %) [228]
  rs1979255 (188)  C (98.4 %) [0]
  rs2016276 (102)  A (88.2 %)  G (11.8 %) [56]
  rs2040411 (159)  A (0.6 %)  G (99.4 %) [228]
  rs2046361 (227)  A (5.3 %)  T (94.7 %) [228]
 

### Notes
* Allowing 0 mismatches does not retrieve regions for 7 SNPs and yields a coverage lower than 10 for 14 of the retrieved ones. This results in too many unreliable calls.
* Allowing 1 mismatch does not retrieve regions for 2 SNPs. Regions have a coverage between 1 and 146. Coverage is still lower than 10 for 5 regions, resulting in unreliable calls.
* Allowing 2 mismatches does not retrieve regions for 2 SNPs. All retrieved regions have a coverage between 18 and 282. Using a minimum observation frequency threshold of 10%, 3 SNPs are miscalled (homozygotes seen as heterozygous). When using a threshold of 25%, only one SNP is miscalled.
* Allowing 3 mismatches does not retrieve regions for 2 SNPs. Regions have a coverage between 62 and 444.

#### Not detected with up to 3 allowed mismatches (not including the SNP itself)
rs1029047 ['A', 'T'] regex.Regex('(?e)(AAAGTAAGAATTCAAGATGGTATTT[AT]AAAAAAAAACCTCATATCTTTTTTC){e<=1}', flags=regex.V0)

rs891700 ['A', 'G'] regex.Regex('(?e)(TTCCATTCTTTTTTTTTTGAAGCCT[AG]CTTGCATAGTTCTAAGGAGTGTCAT){e<=1}', flags=regex.V0)

