## Analysis of 1D2 reads

### 1) Recover 1D2 reads with a Phred Quality >= 10
    Retrieve all 1D2 reads with q >= 10 (the 1D2 reads with q < 11 are normally split into the fail group)
    No new basecalling, we just filter the better reads from the fail set using NanoFilter
### 2) Split reads per sample using barcodes to identify each sample
    Using a fuzzy regex allowing up to 3 errors (substitutions, insertions or deletions) per barcode
### 3) Mapping of the reads against the reference
    Direct mapping using the ont2d settings against the references SNP regions (51 nt)
### 4) Determine the SNP genotype on position 26 in the mappings 


### 1. Recover q10 1D2 reads and combine with original 1D2 reads

In [None]:
# Init
#
import os, glob
import regex
from multiprocessing import Pool

# Input locations: the basecalled run data and the project working directory
rawDataDir = '/media/genomics/nanopore/run_data/20171219_nanopore_tri-allelic-1D2_basecalled_albacore-2.1.3'
projectDir = '/media/genomics/nanopore/projects/tri-allelic_SNPs/20171220_nanopore_1d2_analysis/Github'

# Fastq directories of the 1D2 basecalling workspace (pass, fail and the q10 filtered reads)
workspace1dsqDir = os.path.join(rawDataDir, '1dsq_analysis', 'workspace')
pass1dsqFastqDir = os.path.join(workspace1dsqDir, 'pass')
fail1dsqFastqDir = os.path.join(workspace1dsqDir, 'fail')
q10R1dsqFastqDir = os.path.join(workspace1dsqDir, 'filter_q10')

# Project files and directories
resultDir    = os.path.join(projectDir, 'one_d_squared_read_analysis_q10_or_better')
rawDataQCDir = os.path.join(projectDir, 'raw_data_qc_albacore-2.1.3')
barcodeFile  = os.path.join(projectDir, 'barcodes_tri-allelic.csv')

# External tools
nanoplot = 'NanoPlot' # v1.8.1 (in p36 venv YG)

# Make sure the result directory is available, report what was done
if os.path.exists(resultDir):
  print('Directory {} exists'.format(resultDir))
else:
  print('Creating directory {}'.format(resultDir))
  os.makedirs(resultDir)

In [23]:
# Count reads
#

# Original 1d2 (q >= 11)
r = ! wc -l {os.path.join(pass1dsqFastqDir, '*.fastq')}
count = int(r[0].split(' ')[0]) // 4
print('Original 1D2 reads having q >= 11:  {}'.format(count))

# Recovered 1d2 (10 <= q < 11)
r = ! wc -l {os.path.join(q10R1dsqFastqDir, '*_filter_failed_1dsq_q10.fastq')}
count = int(r[0].split(' ')[0]) // 4
print('Recovered 1D2 reads having q >= 10: {}'.format(count))

Original 1D2 reads having q >= 11:  4407
Recovered 1D2 reads having q >= 10: 8405


In [24]:
# Combine with original 1D2 reads
#
combined1dsqFastq = os.path.join(q10R1dsqFastqDir, 'combined_1dsq_q10_or_better.fastq')

! cat {os.path.join(pass1dsqFastqDir, '*.fastq')} {os.path.join(q10R1dsqFastqDir, '*_filter_failed_1dsq_q10.fastq')} > {combined1dsqFastq}

r = ! wc -l {combined1dsqFastq}
count = int(r[0].split(' ')[0]) // 4
print('Combined 1D2 reads having q >= 10: {}'.format(count))

Combined 1D2 reads having q >= 10: 12812


In [28]:
# Generate quality score plots
#

# Rescued reads
! {nanoplot} -t 10 --maxlength 3000 --fastq_rich {os.path.join(q10R1dsqFastqDir, '*_filter_failed_1dsq_q10.fastq')} -o {rawDataQCDir} -p 'recovered_q10_one_d_sq_'

# Combined reads
! {nanoplot} -t 10 --maxlength 3000 --fastq_rich {combined1dsqFastq} -o {rawDataQCDir} -p 'q10_or_better_one_d_sq_'

print('Done')


Done


### 2) Split reads per sample using barcodes to identify each sample

In [31]:
# Utility functions
#
def reverseComplement(seq):
  """
  Return the reverse complement of the given nucleotide sequence.

  IUPAC ambiguity codes are complemented as well and the case of every base is preserved.
  Characters without a known complement are copied unchanged.
  """
  bases       = 'agctyrwskmdvhbAGCTYRWSKMDVHB'
  complements = 'tcgarywsmkhbdvTCGARYWSMKHBDV'
  lookup      = dict(zip(bases, complements))

  # Walk the sequence back to front, complementing base by base
  return ''.join(lookup.get(base, base) for base in reversed(seq))



def loadReads(fileName):
  """
  Return a dict with the sequencing reads (including quality scores) extracted from the given file.

  The file is read as consecutive 4-line fastq records (header, sequence, separator, quality).
  The result maps the complete header line (trailing whitespace removed) to a dict with the
  sequence under key 's' and the quality string under key 'q'. Reads sharing the same header
  line overwrite each other.

  A warning is printed for records having an empty sequence or quality line, and for an
  incomplete record at the end of the file (such a record is not returned).
  """
  rData  = {}
  record = []   # Lines of the fastq record that is being assembled

  with open(fileName, 'rt') as f:
    for line in f:
      record.append(line.rstrip())
      if len(record) < 4:
        continue

      # Record complete; the separator line (3rd) carries no data
      readName, readSeq, _, readQual = record
      record = []

      rData[readName] = {'s': readSeq, 'q': readQual}
      if len(readSeq) == 0 or len(readQual) == 0:
        print('*** Partial read data: {}'.format(readName))

  # A complete fastq file holds a multiple of 4 lines. Left over lines mean the last record
  # was truncated; trailing blank lines are not reported.
  if any(record):
    print('*** Incomplete record at end of {}: {}'.format(fileName, record[0]))

  return rData



def findBarcodes(readName):
  """
  Count, per barcode, the matches of its forward and reverse pattern in one single read.

  Relies on the notebook globals barcodeList (iterated for the barcode names), barcodeRE
  (compiled patterns per barcode under keys 'f' and 'r') and readData (reads keyed on read
  name, sequence under key 's').

  Returns {readName: {barcodeName: hitCount}} holding only the barcodes that were found,
  or an empty dict when the read has no barcode hit at all.
  """
  hitCounts = {}

  for barcodeName in barcodeList:
    # Hits of both orientations add up to one count per barcode
    nHits = 0
    for orientation in ('f', 'r'):
      nHits += sum(1 for _ in barcodeRE[barcodeName][orientation].finditer(readData[readName]['s']))

    if nHits > 0:
      hitCounts[barcodeName] = nHits

  if hitCounts:
    return {readName: hitCounts}
  return {}


In [33]:
# Load barcode data, compile regex
#
# Get the barcodes
#
# barcodeList: barcode name -> forward ('f') and reverse complement ('r') sequence
# barcodeRE:   barcode name -> compiled fuzzy patterns for both orientations
barcodeList = {}
barcodeRE   = {}
maxMisMatch = 3   # Maximum number of errors allowed by the fuzzy patterns

with open(barcodeFile, 'rt') as f:
  for line in f:
    line = line.strip()
    
    # Ignore blank lines and the column header line (should start with a '#')
    if not line or line.startswith('#'):
      continue
      
    # Store. Whitespace around the fields is dropped so it can not end up in the patterns;
    # a line that does not hold exactly 2 fields still raises a ValueError.
    name, seq         = [field.strip() for field in line.split(',')]
    barcodeList[name] = {'f': seq, 'r': reverseComplement(seq)}
    
    barcodeRE[name] = {
      'f': regex.compile('(?e)({}){{e<={}}}'.format(barcodeList[name]['f'], maxMisMatch)),
      'r': regex.compile('(?e)({}){{e<={}}}'.format(barcodeList[name]['r'], maxMisMatch))
    }
    
print('Found {} barcodes:'.format(len(barcodeList)))
for n in sorted(barcodeList):
  print(n, barcodeList[n])

Found 5 barcodes:
NB07 {'f': 'GTGTTACCGTGGGAATGAATCCTT', 'r': 'AAGGATTCATTCCCACGGTAACAC'}
NB08 {'f': 'TTCAGGGAACAAACCAAGTTACGT', 'r': 'ACGTAACTTGGTTTGTTCCCTGAA'}
NB09 {'f': 'AACTAGGCACAGCGAGTCTTGGTT', 'r': 'AACCAAGACTCGCTGTGCCTAGTT'}
NB10 {'f': 'AAGCGTTGAAACCTTTGTCCTCTC', 'r': 'GAGAGGACAAAGGTTTCAACGCTT'}
NB12 {'f': 'CAGGTAGAAAGAAGCAGAATCGGA', 'r': 'TCCGATTCTGCTTCTTTCTACCTG'}


In [34]:
# Read the combined 1D2 fastq file (q10 or better) into memory
#
combinedFastqName      = 'combined_1dsq_q10_or_better.fastq'
fastqFile              = os.path.join(q10R1dsqFastqDir, combinedFastqName)
readDataOneDSquaredQ10 = loadReads(fastqFile)
loadedReadCount        = len(readDataOneDSquaredQ10)

print('Loaded {} 1D2 reads'.format(loadedReadCount))

Loaded 12812 1D2 reads


In [36]:
# Identify barcodes in 1D2 reads
#
barcodeHitData1D2 = {}
readData          = readDataOneDSquaredQ10   # Global that findBarcodes() reads the sequences from
maxThread         = 20                       # Number of worker processes

# The context manager terminates the worker processes when the block is left, also when
# findBarcodes() raises. Without it a failing map() leaves the workers behind in the kernel.
with Pool(maxThread) as pool:
  bcHitList = pool.map(findBarcodes, readDataOneDSquaredQ10.keys())  # list of {readname: {barcodename: hitcount}}

for d in bcHitList:
  barcodeHitData1D2.update(d)

# Print stats
# One dict holds both kinds of counters: 'reads having N barcodes' and one counter per barcode
# name. Reads without any barcode hit are not in barcodeHitData1D2 and are therefore not counted.
barcodeStats1D2 = {}

for readName in barcodeHitData1D2:
  n = 'reads having {} barcodes'.format(len(barcodeHitData1D2[readName]))
  barcodeStats1D2[n] = barcodeStats1D2.get(n, 0) + 1

  for barcodeName in barcodeHitData1D2[readName]:
    barcodeStats1D2[barcodeName] = barcodeStats1D2.get(barcodeName, 0) + 1

# Percentages are relative to all loaded reads
for k in sorted(barcodeStats1D2.keys()):
  print('{} = {} ({:>.1f}%)'.format(k, barcodeStats1D2[k], 100*barcodeStats1D2[k]/len(readDataOneDSquaredQ10)))

print()

NB07 = 879 (6.9%)
NB08 = 2382 (18.6%)
NB09 = 2061 (16.1%)
NB10 = 4004 (31.3%)
NB12 = 930 (7.3%)
reads having 1 barcodes = 8862 (69.2%)
reads having 2 barcodes = 664 (5.2%)
reads having 3 barcodes = 22 (0.2%)



In [38]:
# Save 1D2 reads in fastq files per barcode (sample).  We keep only reads that have 1 type of barcode.
#
outFiles = {}   # barcode name -> output file, opened when the first read of that barcode is seen

try:
  for readName in barcodeHitData1D2:
    if len(barcodeHitData1D2[readName]) == 1:
      for barcodeName in barcodeHitData1D2[readName]:
        if barcodeName not in outFiles:
          fastqFileName = os.path.join(resultDir, '{}.fastq'.format(barcodeName))
          outFiles[barcodeName] = open(fastqFileName, 'wt')
          print('Created {}'.format(fastqFileName))

        # Write fastq read (the read name is the complete fastq header line)
        read = readDataOneDSquaredQ10[readName]
        outFiles[barcodeName].write('{}\n{}\n+\n{}\n'.format(readName, read['s'], read['q']))
finally:
  # Close files (required, otherwise buffers are not always flushed to disk!)
  # Done in a finally block so the files are closed as well when writing fails half way.
  for barcodeName in outFiles:
    outFiles[barcodeName].close()

print('Done')

Created /media/genomics/nanopore/projects/tri-allelic_SNPs/20171220_nanopore_1d2_analysis/one_d_squared_read_analysis_q10_or_better/NB09.fastq
Created /media/genomics/nanopore/projects/tri-allelic_SNPs/20171220_nanopore_1d2_analysis/one_d_squared_read_analysis_q10_or_better/NB10.fastq
Created /media/genomics/nanopore/projects/tri-allelic_SNPs/20171220_nanopore_1d2_analysis/one_d_squared_read_analysis_q10_or_better/NB07.fastq
Created /media/genomics/nanopore/projects/tri-allelic_SNPs/20171220_nanopore_1d2_analysis/one_d_squared_read_analysis_q10_or_better/NB08.fastq
Created /media/genomics/nanopore/projects/tri-allelic_SNPs/20171220_nanopore_1d2_analysis/one_d_squared_read_analysis_q10_or_better/NB12.fastq
Done


In [39]:
# Reload the per barcode (sample) fastq files and report the number of 1D2 reads in each of them
#
# NOTE: barcodeList is rebound here, from the dict filled while loading the barcode file to a
# plain list of barcode names.
barcodeList  = ['NB07', 'NB08', 'NB09', 'NB10', 'NB12']
totalReads   = len(readDataOneDSquaredQ10)
barcodeReads = {}

print('Total reads: {}'.format(totalReads))

for barcodeName in barcodeList:
  sampleReads               = loadReads(os.path.join(resultDir, barcodeName + '.fastq'))
  barcodeReads[barcodeName] = sampleReads

  # Share of all loaded 1D2 reads that was assigned to this barcode
  percentage = 100*len(sampleReads)/totalReads
  print('{}: {} ({} %) reads'.format(barcodeName, len(sampleReads), percentage))

Total reads: 12812
NB07: 823 (6.423665313768343 %) reads
NB08: 1742 (13.59662816109897 %) reads
NB09: 1943 (15.165469871995004 %) reads
NB10: 3846 (30.018732438339057 %) reads
NB12: 508 (3.965032781767093 %) reads


### 3) Mapping of the reads against the reference

In [40]:
# Reference sequences of the SNP regions
snpRefFile = os.path.join(projectDir, 'triallelic_snp_regions.fasta')

# Barcode name -> sample name
sampleMap = dict(
  NB07 = '9948',
  NB08 = '9947',
  NB09 = '2800',
  NB10 = 'G59',
  NB12 = 'G62',   # G56 in e-mail Senne
)

# External tools (version is part of the executable name)
toolDir  = '/opt/tools'
bwa      = toolDir + '/bwa-0.7.15'
samtools = toolDir + '/samtools-1.3.1'
bcftools = toolDir + '/bcftools-1.3.1'

In [41]:
# Map amplicons to reference SNP region sequences (51 nt), call variants
#
# Runs once per key of sampleMap. Those keys are the barcode names, so 'sampleName' holds a
# barcode name here and selects the fastq file of that barcode in resultDir.
# NOTE(review): bwa mem needs an index of snpRefFile; it is not created in this cell and is
# presumably built beforehand (bwa index) - confirm.
for sampleName in sorted(sampleMap):
  # Input fastq file and the derived output files (fasta, bam and vcf) of this barcode
  fastqFile = os.path.join(resultDir, '{}.fastq'.format(sampleName))
  fastaFile = os.path.join(resultDir, '{}_noqual.fasta'.format(sampleName))
  bamFile   = fastaFile.replace('.fasta', '_direct_mapping.bam')
  vcfFile   = fastaFile.replace('.fasta', '_direct_mapping.vcf')
  
  # Convert fastq to fasta: paste joins every 4 lines into one tab separated line, cut keeps
  # the header and sequence columns, sed replaces the leading '@' of the header by '>' and
  # tr puts header and sequence back on separate lines. The quality scores are dropped.
  ! paste - - - - < {fastqFile} | cut -f 1,2 | sed 's/^@/>/' | tr "\t" "\n" > {fastaFile}
  
  # Map with the bwa mem ont2d preset, convert the output to bam, sort it and index the result
  ! {bwa} mem -t 10 -x ont2d {snpRefFile} {fastaFile} | {samtools} view -Sb - | {samtools} sort -o {bamFile} -
  ! {samtools} index {bamFile}
  
  # Variant calling: the mpileup output is piped into bcftools call and saved as vcf.
  # '-t AD' requests the per allele depths that genotype() reads from the AD field,
  # '-V indels' skips indels and '-m' selects the multiallelic caller (see the tool manuals).
  !{samtools} mpileup -d 100000 -Buf {snpRefFile} -t AD {bamFile} | {bcftools} call -V indels -m - > {vcfFile}

print('Done')

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 823 sequences (651745 bp)...
[M::mem_process_seqs] Processed 823 reads in 0.384 CPU sec, 0.045 real sec
[main] Version: 0.7.15-r1140
[main] CMD: /opt/tools/bwa-0.7.15 mem -t 10 -x ont2d /media/genomics/nanopore/projects/tri-allelic_SNPs/20171220_nanopore_1d2_analysis/triallelic_snp_regions.fasta /media/genomics/nanopore/projects/tri-allelic_SNPs/20171220_nanopore_1d2_analysis/one_d_squared_read_analysis_q10_or_better/NB07_noqual.fasta
[main] Real time: 0.884 sec; CPU: 0.389 sec
[mpileup] 1 samples in 1 input files
Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid
[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1742 sequences (1100512 bp)...
[M::mem_process_seqs] Processed 1742 reads in 1.738 CPU sec, 1.128 real sec
[main] Version: 0.7.15-r1140
[main] CMD: /opt/tools/bwa-0.7.15 mem -t 10 -x ont2d /media/genomics/nanopore/projects/tri-allelic_SNPs/20171220_nanopore_1d2_analysis/

### 4) Determine the SNP genotype on position 26 in the mappings 

In [42]:
def genotype(vcfFile, snpPos):
  """
  Extract genotype data at given position from a given vcf file.

  vcfFile: path of a vcf file with exactly 10 columns (one sample). The FORMAT column must
           provide the GT and AD fields, the INFO column must provide DP.
  snpPos:  position (int) to report, compared with the POS column of every record.

  Returns a dict keyed on the first vcf column (the reference sequence name). Every value holds:
    'pos':      the position as found in the file (string)
    'alleles':  read depth per allele (AD). A, C, G and T are always present (0 when not
                listed in the record); other alleles are added under their full allele string.
    'genotype': the called alleles joined by '/', '.' for a missing allele
    'depth':    the DP value of the INFO column (int)

  Raises a ValueError for records that do not have 10 columns and a KeyError when GT, AD or DP
  is missing.
  """
  genotypeData = {}
  
  with open (vcfFile, 'rt') as inFile:
    for line in inFile:
      # Skip header lines and blank lines
      if line.startswith('#') or not line.strip(): continue
      
      snpName, pos, _id, ref, alt, qual, filt, info, form, formValues = line.rstrip().split()
      
      if int(pos) != snpPos:
        continue

      # FORMAT keys and the matching sample values
      formData = dict(zip(form.split(':'), formValues.split(':')))

      # INFO entries are 'key=value' pairs or flags without a value (stored as True)
      infoData = {}
      for entry in info.split(';'):
        key, sep, value = entry.partition('=')
        infoData[key] = value if sep else True

      # Allele list in vcf order: reference first, then the alternative alleles. A list
      # (instead of one concatenated string) keeps alleles longer than one character intact.
      alleles = [ref]
      if alt != '.':
        alleles += alt.split(',')

      depths = [int(d) for d in formData['AD'].split(',')]

      # GT holds indexes into the allele list, separated by '/' (or '|' when phased)
      gt = [alleles[int(i)] if i != '.' else '.' for i in formData['GT'].replace('|', '/').split('/')]

      alleleDepths = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
      alleleDepths.update(zip(alleles, depths))

      genotypeData[snpName] = {'pos': pos, 'alleles': alleleDepths, 'genotype': '/'.join(gt), 'depth': int(infoData['DP'])}
        
  return genotypeData


In [43]:
# Get all sample genotypes
#
# For every sample the allele read depths at position 26 of each SNP region are taken from the
# vcf file, printed (counts, percentages, reads) and turned into a likely genotype that is
# stored in genotypes[barcode name][SNP name].
excludeSnp = [] #['rs9274701'] # Amplicon is found, but content is not as expected
genotypes  = {}

for sampleName in sorted(sampleMap, key=lambda k: sampleMap[k]):
  print("SNP's for sample {} ({})".format(sampleName, sampleMap[sampleName]))
  genotypes[sampleName] = {}
  vcfFile               = os.path.join(resultDir, '{}_noqual_direct_mapping.vcf'.format(sampleName))
  
  d = genotype(vcfFile, 26)
  
  for snpName in sorted(d):
    alleleCounts = d[snpName]['alleles']
    t            = sum(alleleCounts.values())
    
    # Likely genotype
    # gg: read count -> alleles having that count, vv: read counts, highest first
    gg = {}
    vv = sorted([int(c) for c in alleleCounts.values()], reverse=True)
    
    for g, c in alleleCounts.items():
      gg.setdefault(c, []).append(g)

    # Second allele must reach more than 25% of the read count of the main allele or it is
    # ignored. Third and fourth alleles are ignored, unless they tie with the main allele.
    if t == 0:
      # No reads support any allele: do not report a genotype (this used to end up as 'AA')
      aa = '??'
    elif len(vv) > 1 and vv[1] > vv[0]/4:
      if len(gg[vv[0]]) > 1:
        # Tie for the highest count: report all tied alleles
        aa = ''.join(gg[vv[0]])
      else:
        aa = gg[vv[0]][0] + gg[vv[1]][0]
    else:
      # Homozygous for the main allele
      aa = gg[vv[0]][0] + gg[vv[0]][0]
    
    # Output line: SNP name, count per allele, percentage per allele, reads and genotype
    o = '  {:<14}  '.format(snpName)
    
    for g in sorted(alleleCounts):
      o += '  {}:{:<5}'.format(g, alleleCounts[g])
    for g in sorted(alleleCounts):
      if t > 0:
        o += '  {}:{:>3}%'.format(g, round(100*alleleCounts[g]/t))
      else:
        o += '  {}:{:>3}%'.format(g, '???')
    o += '    {:>5}/{:<5} reads'.format(t, d[snpName]['depth'])
    o += '    {}'.format(''.join(sorted(aa)))
    genotypes[sampleName][snpName] = ''.join(sorted(aa))
    print(o)
  
  print('')
  

SNP's for sample NB09 (2800)
  rs1008686         A:0      C:0      G:0      T:293    A:  0%  C:  0%  G:  0%  T:100%      293/297   reads    TT
  rs1112534         A:0      C:152    G:1      T:0      A:  0%  C: 99%  G:  1%  T:  0%      153/154   reads    CC
  rs17287498        A:139    C:3      G:183    T:0      A: 43%  C:  1%  G: 56%  T:  0%      325/326   reads    AG
  rs2032582         A:147    C:0      G:15     T:165    A: 45%  C:  0%  G:  5%  T: 50%      327/327   reads    AT
  rs2069945         A:0      C:12     G:93     T:0      A:  0%  C: 11%  G: 89%  T:  0%      105/106   reads    GG
  rs2307223         A:244    C:0      G:0      T:0      A:100%  C:  0%  G:  0%  T:  0%      244/249   reads    AA
  rs2853525         A:0      C:152    G:0      T:159    A:  0%  C: 49%  G:  0%  T: 51%      311/312   reads    CT
  rs3091244         A:0      C:205    G:3      T:0      A:  0%  C: 99%  G:  1%  T:  0%      208/212   reads    CC
  rs34741930        A:4      C:217    G:0      T:0      A: 

In [44]:
# Genotype overview: one row per SNP, one column per sample (columns ordered on sample name)
#
sortedSampleNames = sorted(sampleMap, key=lambda k: sampleMap[k])

# All SNP names that have a genotype in at least one sample
snpNames = set()
for sampleName in sortedSampleNames:
  snpNames.update(genotypes[sampleName])

# Header line
h = 'SNP           ' + ''.join(' {:>10}'.format(sampleMap[s]) for s in sortedSampleNames)
print(h)

# One line per SNP, the cell stays empty when a sample has no genotype for the SNP
for snpName in sorted(snpNames):
  cells = [''.join(sorted(genotypes[s].get(snpName, ''))) for s in sortedSampleNames]
  r     = '{:<14}'.format(snpName) + ''.join(' {:>10}'.format(cell) for cell in cells)
  print(r)


SNP                  2800       9947       9948        G59        G62
rs1008686              TT         TT         TT         AT         AA
rs1112534              CC         CT         CC         CC         TT
rs17287498             AG         GG         GG         GT         GG
rs2032582              AT         GT         GG         AT         GG
rs2069945              GG         CC         GG         AC         CG
rs2307223              AA         AT         AT         AG         AA
rs2853525              CT         CT         CC         TT         TT
rs3091244              CC         CT         AT         CC         CC
rs34741930             CC         CC         CC         CC         CC
rs35528968             AA         AA         AA         AA         AA
rs356167               AG         CG         AG         CG         CG
rs433342               GG         CG         GG         AG         GG
rs5030240              GT         CG         GG         GT         CG
rs727241            