## Analysis of Illumina reads

### 1) Merge overlapping reads
    The sequenced amplicons are shorter than the read length, so we will have a lot of overlapping pairs.
### 2) Identify and extract full amplicons per SNP for all samples
    Both primers have a perfect match in the correct orientation
    the resulting amplicon length is exactly as predicted

### 3) Determine the SNP genotype at position 26 in the mappings



### Notes:
* Some SNP flanking regions contain other SNPs, causing a miss when searching for a perfect match.
* We need to isolate full amplicons using the primers (fuzzy regex for Nanopore, perfect match for Illumina) and either:
 * call the SNP at its expected coordinate/distance from the PCR primer(s) (should work with Illumina)
 * Map against reference amplicons, generate vcf (all positions), check variant at expected pos (all alts)


### 1) Merge overlapping reads

In [None]:
# Init
#
import os, glob

# Directory holding the original paired-end FASTQ files (R1/R2 per sample)
origFastqDir = '/media/genomics/nanopore/run_data/20171018_Illumina_Tri-allelic/20171013_Senne-49554618'
# Directory that receives the merged (overlapping) reads
outputDir    = '/media/genomics/nanopore/run_data/20171018_Illumina_Tri-allelic/20171019_merged_overlapping_reads'

# Path of the PEAR executable used to merge the read pairs
pear         = '/opt/tools/PEAR'  # v0.9.10

# exist_ok=True replaces the separate existence check: no check-then-create race,
# and nothing happens when the directory is already there.
os.makedirs(outputDir, exist_ok=True)

In [None]:
# Show the contents of the original FASTQ directory (IPython shell escape;
# {origFastqDir} is filled in from the Python variable of that name)
! ls {origFastqDir}

In [None]:
# Merge the R1/R2 read pairs of every sample with PEAR.
#
# Sample names; each one is the leading part of that sample's FASTQ file names
sampleList = ['2800', '9947', '9948', 'G59', 'G62']

for sampleName in sampleList:
  # The '*' in these paths is not expanded by Python: the strings are handed
  # unchanged to the shell by the '!' escape below, which resolves the wildcard.
  # NOTE(review): presumably '*' covers the sample index/lane part of the file
  # name and matches exactly one file per sample - confirm against the directory listing.
  r1File    = os.path.join(origFastqDir, '{}_S*R1_001.fastq'.format(sampleName))
  r2File    = os.path.join(origFastqDir, '{}_S*R2_001.fastq'.format(sampleName))
  # Output prefix for PEAR; the amplicon extraction step later reads
  # '<prefix>.assembled.fastq' from the output directory
  outPrefix = os.path.join(outputDir, '{}_pear'.format(sampleName))
  
  # IPython shell escape: run PEAR with forward reads (-f), reverse reads (-r)
  # and the output prefix (-o)
  !{pear} -f {r1File} -r {r2File} -o {outPrefix}

### 2) Identify and extract full amplicons per SNP for all samples


In [None]:
# Init
#
# Paths and sample names used by the amplicon extraction and genotyping cells.
import os, glob

# Directory with the original (unmerged) FASTQ files
origFastqDir     = '/media/genomics/nanopore/run_data/20171018_Illumina_Tri-allelic/20171013_Senne-49554618'
# Directory with the PEAR-merged reads; the per-sample amplicon FASTA files are written here as well
outputDir        = '/media/genomics/nanopore/run_data/20171018_Illumina_Tri-allelic/20171019_merged_overlapping_reads'
# Reference amplicon sequences
# NOTE(review): not referenced by any of the cells visible in this notebook section
refAmpliconFasta = '/home/ygansemans/projects/triallelic_snp_amplicons.fasta'
# CSV with one row per SNP: name, forward primer, reverse primer, amplicon size, SNP position
pcrPrimerFile    = '/home/ygansemans/projects/triallelic_snp_pcr_primers.csv'

# Sample names; used to build the per-sample input and output file names
sampleList       = ['2800', '9947', '9948', 'G59', 'G62']

In [None]:
# Utility functions
#
def reverseComplement(seq):
  """
  Give back the reverse complement of the nucleotide sequence `seq`.

  Standard bases and the IUPAC ambiguity codes Y, R, W, S, K, M, D, V, H and B
  are complemented with their case preserved; every other character
  (e.g. N or a gap) is copied unchanged.
  """
  bases       = 'agctyrwskmdvhb'
  complements = 'tcgarywsmkhbdv'
  # One table for both cases: lower-case pairs followed by their upper-case twins
  complementTable = str.maketrans(bases + bases.upper(),
                                  complements + complements.upper())
  # Reversing first and complementing afterwards yields the same string
  backwards = seq[::-1]
  return backwards.translate(complementTable)

In [None]:
# Load PCR primers
#
# Reads the primer CSV (columns: name, forward primer, reverse primer,
# amplicon size, SNP position) into
#   pcrPrimerData[snpName] = {'f': forward primer, 'r': reverse primer,
#                             'l': amplicon length, 'p': SNP position - 1}
pcrPrimerData = {}

with open(pcrPrimerFile, 'rt') as inFile:
  for line in inFile:
    if line.startswith('SNP'): continue  # header line
    # Blank lines (e.g. a trailing empty line) cannot be unpacked into the
    # five expected columns, so skip them instead of crashing.
    if not line.strip(): continue
    
    pName, pFor, pRev, ampliconSize, snpPos = line.rstrip().split(',')
    # The first 4 characters of the name are dropped to form the key.
    # NOTE(review): presumably a fixed prefix in front of the rs-number - confirm against the CSV.
    # The SNP position is stored minus one so that it can be used directly as
    # a string index into the amplicon sequence.
    pcrPrimerData[pName[4:]] = {'f': pFor, 'r': pRev, 'l': int(ampliconSize), 'p': int(snpPos)-1}
    
print('Got data for {} amplicons'.format(len(pcrPrimerData)))

In [None]:
# Find and extract the amplicons per sample
#
# Every merged read is searched for an exact match of both PCR primers of each
# SNP, on either strand. The sequence from the first primer up to and including
# the second primer is kept (always reported in forward-primer orientation) when
# its length equals the expected amplicon size. Kept amplicons are written to a
# per-sample FASTA file, counted in hitCountData[sample][snp] and stored in
# ampliconData[sample][snp] for the genotyping step.
ampliconData = {}
hitCountData = {}
reportSNP    = 'rs9274701'  # the first amplicon kept for this SNP is printed once as an example

# The primers are identical for every read, so derive their reverse complements
# once here instead of for every read/SNP combination.
primerSearchList = []
for snpName in sorted(pcrPrimerData):
  pFor = pcrPrimerData[snpName]['f']
  pRev = pcrPrimerData[snpName]['r']
  primerSearchList.append((snpName, pFor, pRev,
                           reverseComplement(pFor), reverseComplement(pRev),
                           pcrPrimerData[snpName]['l']))

for sampleName in sampleList:
  fastqFile    = os.path.join(outputDir, '{}_pear.assembled.fastq'.format(sampleName))
  outFastaFile = os.path.join(outputDir, '{}_snp_amplicons.fasta'.format(sampleName))
  # Create the per-SNP containers up front: a sample without any reads then
  # still has a (zero) entry for every SNP when the results are displayed.
  ampliconData[sampleName] = {snpName: [] for snpName in pcrPrimerData}
  hitCountData[sampleName] = {snpName: 0 for snpName in pcrPrimerData}
  
  with open(fastqFile, 'rt') as inFile, open(outFastaFile, 'wt') as outFile:
    for lineNr, line in enumerate(inFile):
      # FASTQ records span 4 lines: 0 = read name, 1 = sequence, 2 and 3 are ignored
      recordLine = lineNr % 4
      
      if recordLine == 0:
        readName = line.rstrip()
        continue
      if recordLine != 1:
        continue
      
      readSeq = line.rstrip()
      
      # Find primers (exact match)
      for snpName, pFor, pRev, pForRc, pRevRc, ampliconSize in primerSearchList:
        ampliconSeq = None
        
        # Check on sense strand
        if pFor in readSeq and pRevRc in readSeq:
          i1 = readSeq.index(pFor)
          i2 = readSeq.index(pRevRc)
          ampliconSeq = readSeq[i1:i2] + pRevRc
          
        # Check on anti-sense strand (only when the sense strand primers were not both found)
        elif pRev in readSeq and pForRc in readSeq:
          i1 = readSeq.index(pRev)
          i2 = readSeq.index(pForRc)
          # Flip the hit so that all stored amplicons share the same orientation
          ampliconSeq = reverseComplement(readSeq[i1:i2] + pForRc)
        
        # Save only if amplicon passes length restrictions
        if ampliconSeq and len(ampliconSeq) == ampliconSize:
          outFile.write('>{}_{}\n'.format(snpName, readName))
          outFile.write(ampliconSeq + '\n')
          hitCountData[sampleName][snpName] += 1
          ampliconData[sampleName][snpName].append(ampliconSeq)
          
          if reportSNP and snpName == reportSNP:
            print('{}: {}'.format(snpName, ampliconSeq))
            print('')
            # Cleared so that the example is printed only once for the whole run
            reportSNP = None

# Display results
for sampleName in sorted(sampleList):
  print(sampleName)
  for snpName in sorted(pcrPrimerData):
    print('  {:>10}: {}'.format(snpName, hitCountData[sampleName][snpName]))
  
  print('')

### 3) Determine the SNP genotype at position 26 in the mappings

In [None]:
# Get sample genotypes
#
# For every sample and SNP, count the base found at the SNP position of each
# extracted amplicon, print the counts and percentages, and derive the likely
# genotype (two alleles). The result is stored as a sorted list of two bases in
# genotypes[sample][snp]; SNPs without any counted read get no entry.
excludeSnp = [] #['rs9274701'] # Amplicon is found, but content is not as expected
genotypes  = {}

for sampleName in sorted(sampleList):
  print("SNP's for sample {}".format(sampleName))
  genotypes[sampleName] = {}
  
  for snpName in sorted(ampliconData[sampleName]):
    if snpName in excludeSnp:
      # Excluded SNPs are listed by name only
      print('  {:>10}'.format(snpName))
      continue
    
    genotype = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
    snpPos   = pcrPrimerData[snpName]['p']
    
    for amplicon in ampliconData[sampleName][snpName]:
      base = amplicon[snpPos]
      # Anything other than A/C/G/T (e.g. an N call) carries no genotype
      # information and is left out instead of raising a KeyError.
      if base in genotype:
        genotype[base] += 1
      
    o = '  {:>10}  '.format(snpName)
    t = sum(genotype.values())
    
    if t:
      # Absolute counts per base
      for g in sorted(genotype):
        o += '  {}:{:<5}'.format(g, genotype[g])
        
      # Same counts as percentage of all counted reads
      o += '      '
      for g in sorted(genotype):
        o += '  {}:{:>3}%'.format(g, round(100*genotype[g]/t))
        
      o += '    {:>5} reads'.format(t)
    
      # Likely genotype
      # Rank the alleles by read count, highest first. The sort is stable, so
      # alleles with equal counts are both kept (in dict order) and a perfectly
      # balanced heterozygote yields two different alleles.
      ranked       = sorted(genotype, key=genotype.get, reverse=True)
      mainAllele   = ranked[0]
      secondAllele = ranked[1]
      
      # Second allele must be more than 10% of main allele or it is ignored.
      # Third and fourth alleles are always ignored if present.
      if genotype[secondAllele] > genotype[mainAllele]/10:
        aa = mainAllele + secondAllele
      else:
        aa = mainAllele + mainAllele
      o += '    {}'.format(''.join(sorted(aa)))
      
      genotypes[sampleName][snpName] = sorted(aa)
    
    print(o)
  print('')
  

In [None]:
# Genotype overview
#
# Prints one table: a header row with the sample names, then one row per SNP
# with the genotype of every sample (blank when no genotype was called).
sampleOrder = sorted(genotypes)
print('SNP       ' + ''.join(' {:>10}'.format(sampleName) for sampleName in sampleOrder))

for snpName in sorted(pcrPrimerData):
  cells = ['{:>10}'.format(snpName)]
  
  for sampleName in sampleOrder:
    # Samples lacking a call for this SNP contribute an empty cell
    alleles = genotypes[sampleName].get(snpName, '')
    cells.append(' {:>10}'.format(''.join(sorted(alleles))))
  
  print(''.join(cells))
