# Trim Primers

Given a single-sequence FASTQ file and a primer-name prefix, writes a new `<file>.trimmed` FASTQ file containing only the sequence between the first and last primer hits. The original file is left unchanged.

## Setup

In [2]:
%load_ext autoreload
%autoreload 1
%aimport RCUtils

In [3]:
from Bio import Align
from Bio import SeqIO
import matplotlib_inline.backend_inline
import os
import pandas as pd
import RCUtils
import random
import glob

# Load the primer definitions from HRVPrimers.fasta once, for use by getPrimers below.
# NOTE(review): display=True presumably produces the per-primer listing shown in
# this cell's output - confirm against RCUtils.readPrimers.
qPCRprimers = RCUtils.readPrimers("HRVPrimers.fasta", display=True)

def getPrimers(name):
    """Return the loaded primers whose name starts with ``name``.

    Args:
        name: Prefix to match against each primer's ``name`` attribute
            (e.g. ``"ENTrc"`` selects ``ENTrc-f1``, ``ENTrc-f2`` and ``ENTrc-r``).

    Returns:
        A list of the matching entries from ``qPCRprimers``, in their original
        order. The list is empty when nothing matches.
    """
    # Return a concrete list rather than a lazy filter object: a filter can only
    # be consumed once, so reusing the result for a second file (or calling
    # len() on it) would silently see no primers at all.
    return [primer for primer in qPCRprimers if primer.name.startswith(name)]

from pathlib import Path

def trimAllPrimers(pathGlob,primers):
    """Run trimPrimers on every file matching a glob pattern.

    Args:
        pathGlob: Glob pattern selecting the FASTQ files to process.
        primers: Iterable of primers to search for in each file. It may be a
            one-shot iterator (such as a ``filter`` object); it is copied into
            a list so that every file is checked against the full set.
    """
    # Materialize once: passing the same lazy iterator to each call would leave
    # it exhausted after the first file, so later files would report no hits.
    primerList = list(primers)
    # glob.glob returns matches in arbitrary order; sort for reproducible output.
    for path in sorted(glob.glob(pathGlob)):
        trimPrimers(path, primerList)

def trimPrimers(fastQPath, primers):
    """Trim a FASTQ record down to the region between its primer hits.

    Reads the record at ``fastQPath``, locates primer hits with
    ``RCUtils.computePrimerHits`` and prints a short report of each hit. When
    at least two hits are found and the first hit ends at or before the start
    of the last hit, the slice between them is written to
    ``fastQPath + ".trimmed"`` in FASTQ format. Otherwise an error line is
    printed and nothing is written.

    Args:
        fastQPath: Path of the FASTQ file to read.
        primers: Primers to search for, passed through to
            ``RCUtils.computePrimerHits``.

    Returns:
        None.
    """
    record = SeqIO.read(fastQPath, "fastq")
    hits = RCUtils.computePrimerHits(record, primers)

    # Report header (preceded by a blank line) followed by one line per hit.
    print()
    print(f"{fastQPath}, len={len(record.seq)}")
    for hit in hits:
        print(f" {hit.primer.name} {hit.start}-{hit.end} ({hit.mr*100:.0f}%)")

    # Need a hit on each side of the region to keep.
    if len(hits) < 2:
        print("  ERROR: didn't get 2 primer hits")
        return

    # Keep everything after the first hit and before the last one.
    # NOTE(review): assumes computePrimerHits returns hits ordered by position - confirm.
    start = hits[0].end
    end = hits[-1].start
    if start > end:
        print("  ERROR: hits in wrong order")
        return

    trimmed = record[start:end]
    print(f" trimmed to {start}-{end} len={len(trimmed.seq)}")
    SeqIO.write(trimmed, fastQPath+".trimmed", "fastq")
    

Reading primers: HRVPrimers.fasta
  ENTng-f (2 variations)
  ENTng-r
  ENTng-p (8 variations)
  ENTrc-f1
  ENTrc-f2
  ENTrc-r
  HRVma-f
  HRVma-r
  HRVma-p
  HRVka5-f
  HRVka5-ro
  HRVka5-ri
  HRVbo-f (4 variations)
  HRVbo-r
  HRVbo-p
Read 26 primers


In [27]:
# Trim each read using the primers whose names start with the paired prefix.
for fastQPath, primerPrefix in [
    ("myseqs/S28-RVA-23.fastq", "ENTrc"),
    ("myseqs/S44-RVA-56.fastq", "ENTrc"),
    ("myseqs/S48-RVC-1.fastq", "ENTrc"),
    ("myseqs/S59-RVA-77.fastq", "HRVka5"),
]:
    trimPrimers(fastQPath, getPrimers(primerPrefix))



myseqs/S28-RVA-23.fastq, len=392
 ENTrc-f1 1-22 (100%)
 ENTrc-r 370-391 (100%)
 trimmed to 22-370 len=348

myseqs/S44-RVA-56.fastq, len=417
 ENTrc-f1 23-44 (95%)
 ENTrc-r 394-415 (100%)
 trimmed to 44-394 len=350

myseqs/S48-RVC-1.fastq, len=397
 ENTrc-f1 1-22 (100%)
 ENTrc-r 374-395 (100%)
 trimmed to 22-374 len=352

myseqs/S59-RVA-77.fastq, len=437
 HRVka5-f 16-37 (100%)
 HRVka5-ri 291-309 (100%)
 HRVka5-ro 388-411 (100%)
 trimmed to 37-388 len=351


In [None]:
# Trim the second batch of reads against the ENTrc primers.
# NOTE(review): the saved output for this cell lists an HRVka5-ri hit even though
# only ENTrc primers are requested here, and the cell has no execution count -
# the output looks stale; re-run to confirm.
for fastQPath in [
    "myseqs/S142-RVA-62.fastq",
    "myseqs/S147-RVC-44.fastq",
    "myseqs/S148-RVA-68.fastq",
    "myseqs/S153-RVB-27.fastq",
]:
    trimPrimers(fastQPath, getPrimers("ENTrc"))



myseqs/S142-RVA-62.fastq, len=346
  ERROR: didn't get 2 primer hits

myseqs/S147-RVC-44.fastq, len=338
  ERROR: didn't get 2 primer hits

myseqs/S148-RVA-68.fastq, len=349
  ERROR: didn't get 2 primer hits

myseqs/S153-RVB-27.fastq, len=333
 HRVka5-ri 60-78 (100%)
  ERROR: didn't get 2 primer hits
