 Cutadapt:
 
 http://cutadapt.readthedocs.io/en/stable/guide.html

In [1]:
import os
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Both read directories live under the same sequencing-run folder.
run_dir = '../../Yeast2HybridData/NextSeq-2022-01-10-L68/'

untrimmed_seqs_path = run_dir + 'Untrimmed/'  # raw reads (cutadapt input)
trimmed_seqs_path = run_dir + 'Trimmed/'      # cutadapt output

In [3]:
# Make sure both data directories exist.
# Each directory is handled independently: previously the trimmed directory was
# only created when the untrimmed one was missing, so an existing Untrimmed/
# with no Trimmed/ left cutadapt without an output directory, and an existing
# Trimmed/ with no Untrimmed/ raised FileExistsError.
# exist_ok=True makes the cell safe to re-run.
for seqs_dir in (untrimmed_seqs_path, trimmed_seqs_path):
    os.makedirs(seqs_dir, exist_ok=True)

## Remove adaptors

#### List of FASTQ filenames

In [4]:
# Collect the gzip-compressed inputs in a stable (sorted) order.
# For uncompressed input, match on the '.fastq' extension instead of '.gz'.
files = sorted(
    filename
    for filename in os.listdir(untrimmed_seqs_path)
    if filename.endswith('.gz')
)

In [5]:
# Display the discovered input files as a sanity check before trimming.
files

['L68_HIS_S2_R1_001.fastq.gz', 'L68_TRP_S1_R1_001.fastq.gz']

### Define sequences surrounding the binders and the barcodes

In [6]:

# This version is for 71/237bp read1/read2.
# Constant sequences flanking barcode 1: the prefix sits on the 5' side of the
# barcode and the suffix on the 3' side.
barcode1_prefix, barcode1_suffix = (
    "CGGGGCGGGGTTATTACGACAT",
    "CCTCCAGACAGCGTTCCGTTA",
)


### Indexes

In [7]:

# Map each input file name to the [prefix, suffix] pair that flanks its barcode.
# Every file in this run uses the same pair; each entry gets its own list.
files_dict = {
    fastq_name: [barcode1_prefix, barcode1_suffix]
    for fastq_name in files
}
    

In [8]:
# Display the file -> [prefix, suffix] mapping to confirm what will be trimmed.
files_dict

{'L68_HIS_S2_R1_001.fastq.gz': ['CGGGGCGGGGTTATTACGACAT',
  'CCTCCAGACAGCGTTCCGTTA'],
 'L68_TRP_S1_R1_001.fastq.gz': ['CGGGGCGGGGTTATTACGACAT',
  'CCTCCAGACAGCGTTCCGTTA']}

### Loop through and trim each FASTQ file with cutadapt

In [9]:
# Run cutadapt once per input file, writing the trimmed reads under the same
# file name in trimmed_seqs_path. Lines starting with `!` are IPython shell
# escapes; `{name}` is replaced by the value of that Python variable before
# the command is run.
for path, affix in files_dict.items():
    # input and output paths (plain concatenation: the directory strings end in '/')
    untrimmed = untrimmed_seqs_path + path
    trimmed = trimmed_seqs_path + path
    
    
    # just trim off the 5' end (only one flanking sequence was supplied)
    if len(affix) == 1:
        # NOTE(review): despite its name, `suffix` is passed to -g here, i.e. it
        # is treated as the sequence on the 5' side of the barcode.
        suffix = affix[0]
        # Same command for a cutadapt installed under ~/.local/bin (note: it uses -m 17):
        #!~/.local/bin/cutadapt -g {suffix} -m 17 -M 23 -e 0.1 -o {trimmed} {untrimmed}
        # NOTE(review): this uses -m 7 while every other command in this cell uses
        # -m 17 -- confirm which minimum length is intended.
        !cutadapt -g {suffix} -m 7 -M 23 -e 0.1 -o {trimmed} {untrimmed}
    
    # trim off before and after the barcode (both flanking sequences supplied);
    # this is the branch taken for every entry currently in files_dict
    else:
        prefix = affix[0]
        suffix = affix[1]
        # Same command for a cutadapt installed under ~/.local/bin:
        #!~/.local/bin/cutadapt -g {prefix}...{suffix} -m 17 -M 23 -e 0.1 -o {trimmed} {untrimmed}
        !cutadapt -g {prefix}...{suffix} -m 17 -M 23 -e 0.1 -o {trimmed} {untrimmed}
        
# -g PREFIX...SUFFIX specifies linked adapters, neither of which is 'anchored'.
# -m N / -M N -- tell cutadapt to discard reads whose trimmed sequence is shorter than N /
# longer than N nucleotides (the linked-adapter command above uses -m 17 -M 23).
# Without a minimum length you sometimes end up with empty rows, which breaks later processing
# with pandas: read_csv automatically skips empty rows, which throws the records out of
# alignment (the read names are kept) and makes life difficult.
# -e 0.1 -- maximum error rate allowed when matching the adapter sequences.

This is cutadapt 1.18 with Python 3.6.9
Command line parameters: -g CGGGGCGGGGTTATTACGACAT...CCTCCAGACAGCGTTCCGTTA -m 17 -M 23 -e 0.1 -o ../../Yeast2HybridData/NextSeq-2022-01-10-L68/Trimmed/L68_HIS_S2_R1_001.fastq.gz ../../Yeast2HybridData/NextSeq-2022-01-10-L68/Untrimmed/L68_HIS_S2_R1_001.fastq.gz
Processing reads on 1 core in single-end mode ...
Finished in 213.92 s (26 us/read; 2.35 M reads/minute).

=== Summary ===

Total reads processed:               8,375,754
Reads with adapters:                 8,339,898 (99.6%)
Reads that were too short:               8,966 (0.1%)
Reads that were too long:               44,765 (0.5%)
Reads written (passing filters):     8,322,023 (99.4%)

Total basepairs processed: 2,052,059,730 bp
Total written (filtered):    166,431,486 bp (8.1%)

=== Adapter 2 ===

Sequence: CGGGGCGGGGTTATTACGACAT...CCTCCAGACAGCGTTCCGTTA; Type: linked; Length: 22+21; 5' trimmed: 8339898 times; 3' trimmed: 8339898 times

No. of allowed errors:
0-9 bp: 0; 10-19 bp: 1; 20-22 