# 1. Source

Click on the link to go to the source web page of **Rosalind**: [Read Quality Distribution](https://rosalind.info/problems/phre/)

 **Problem**
 
 ![FASTQ Format Introduction](phre_problem.png "FASTQ Format Introduction")

**Sample Dataset**

28<br>
@Rosalind_0041<br>
GGCCGGTCTATTTACGTTCTCACCCGACGTGACGTACGGTCC<br>
+<br>
6.3536354;.151<211/0?::6/-2051)-*"40/.,+%)<br>
@Rosalind_0041<br>
TCGTATGCGTAGCACTTGGTACAGGAAGTGAACATCCAGGAT<br>
+<br>
AH@FGGGJ<GB<<9:GD=D@GG9=?A@DC=;:?>839/4856<br>
@Rosalind_0041<br>
ATTCGGTAATTGGCGTGAATCTGTTCTGACTGATAGAGACAA<br>
+<br>
@DJEJEA?JHJ@8?F?IA3=;8@C95=;=?;>D/:;74792.

**Sample Output**

1

# 2. Workspace

In [1]:
# read the threshold (first line) and collect the quality line of every fastq record

qc_scores = []

with open('phre_test.txt', 'r') as file:
    threshold = file.readline().rstrip()
    # every record spans four lines: header, sequence, '+', quality
    # an empty header line (length 0) means the end of the file was reached
    while file.readline():
        file.readline()  # sequence line - not needed
        file.readline()  # '+' separator line - not needed
        qc_scores.append(file.readline().rstrip())

# show what was parsed

print('Threshold:', threshold, '\n')
print('QC Scores:', *qc_scores, sep = '\n')

Threshold: 28 

QC Scores:
6.3536354;.151<211/0?::6/-2051)-*"40/.,+%)
AH@FGGGJ<GB<<9:GD=D@GG9=?A@DC=;:?>839/4856
@DJEJEA?JHJ@8?F?IA3=;8@C95=;=?;>D/:;74792.


In [2]:
# peek at the first parsed quality string
qc_scores[0]

'6.3536354;.151<211/0?::6/-2051)-*"40/.,+%)'

In [3]:
# each character of a quality string encodes a numeric score
# fastq uses the phred+33 scale: score = ascii code - 33

def phredToNum(qcChar):
    '''Return the numeric quality score of one phred+33 encoded character.'''
    return ord(qcChar) - 33

In [4]:
# test the func: 'A' has ascii code 65, so the expected score is 65 - 33 = 32

phredToNum('A')

32

In [5]:
# another function for calculation of average qc of a sequence

def meanQC(qcSequence):
    '''Return the arithmetic mean of the phred scores encoded in qcSequence.'''
    scores = list(map(phredToNum, qcSequence))
    return sum(scores) / len(scores)

In [6]:
# test the func on a quality string of the sample dataset

meanQC('6.3536354;.151<211/0?::6/-2051)-*"40/.,+%)') # the first seq in the input file

16.261904761904763

In [7]:
meanQC('AH@FGGGJ<GB<<9:GD=D@GG9=?A@DC=;:?>839/4856') # the second seq in the input file

29.80952380952381

In [8]:
meanQC('@DJEJEA?JHJ@8?F?IA3=;8@C95=;=?;>D/:;74792.') # the third seq in the input file

28.904761904761905

In [9]:
# so loop over qc_scores to see how many of them have a mean qc below the threshold

# the threshold read from the file is still a string (e.g. '28'); convert it to a number
# instead of hard-coding it, so that this cell gives the right answer for any input file
threshold = int(threshold)

counter = 0

for qcSeq in qc_scores:
    if meanQC(qcSeq) < threshold:
        counter += 1

# print counter

print(counter)

1


In [10]:
# we can use biopython's FastqGeneralIterator to parse fastq and extract qc seqs

from Bio.SeqIO.QualityIO import FastqGeneralIterator

qc_scores = list()

# the context manager closes the file even if parsing raises an error
with open('phre_test.txt', 'r') as file:
    # the first line is the threshold; the iterator then consumes the remaining fastq records
    threshold = file.readline().rstrip()
    # every item is an (identifier, sequence, quality) tuple of strings
    for identifier, sequence, quality in FastqGeneralIterator(file):
        qc_scores.append(quality)

# print

print(threshold)
print(qc_scores)

28
['6.3536354;.151<211/0?::6/-2051)-*"40/.,+%)', 'AH@FGGGJ<GB<<9:GD=D@GG9=?A@DC=;:?>839/4856', '@DJEJEA?JHJ@8?F?IA3=;8@C95=;=?;>D/:;74792.']


In [11]:
# and we can use biopython's seqio module for both parsing and qc calculation

from Bio import SeqIO

# first read all the lines of the input

with open('phre_test.txt', 'r') as file:
    lines = file.readlines()

# write everything from the second line on into a new file - a pure fastq file (first line was just a number)

with open('phre_test_2.fastq', 'w') as file:
    file.writelines(lines[1:])

# store the first line as the threshold and print it

threshold = int(lines[0].strip())
print('Threshold:', threshold)
del lines # no need for the lines anymore since they were written to the second file

##

counter = 0

# parse the fastq file written above and count the records whose mean quality is below the threshold
# note: the path has to be the file written above ('phre_test_2.fastq')

for record in SeqIO.parse('phre_test_2.fastq', 'fastq'):
    qc_scores = record.letter_annotations['phred_quality']
    if sum(qc_scores) / len(qc_scores) < threshold:
        counter += 1

# print counter

print('Counter / Answer:', counter)

Threshold: 28
Counter / Answer: 1


### --A Simple Speed Test

In [12]:
# create a larger file
# 500k dna sequences

import random
random.seed(10)

# the context manager guarantees that the file is flushed and closed even if writing fails
with open('phre_speed_test.fastq', 'w') as test_file:

    # the first line is the threshold, same layout as the rosalind input
    test_file.write('20\n')

    # one four-line fastq record per iteration: header, sequence, '+', quality
    for i in range(500000):
        test_file.write(f'@Rosalind_sequence_name{i}\n')
        test_file.write(f"{''.join(random.choices('ACGT', k = 150))}\n")
        test_file.write('+\n')
        test_file.write(f"{''.join(random.choices('.,A3;5:>=0+2*1!9D47#F<-8@6B', k = 150))}\n")

In [13]:
# option 1
# own parser
# own functions for qc

In [14]:
%%timeit -n 10

# timed: hand-written parser followed by the meanQC based count
# the first line of the file is the threshold, then one fastq record every four lines

qc_scores = list()

with open('phre_speed_test.fastq', 'r') as file:
    threshold = int(file.readline().rstrip())
    while True:
        # header line of the next record; an empty string means end of file
        if len(file.readline()) == 0:
            break
        file.readline() # sequence line - skipped
        file.readline() # '+' separator line - skipped
        qc_score = file.readline().rstrip()
        qc_scores.append(qc_score)
        
# count the records whose mean quality is below the threshold
counter = 0

for qcSeq in qc_scores:
    if meanQC(qcSeq) < threshold:
        counter += 1

7.9 s ± 5.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# option 2
# biopython parser
# own functions for qc

In [16]:
%%timeit -n 10

# timed: biopython's FastqGeneralIterator for parsing, own meanQC for the count

from Bio.SeqIO.QualityIO import FastqGeneralIterator

qc_scores = list()

file = open('phre_speed_test.fastq', 'r')
# consume the threshold line first so that the iterator only sees fastq records
threshold = int(file.readline().rstrip())
for triplet in FastqGeneralIterator(file):
    # only the quality string of each (identifier, sequence, quality) triplet is kept
    identifier, sequence, quality = triplet
    qc_scores.append(quality)    
file.close()

# count the records whose mean quality is below the threshold
counter = 0

for qcSeq in qc_scores:
    if meanQC(qcSeq) < threshold:
        counter += 1

8.16 s ± 8.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
# option 3
# biopython parser
# biopython annotation for qc

In [18]:
%%timeit -n 10

# timed: copy of the input without its threshold line, then biopython's SeqIO for parsing and scores
# note that the timing includes reading the whole input and writing the second file

from Bio import SeqIO

with open('phre_speed_test.fastq', 'r') as file:
    lines = file.readlines()

# everything after the first line is written to a second file that SeqIO can parse as fastq
with open('phre_speed_test_2.fastq', 'w') as file:
    file.writelines(lines[1:])

threshold = int(lines[0].strip()) 
del lines 

counter = 0

for record in SeqIO.parse('phre_speed_test_2.fastq', 'fastq'):
    # numeric per-base quality values of the record
    qc_scores = record.letter_annotations['phred_quality']
    # threshold is already an int here, so the int() call does not change its value
    if sum(qc_scores) / len(qc_scores) < int(threshold):
        counter += 1

5.73 s ± 37.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
# implementation: option 1 (own parser + own qc functions)
# option 3 is faster than option 1 by about 2.2 s per loop (5.73 s vs 7.9 s)
# however, option 3 copies the input into a second file and reads all lines into memory at once

# 3. Implementation

In [20]:
def phredToNum(qcChar):
    
    '''
    input
        a single ascii char of a fastq quality string
    process
        subtracts the phred+33 offset (33) from the ascii code of the char
    output
        the quality score as an integer
    '''
    
    # ascii code minus the phred+33 offset
    return ord(qcChar) - 33

In [21]:
def meanQC(qcSequence):
    
    '''
    input
        a quality string, one phred+33 encoded ascii char per base
    process
        decodes every char with phredToNum and averages the scores
    output
        the mean score as a float
    '''
    
    # decode all chars first, then average them
    scores = list(map(phredToNum, qcSequence))
    return sum(scores) / len(scores)

In [22]:
def phreFileParser(filename):
    
    '''
    input
        path of a file whose first line is an integer threshold,
        followed by fastq records (four lines each: header, sequence, '+', quality)
    process
        reads the threshold and collects the quality line (4th line) of each record
        blank lines where a record header is expected (e.g. trailing empty lines) are skipped
    output
        a tuple: (threshold as int, list of quality strings)
    raises
        ValueError if the first line is not an integer
    '''
    
    # quality strings, one per record
    qc_scores = list()

    # open and parse file
    with open(filename, 'r') as file:
        
        # take threshold
        threshold = int(file.readline().strip())
        
        # loop over the rest of the file, one record per iteration
        while True:
            header = file.readline()
            
            # empty string -> end of file
            if not header:
                break
            
            # a blank line is not a record header; skipping it avoids storing
            # an empty quality string, which has no mean
            if not header.strip():
                continue
            
            file.readline()  # sequence line - not needed
            file.readline()  # '+' separator line - not needed
            qc_scores.append(file.readline().rstrip())
    
    # return
    return threshold, qc_scores

In [23]:
def phre(filename):
    
    '''
    input
        path of a file containing a threshold and fastq records
    process
        parses the file into an integer threshold and a list of quality strings
        counts the quality strings whose mean score is below the threshold
    output
        prints the count to console
        writes the count into '<filename without extension>_answer.txt'
        returns nothing
    '''
    
    # local import: only needed to build the name of the answer file
    import os
    
    # get threshold and qc score list
    threshold, qc = phreFileParser(filename)
    
    # count the quality strings whose mean is below the threshold
    counter = sum(1 for qcSeq in qc if meanQC(qcSeq) < threshold)
            
    # print answer to console
    print('\n\x1B[1mANSWER\x1B[0m\n______\n')
    print(f'{counter}')
    
    # strip only the extension; splitting on the first '.' would break
    # paths such as '../data/x.txt' or 'dir.v2/x.txt'
    answer_path = f'{os.path.splitext(filename)[0]}_answer.txt'
    
    # write answer; the context manager closes the file even if writing fails
    with open(answer_path, 'w') as file:
        file.write(f'{counter}')
    print('\n\n#! The answer has been written into the file:',
          f'\x1B[1m./{answer_path}\x1B[0m\n')
# 4. Execution

In [24]:
# run on the sample dataset of the problem statement (sample output: 1)
phre('phre_test.txt')


[1mANSWER[0m
______

1


#! The answer has been written into the file: [1m./phre_test_answer.txt[0m



In [25]:
# run on the 500k-record file generated for the speed test
phre('phre_speed_test.fastq')


[1mANSWER[0m
______

117175


#! The answer has been written into the file: [1m./phre_speed_test_answer.txt[0m



In [26]:
# presumably a dataset downloaded from Rosalind (judging by the file name) - verify
phre('rosalind_phre_1_dataset.txt')


[1mANSWER[0m
______

28


#! The answer has been written into the file: [1m./rosalind_phre_1_dataset_answer.txt[0m



In [29]:
# presumably another dataset downloaded from Rosalind (judging by the file name) - verify
phre('rosalind_phre.txt')


[1mANSWER[0m
______

27


#! The answer has been written into the file: [1m./rosalind_phre_answer.txt[0m



<p style='text-align: right;'>
    <!--<b><font size = '5'>Contact</font></b><br>-->
    <b>Orcun Tasar</b><br>
    <i>Bioinformatician / Data Scientist</i><br>
    orcuntasar |at@| ogr.iu.edu.tr<br>
    tasar.orcun |at@| gmail.com<br>
    <a href = 'https://www.linkedin.com/in/orçun-taşar-7b5992a1/'>Linkedin</a> | <a href = 'https://www.instagram.com/shatranuchor/'>Instagram</a>
</p>