# 1. Source

Click on the link to go to the source web page of **Rosalind**: [Computing GC Content](https://rosalind.info/problems/gc/)

**Problem**

![Computing GC Content](gc_problem.png 'Computing GC Content')

**Sample Dataset**

\>Rosalind_6404<br>
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC<br>
TCCCACTAATAATTCTGAGG<br>
\>Rosalind_5959<br>
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT<br>
ATATCCATTTGTCAGCAGACACGC<br>
\>Rosalind_0808<br>
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC<br>
TGGGAACCTGCGGGCAGTAGGTGGAAT<br>

**Sample Output**

Rosalind_0808<br>
60.919540

# 2. Workspace

In [1]:
# parse the FASTA-style input file into an identifier -> sequence mapping

sequence_dict = {}

with open('gc_test_1.txt', 'r') as file:
    for line in file:
        # sequence lines are upper-cased and appended to the most recent record
        if not line.startswith('>Rosalind'):
            sequence_dict[identifier] += line.strip().upper()
            continue

        # header line: drop the leading '>' and open a new, empty record
        identifier = line[1:].strip()
        sequence_dict[identifier] = ''

In [2]:
# inspect the parsed identifier -> sequence pairs

sequence_dict

{'Rosalind_6404': 'CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG',
 'Rosalind_5959': 'CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC',
 'Rosalind_0808': 'CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT'}

In [3]:
# look up a single sequence by its identifier
sequence_dict['Rosalind_0808']

'CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT'

In [4]:
# write a sub-function to calculate gc content

def gc_calc(sequence):
    '''
    input
        a dna sequence string (upper or lower case)
    process
        counts the G and C bases and divides by the sequence length
    output
        gc content as a percentage (float dtype); 0.0 for an empty sequence
    '''
    totalL = len(sequence)

    # an empty sequence would otherwise raise ZeroDivisionError
    if totalL == 0:
        return 0.0

    # normalise the case so lower-case bases are counted as well
    sequence = sequence.upper()
    countG = sequence.count('G')
    countC = sequence.count('C')

    return 100 * (countG + countC) / totalL

In [5]:
# sanity check: 10 of the 20 bases are G or C, so the expected result is 50.0

gc_calc('AAAAACCCCCTTTTTGGGGG')

50.0

In [6]:
# with every sequence parsed, compute the gc content of each one
# and keep the results in a second dictionary keyed by identifier

gc_content_dict = {
    identifier: gc_calc(seq)
    for identifier, seq in sequence_dict.items()
}

# show the resulting identifier -> gc content mapping

print(gc_content_dict)

{'Rosalind_6404': 53.75, 'Rosalind_5959': 53.57142857142857, 'Rosalind_0808': 60.91954022988506}


In [7]:
# rosalind only asks for the record with the highest gc content,
# so the full mapping does not have to be returned

# pick the (identifier, gc content) pair whose gc content is the largest

max(gc_content_dict.items(), key = lambda pair: pair[1])

('Rosalind_0808', 60.91954022988506)

In [8]:
# the intermediate gc_content_dict can be skipped as well:
# while looping over sequence_dict, only the best record seen so far is tracked

max_identifier, max_gc_content = '', 0

for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    # only a strictly greater value replaces the current leader,
    # so on a tie the earlier record is kept
    if gcContent > max_gc_content:
        max_identifier, max_gc_content = identifier, gcContent

# show the winning identifier and its gc content

print(max_identifier)
print(max_gc_content)

Rosalind_0808
60.91954022988506


In [9]:
# the gc content can also be calculated using biopython
# instead of my gc_calc function
# NOTE(review): Bio.SeqUtils.GC appears to be deprecated/removed in newer
# Biopython releases in favour of gc_fraction - confirm against the installed version

from Bio.SeqUtils import GC
GC('AAAATTTTGGGGCCCC')

50.0

In [10]:
# repeat the running-maximum solution, scoring with biopython's GC() instead of gc_calc()

from Bio.SeqUtils import GC

max_identifier, max_gc_content = '', 0

for identifier, seq in sequence_dict.items():
    gcContent = GC(seq)
    # strictly greater only: the first record wins a tie
    if gcContent > max_gc_content:
        max_identifier, max_gc_content = identifier, gcContent

# show the winning identifier and its gc content

print(max_identifier)
print(max_gc_content)

Rosalind_0808
60.91954022988506


In [11]:
# following the same idea, even sequence_dict can be skipped:
# every record is scored as soon as it is read from the file,
# this time using biopython's fasta parser

from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqUtils import GC

max_identifier, max_gc_content = '', 0

with open('gc_test_1.txt', 'r') as file:
    for identifier, sequence in SimpleFastaParser(file):
        gcContent = GC(sequence)
        # strictly greater only: the first record wins a tie
        if gcContent > max_gc_content:
            max_identifier, max_gc_content = identifier, gcContent

# show the winning identifier and its gc content

print(max_identifier)
print(max_gc_content)

Rosalind_0808
60.91954022988506


In [12]:
# there might be other options out there, but
# let's run speed and memory tests for the ones studied here

### --A Simple Speed Test

In [13]:
# for testing purposes use a larger dataset: gc_test_2.txt

# count the '>' header markers line by line: the context manager closes the file
# even if reading fails, and the whole file is never held in memory at once
with open('gc_test_2.txt', 'r') as file:
    sequence_count = sum(line.count('>') for line in file)

print(f'There are {sequence_count} sequences in the file.')

There are 195840 sequences in the file.


In [14]:
# option 1
# sequence_dict + gc_content_dict
# our own gc_calc func
# take the max at the end

In [15]:
%%timeit -n 20

# benchmark body for option 1: parse into sequence_dict, score every record
# into gc_content_dict, then take the maximum over that dictionary
# NOTE: gc_calc is defined inside the timed cell, so its definition is part of the measurement

sequence_dict = dict()

# a '>Rosalind' header opens a new record; every other line extends the latest one
with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            sequence_dict[identifier] += line.strip().upper()
            
# percentage of G and C bases in the given sequence
def gc_calc(sequence):
    countG = sequence.count('G')
    countC = sequence.count('C')
    totalL = len(sequence)
    calculation = 100 * (countG + countC) / totalL
    return calculation

gc_content_dict = dict()

# store the gc content of every record under its identifier
for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    gc_content_dict[identifier] = gcContent
    
# select the (identifier, gc content) pair with the highest gc content
max(gc_content_dict.items(), key = lambda x: x[1])

1.34 s ± 4.96 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [16]:
# option 2
# only sequence dict
# our own gc_calc func

In [17]:
%%timeit -n 20

# benchmark body for option 2: parse into sequence_dict, then keep only the
# running maximum instead of a second dictionary
# NOTE: gc_calc is defined inside the timed cell, so its definition is part of the measurement

sequence_dict = dict()

# a '>Rosalind' header opens a new record; every other line extends the latest one
with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            sequence_dict[identifier] += line.strip().upper()
            
# percentage of G and C bases in the given sequence
def gc_calc(sequence):
    countG = sequence.count('G')
    countC = sequence.count('C')
    totalL = len(sequence)
    calculation = 100 * (countG + countC) / totalL
    return calculation

max_identifier = ''
max_gc_content = 0

# strictly-greater comparison: the first record wins a tie
for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    if gcContent > max_gc_content:
        max_gc_content = gcContent 
        max_identifier = identifier

1.33 s ± 3.97 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [18]:
# option 3
# only sequence dict
# biopython's GC() func

In [19]:
%%timeit -n 20

# benchmark body for option 3: same as option 2, but scoring with biopython's GC()
# NOTE: the import statement sits inside the timed cell, so it is part of the measurement

from Bio.SeqUtils import GC

sequence_dict = dict()

# a '>Rosalind' header opens a new record; every other line extends the latest one
with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            sequence_dict[identifier] += line.strip().upper()

max_identifier = ''
max_gc_content = 0

# strictly-greater comparison: the first record wins a tie
for identifier, seq in sequence_dict.items():
    gcContent = GC(seq)
    if gcContent > max_gc_content: 
        max_gc_content = gcContent 
        max_identifier = identifier 

1.33 s ± 5.47 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [20]:
# option 4
# no dictionary
# biopython's fasta parser
# biopython's GC() func

In [21]:
%%timeit -n 20

# benchmark body for option 4: no dictionary at all, every record is scored
# while the file is being read through biopython's SimpleFastaParser
# NOTE: the import statements sit inside the timed cell, so they are part of the measurement

from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqUtils import GC

max_identifier = ''
max_gc_content = 0

# the parser is consumed as (identifier, sequence) pairs, one per record
with open('gc_test_2.txt', 'r') as file:
    for identifier, sequence in SimpleFastaParser(file):
        gcContent = GC(sequence)
        # strictly-greater comparison: the first record wins a tie
        if gcContent > max_gc_content:
            max_gc_content = gcContent
            max_identifier = identifier

1.86 s ± 15.4 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


### --A Simple Memory Usage Test

In [22]:
# judging by runtime, all solutions perform similarly
# so measure the memory usage as well to select the best one

%load_ext memory_profiler

In [23]:
# option 1
# sequence_dict + gc_content_dict
# our own gc_calc func
# take the max at the end

In [24]:
%%memit -r 20

# memory profile body for option 1: sequence_dict + gc_content_dict, max at the end
# NOTE(review): earlier cells already ran the same workload in this kernel, so the
# reported increment presumably understates the real footprint - confirm in a fresh kernel

sequence_dict = dict()

# a '>Rosalind' header opens a new record; every other line extends the latest one
with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            sequence_dict[identifier] += line.strip().upper()
            
# percentage of G and C bases in the given sequence
def gc_calc(sequence):
    countG = sequence.count('G')
    countC = sequence.count('C')
    totalL = len(sequence)
    calculation = 100 * (countG + countC) / totalL
    return calculation

gc_content_dict = dict()

# store the gc content of every record under its identifier
for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    gc_content_dict[identifier] = gcContent
    
# select the (identifier, gc content) pair with the highest gc content
max(gc_content_dict.items(), key = lambda x: x[1])

peak memory: 68.54 MiB, increment: 2.66 MiB


In [25]:
# option 2
# only sequence dict
# our own gc_calc func

In [26]:
%%memit -r 20

# memory profile body for option 2: sequence_dict only, running maximum
# NOTE(review): earlier cells already ran the same workload in this kernel, so the
# reported increment presumably understates the real footprint - confirm in a fresh kernel

sequence_dict = dict()

# a '>Rosalind' header opens a new record; every other line extends the latest one
with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            sequence_dict[identifier] += line.strip().upper()
            
# percentage of G and C bases in the given sequence
def gc_calc(sequence):
    countG = sequence.count('G')
    countC = sequence.count('C')
    totalL = len(sequence)
    calculation = 100 * (countG + countC) / totalL
    return calculation

max_identifier = ''
max_gc_content = 0

# strictly-greater comparison: the first record wins a tie
for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    if gcContent > max_gc_content:
        max_gc_content = gcContent 
        max_identifier = identifier

peak memory: 67.14 MiB, increment: 0.01 MiB


In [27]:
# option 3
# only sequence dict
# biopython's GC() func

In [28]:
%%memit -r 20

# memory profile body for option 3: sequence_dict only, scoring with biopython's GC()
# NOTE(review): earlier cells already ran the same workload in this kernel, so the
# reported increment presumably understates the real footprint - confirm in a fresh kernel

from Bio.SeqUtils import GC

sequence_dict = dict()

# a '>Rosalind' header opens a new record; every other line extends the latest one
with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            sequence_dict[identifier] += line.strip().upper()

max_identifier = ''
max_gc_content = 0

# strictly-greater comparison: the first record wins a tie
for identifier, seq in sequence_dict.items():
    gcContent = GC(seq)
    if gcContent > max_gc_content: 
        max_gc_content = gcContent 
        max_identifier = identifier 

peak memory: 67.14 MiB, increment: 0.01 MiB


In [29]:
# option 4
# no dictionary
# biopython's fasta parser
# biopython's GC() func

In [30]:
%%memit -r 20

# memory profile body for option 4: no dictionary, records are scored while parsing
# NOTE(review): earlier cells already ran the same workload in this kernel, so the
# reported increment presumably understates the real footprint - confirm in a fresh kernel

from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqUtils import GC

max_identifier = ''
max_gc_content = 0

# the parser is consumed as (identifier, sequence) pairs, one per record
with open('gc_test_2.txt', 'r') as file:
    for identifier, sequence in SimpleFastaParser(file):
        gcContent = GC(sequence)
        # strictly-greater comparison: the first record wins a tie
        if gcContent > max_gc_content:
            max_gc_content = gcContent
            max_identifier = identifier

peak memory: 67.18 MiB, increment: 0.01 MiB


In [31]:
# across options 1-3 the speed (~1.33 s) and the peak memory (~67-68 MiB) barely differ;
# option 4 (biopython's parser and GC()) was the slowest at 1.86 s
# will implement the second option,
# which eliminates one of the tracking dictionaries
# and uses our own function gc_calc() and our own file parsing
# since biopython's GC() and SimpleFastaParser() funcs do not provide a faster calculation,
# it is not worth using external functions; otherwise the implementation
# would depend on an extra library for nothing

# 4. Implementation

In [32]:
def gc_calc(sequence):
    
    '''
    input
        a dna sequence string (upper or lower case)
    process
        calculates gc content ratio in given input sequence
    output
        gc content as a percentage (float dtype); 0.0 for an empty sequence
    '''
    
    totalL = len(sequence)
    
    # an empty sequence would otherwise raise ZeroDivisionError
    if totalL == 0:
        return 0.0
    
    # normalise the case so lower-case bases are counted as well
    sequence = sequence.upper()
    countG = sequence.count('G')
    countC = sequence.count('C')
    
    calculation = 100 * (countG + countC) / totalL
    
    return calculation

In [33]:
def gc(filename):
    
    '''
    input
        path of a fasta-formatted file: dna strings with their identifiers
    process
        parse input file
        count gc contents of each dna string
    output
        prints identifier and gc content of dna string which has highest gc content to console
        writes and saves answer in a file named <input path without extension>_answer.txt
    raises
        ValueError if sequence data appears before the first '>' header line
    '''
    
    import os  # local import so this cell does not rely on an import made elsewhere
    
    # initiate dictionary to keep track of identifier:sequence
    sequence_dict = dict()
    identifier = None
    
    # open, read file and populate sequence_dict
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue # blank lines carry no data
            if line.startswith('>'):
                # any fasta header opens a new record, not only '>Rosalind...' ones
                identifier = line[1:].strip()
                sequence_dict[identifier] = ''
            elif identifier is None:
                raise ValueError(f'{filename}: sequence data found before the first header line')
            else:
                sequence_dict[identifier] += line.upper()
    
    # keep the best (identifier, gc content) pair seen so far;
    # starting from None lets a record with 0% gc content still be reported
    best = None
    
    # loop over sequence_dict and track the record with the highest gc content
    for identifier, seq in sequence_dict.items():
        # a header without sequence lines counts as 0% instead of dividing by zero
        gcContent = gc_calc(seq) if seq else 0.0
        if best is None or gcContent > best[1]: # strictly greater: first record wins a tie
            best = (identifier, gcContent)
    
    # an input without any record falls back to the placeholder answer
    max_identifier, max_gc_content = best if best is not None else ('', 0)
    
    # print answer to console
    print('\n\x1B[1mANSWER\x1B[0m\n______\n')
    print(f'{max_identifier}\n{max_gc_content}')
    
    # derive the answer path by removing only the final extension, so paths such as
    # './data/set.v2.txt' keep their directory and the rest of their name
    answer_path = f'{os.path.splitext(filename)[0]}_answer.txt'
    
    # open file and write answer; the context manager closes it even if writing fails
    with open(answer_path, 'w') as answer_file:
        answer_file.write(f'{max_identifier}\n{max_gc_content}')
    
    # relative paths are shown with a leading './', absolute paths are shown as they are
    shown_path = answer_path if os.path.isabs(answer_path) else f'./{answer_path}'
    print('\n\n#! The answer has been written into the file:',
          f'\x1B[1m{shown_path}\x1B[0m\n')

# 5. Execution

In [34]:
# run the solver on the sample dataset used throughout the workspace section
gc('gc_test_1.txt')


[1mANSWER[0m
______

Rosalind_0808
60.91954022988506


#! The answer has been written into the file: [1m./gc_test_1_answer.txt[0m



In [35]:
# run the solver on another input file (presumably a dataset downloaded from Rosalind)
gc('rosalind_gc_9_dataset.txt')


[1mANSWER[0m
______

Rosalind_6265
52.060737527114966


#! The answer has been written into the file: [1m./rosalind_gc_9_dataset_answer.txt[0m



In [36]:
# run the solver on another input file (presumably a dataset downloaded from Rosalind)
gc('rosalind_gc.txt')


[1mANSWER[0m
______

Rosalind_2512
52.090800477897254


#! The answer has been written into the file: [1m./rosalind_gc_answer.txt[0m



<p style='text-align: right;'>
    <!--<b><font size = '5'>Contact</font></b><br>-->
    <b>Orcun Tasar</b><br>
    <i>Bioinformatician / Data Scientist</i><br>
    orcuntasar |at@| ogr.iu.edu.tr<br>
    tasar.orcun |at@| gmail.com<br>
    <a href = 'https://www.linkedin.com/in/orçun-taşar-7b5992a1/'>Linkedin</a> | <a href = 'https://www.instagram.com/shatranuchor/'>Instagram</a>
</p>