# 1. Source

Click on the link to go to the source web page of **Rosalind**: [Computing GC Content](https://rosalind.info/problems/gc/)

**Problem**

![Computing GC Content](gc_problem.png 'Computing GC Content')

**Sample Dataset**

\>Rosalind_6404<br>
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC<br>
TCCCACTAATAATTCTGAGG<br>
\>Rosalind_5959<br>
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT<br>
ATATCCATTTGTCAGCAGACACGC<br>
\>Rosalind_0808<br>
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC<br>
TGGGAACCTGCGGGCAGTAGGTGGAAT<br>

**Sample Output**

Rosalind_0808<br>
60.919540

# 2. Workspace

In [1]:
# load the sample input file into memory

# container mapping each identifier to its full sequence

sequence_dict = {}

# walk the file once: header lines open a new record, every other line extends it

with open('gc_test_1.txt', 'r') as handle:
    for raw_line in handle:
        if not raw_line.startswith('>Rosalind'):
            # sequence line: strip the newline, upper-case it, append to the open record
            sequence_dict[current_id] += raw_line.strip().upper()
            continue
        # header line: drop the leading '>' and start an empty record
        current_id = raw_line[1:].strip()
        sequence_dict[current_id] = ''

In [2]:
# inspect the parsed records: each key is an identifier, each value is its joined sequence

sequence_dict

{'Rosalind_6404': 'CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCCTCCCACTAATAATTCTGAGG',
 'Rosalind_5959': 'CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCTATATCCATTTGTCAGCAGACACGC',
 'Rosalind_0808': 'CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT'}

In [3]:
# look up a single record by its identifier
sequence_dict['Rosalind_0808']

'CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGACTGGGAACCTGCGGGCAGTAGGTGGAAT'

In [4]:
# write a sub-function to calculate gc content

def gc_calc(sequence):
    '''
    input
        a dna sequence string ('G' and 'C' are counted case-sensitively,
        so the sequence is expected to be upper-case)
    process
        calculates the percentage of 'G' and 'C' characters in the sequence
    output
        gc content as float percentage (0-100)
        returns 0.0 for an empty sequence instead of raising ZeroDivisionError
    '''
    totalL = len(sequence)
    if totalL == 0:
        # nothing to count; avoid dividing by zero
        return 0.0
    countG = sequence.count('G')
    countC = sequence.count('C')
    calculation = 100 * (countG + countC) / totalL
    return calculation

In [5]:
# test out this function: 10 of the 20 bases below are G or C

gc_calc('AAAAACCCCCTTTTTGGGGG')

50.0

In [6]:
# with the sequence dictionary in hand, score every sequence with gc_calc
# and collect the results in a second dictionary keyed by the same identifiers

gc_content_dict = {
    record_id: gc_calc(dna)
    for record_id, dna in sequence_dict.items()
}

# show what ended up in gc_content_dict

print(gc_content_dict)

{'Rosalind_6404': 53.75, 'Rosalind_5959': 53.57142857142857, 'Rosalind_0808': 60.91954022988506}


In [7]:
# there is no need to report the whole set:
# rosalind only asks for the maximum gc content together with its identifier

# order the (identifier, gc content) pairs by gc content, lowest first

sorted(gc_content_dict.items(), key = lambda pair: pair[1])

[('Rosalind_5959', 53.57142857142857),
 ('Rosalind_6404', 53.75),
 ('Rosalind_0808', 60.91954022988506)]

In [8]:
# ascending order puts the highest gc content at the very end, so take the last pair

ascending_pairs = sorted(gc_content_dict.items(), key = lambda pair: pair[1])
ascending_pairs[-1]

('Rosalind_0808', 60.91954022988506)

In [9]:
# or: sort from highest to lowest and take the first pair

descending_pairs = sorted(gc_content_dict.items(), key = lambda pair: pair[1], reverse = True)
descending_pairs[0]

('Rosalind_0808', 60.91954022988506)

In [10]:
# however, what if someone gives billions of sequences as input?
# will we keep all of the data in gc_content_dict?
# on the other hand, sorting billions of items at the end can take some time
# rosalind only asks for the highest one anyway

# while looping over the sequence dict
# the highest gc content value and its identifier can be kept in two variables

answer_identifier = ''
answer_gc_content = 0

for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    if gcContent > answer_gc_content: # compare the new gc content with the best one so far
        answer_gc_content = gcContent # if the new one is greater, change the value that we keep
        answer_identifier = identifier # the same for the identifier
    # no else branch is needed: when the new value is not greater, nothing changes

# look into the final status of the answer_variables

print(answer_identifier)
print(answer_gc_content)

Rosalind_0808
60.91954022988506


In [11]:
# the gc content can also be calculated using biopython
# instead of my gc_calc function
# NOTE(review): newer Biopython releases appear to replace GC() with gc_fraction()
# - confirm against the installed Biopython version

from Bio.SeqUtils import GC
GC('AAAATTTTGGGGCCCC')

50.0

In [12]:
# repeat the single-pass search from above, scoring with biopython's GC() instead of gc_calc()

from Bio.SeqUtils import GC

answer_identifier = ''
answer_gc_content = 0

for record_id, dna in sequence_dict.items():
    gc_percent = GC(dna)
    if gc_percent <= answer_gc_content:
        # not better than the best record seen so far
        continue
    answer_gc_content = gc_percent
    answer_identifier = record_id

# show the winning record

print(answer_identifier)
print(answer_gc_content)

Rosalind_0808
60.91954022988506


In [13]:
# in the same spirit we can even skip creating the first dictionary: sequence_dict
# we can do all calculations while reading the file
# NOTE: in this version a record is only scored when the NEXT header line is met,
# so the last record in the file is never compared

answer_identifier = ''
answer_gc_content = 0

# the record currently being read
temporary_identifier = ''
temporary_sequence = ''

with open('gc_test_1.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            
            # a new header closes the previous record: score it and keep it if it is the best so far
            gcContent = GC(temporary_sequence)
            if gcContent > answer_gc_content:
                answer_gc_content = gcContent 
                answer_identifier = temporary_identifier
                
            # start collecting the record that this header opens
            temporary_identifier = line[1:].strip()
            temporary_sequence = ''
        else:
            # sequence line: append it to the record being read
            temporary_sequence += line.strip().upper()
        
# look into the final status of the answer_variables

print(answer_identifier)
print(answer_gc_content)

Rosalind_6404
53.75


In [14]:
# the previous answer is wrong because the last sequence in the file was never compared
# one extra comparison after the loop takes care of the last record

answer_identifier = ''
answer_gc_content = 0

current_id = ''
current_seq = ''

with open('gc_test_1.txt', 'r') as handle:
    for raw_line in handle:
        if not raw_line.startswith('>Rosalind'):
            # sequence line: extend the record being read
            current_seq += raw_line.strip().upper()
            continue

        # header line: the previous record is complete, so score it now
        gc_percent = GC(current_seq)
        if gc_percent > answer_gc_content:
            answer_gc_content = gc_percent
            answer_identifier = current_id

        # then start the record that this header opens
        current_id = raw_line[1:].strip()
        current_seq = ''

# the additional comparison for the last record

gc_percent = GC(current_seq)
if gc_percent > answer_gc_content:
    answer_gc_content = gc_percent
    answer_identifier = current_id

# look into the final status of the answer_variables

print(answer_identifier)
print(answer_gc_content)

Rosalind_0808
60.91954022988506


In [15]:
# there might be some other options out there, but
# let's run speed and memory tests on what I studied here

### --A Simple Speed Test

In [16]:
# for testing purposes use a larger dataset: gc_test_2.txt

# count the '>' characters line by line: the context manager closes the file
# even if an error occurs, and the whole file never has to sit in memory at once
with open('gc_test_2.txt', 'r') as file:
    sequence_count = sum(line.count('>') for line in file)

print(f'There are {sequence_count} sequences in the file.')

There are 195840 sequences in the file.


In [17]:
# option 1
# sequence_dict + gc_content_dict
# our own gc_calc func
# sort at the end

In [18]:
%%timeit -n 20

# option 1 benchmark: parse the file into sequence_dict, score every record
# into gc_content_dict with gc_calc, then sort all scores and take the top one

sequence_dict = dict()

with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            # header line: start a new empty record
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            # sequence line: append to the current record
            sequence_dict[identifier] += line.strip().upper()
            
def gc_calc(sequence):
    # percentage of 'G' and 'C' characters in the sequence
    countG = sequence.count('G')
    countC = sequence.count('C')
    totalL = len(sequence)
    calculation = 100 * (countG + countC) / totalL
    return calculation

gc_content_dict = dict()

for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    gc_content_dict[identifier] = gcContent
    
# descending sort by gc content; the first pair is the answer
sorted(gc_content_dict.items(), key = lambda x: x[1], reverse = True)[0]

1.36 s ± 10 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [19]:
# option 2
# only sequence dict
# our own gc_calc func

In [20]:
%%timeit -n 20

# option 2 benchmark: parse the file into sequence_dict, then keep only the
# best (identifier, gc content) pair while scoring with gc_calc

sequence_dict = dict()

with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            # header line: start a new empty record
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            # sequence line: append to the current record
            sequence_dict[identifier] += line.strip().upper()
            
def gc_calc(sequence):
    # percentage of 'G' and 'C' characters in the sequence
    countG = sequence.count('G')
    countC = sequence.count('C')
    totalL = len(sequence)
    calculation = 100 * (countG + countC) / totalL
    return calculation

answer_identifier = ''
answer_gc_content = 0

for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    # keep the record only when it beats the best one so far
    if gcContent > answer_gc_content:
        answer_gc_content = gcContent 
        answer_identifier = identifier

1.35 s ± 5.84 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [21]:
# option 3
# only sequence dict
# biopython's GC() func

In [22]:
%%timeit -n 20

# option 3 benchmark: same as option 2, but scoring with biopython's GC()

from Bio.SeqUtils import GC

sequence_dict = dict()

with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            # header line: start a new empty record
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            # sequence line: append to the current record
            sequence_dict[identifier] += line.strip().upper()

answer_identifier = ''
answer_gc_content = 0

for identifier, seq in sequence_dict.items():
    gcContent = GC(seq)
    # keep the record only when it beats the best one so far
    if gcContent > answer_gc_content: 
        answer_gc_content = gcContent 
        answer_identifier = identifier 

1.35 s ± 5.71 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [23]:
# option 4
# no dictionary
# biopython's GC() func

In [24]:
%%timeit -n 20

from Bio.SeqUtils import GC

answer_identifier = ''
answer_gc_content = 0

temporary_identifier = ''
temporary_sequence = ''

with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            
            gcContent = GC(temporary_sequence) / 100
            if gcContent > answer_gc_content:
                answer_gc_content = gcContent 
                answer_identifier = temporary_identifier
                
            temporary_identifier = line[1:].strip()
            temporary_sequence = ''
        else:
            temporary_sequence += line.strip().upper()

gcContent = GC(temporary_sequence) / 100
if gcContent >= answer_gc_content:
    answer_gc_content = gcContent 
    answer_identifier = temporary_identifier

2.14 s ± 5.86 ms per loop (mean ± std. dev. of 7 runs, 20 loops each)


### --A Simple Memory Usage Test

In [25]:
# based on runtime, options 1-3 are practically tied (option 4 is somewhat slower)
# we can measure the memory usage to help select the best

%load_ext memory_profiler

In [26]:
# option 1
# sequence_dict + gc_content_dict
# our own gc_calc func
# sort at the end

In [27]:
%%memit -r 20

# option 1 memory test: parse the file into sequence_dict, score every record
# into gc_content_dict with gc_calc, then sort all scores and take the top one

sequence_dict = dict()

with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            # header line: start a new empty record
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            # sequence line: append to the current record
            sequence_dict[identifier] += line.strip().upper()
            
def gc_calc(sequence):
    # percentage of 'G' and 'C' characters in the sequence
    countG = sequence.count('G')
    countC = sequence.count('C')
    totalL = len(sequence)
    calculation = 100 * (countG + countC) / totalL
    return calculation

gc_content_dict = dict()

for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    gc_content_dict[identifier] = gcContent
    
# descending sort by gc content; the first pair is the answer
sorted(gc_content_dict.items(), key = lambda x: x[1], reverse = True)[0]

peak memory: 56.36 MiB, increment: 0.16 MiB


In [28]:
# option 2
# only sequence dict
# our own gc_calc func

In [29]:
%%memit -r 20

# option 2 memory test: parse the file into sequence_dict, then keep only the
# best (identifier, gc content) pair while scoring with gc_calc

sequence_dict = dict()

with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            # header line: start a new empty record
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            # sequence line: append to the current record
            sequence_dict[identifier] += line.strip().upper()
            
def gc_calc(sequence):
    # percentage of 'G' and 'C' characters in the sequence
    countG = sequence.count('G')
    countC = sequence.count('C')
    totalL = len(sequence)
    calculation = 100 * (countG + countC) / totalL
    return calculation

answer_identifier = ''
answer_gc_content = 0

for identifier, seq in sequence_dict.items():
    gcContent = gc_calc(seq)
    # keep the record only when it beats the best one so far
    if gcContent > answer_gc_content:
        answer_gc_content = gcContent 
        answer_identifier = identifier

peak memory: 56.61 MiB, increment: 0.02 MiB


In [30]:
# option 3
# only sequence dict
# biopython's GC() func

In [31]:
%%memit -r 20

# option 3 memory test: same as option 2, but scoring with biopython's GC()

from Bio.SeqUtils import GC

sequence_dict = dict()

with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            # header line: start a new empty record
            identifier = line[1:].strip()
            sequence_dict[identifier] = ''
        else:
            # sequence line: append to the current record
            sequence_dict[identifier] += line.strip().upper()

answer_identifier = ''
answer_gc_content = 0

for identifier, seq in sequence_dict.items():
    gcContent = GC(seq)
    # keep the record only when it beats the best one so far
    if gcContent > answer_gc_content: 
        answer_gc_content = gcContent 
        answer_identifier = identifier 

peak memory: 55.46 MiB, increment: 0.02 MiB


In [32]:
# option 4
# no dictionary
# biopython's GC() func

In [33]:
%%memit -r 20

from Bio.SeqUtils import GC

answer_identifier = ''
answer_gc_content = 0

temporary_identifier = ''
temporary_sequence = ''

with open('gc_test_2.txt', 'r') as file:
    for line in file:
        if line.startswith('>Rosalind'):
            
            gcContent = GC(temporary_sequence) / 100
            if gcContent > answer_gc_content:
                answer_gc_content = gcContent 
                answer_identifier = temporary_identifier
                
            temporary_identifier = line[1:].strip()
            temporary_sequence = ''
        else:
            temporary_sequence += line.strip().upper()

gcContent = GC(temporary_sequence) / 100
if gcContent >= answer_gc_content:
    answer_gc_content = gcContent 
    answer_identifier = temporary_identifier

peak memory: 55.24 MiB, increment: 0.02 MiB


In [34]:
# options 1-3 run at practically the same speed (~1.35 s) while option 4 is slower (2.14 s),
# and the peak memory usage is nearly identical across all options (~55-57 MiB)
# implement the second option,
# which eliminates one of the tracking dictionaries
# and also uses our own function: gc_calc()
# since biopython's GC() function does not provide a faster calculation,
# it is not worth using a function from outside; otherwise the implementation
# would depend on an extra library for nothing

# 4. Implementation

In [35]:
def gc_calc(sequence):
    
    '''
    input
        a dna sequence string
        ('G' and 'C' are counted case-sensitively, so upper-case input is expected)
    process
        calculates gc content of the given input sequence as a percentage
    output
        gc content as float dtype (0-100)
        0.0 for an empty sequence instead of raising ZeroDivisionError
    '''
    
    totalL = len(sequence)
    
    # an empty sequence has no bases to count; avoid dividing by zero
    if totalL == 0:
        return 0.0
    
    countG = sequence.count('G')
    countC = sequence.count('C')
    
    calculation = 100 * (countG + countC) / totalL
    
    return calculation

In [36]:
def gc(filename):
    
    '''
    input
        filename: path of a text file that contains dna strings with their identifiers
        (header lines start with '>Rosalind', the following lines hold the sequence)
    process
        parses the input file into identifier:sequence pairs
        calculates gc content of each non-empty dna string with gc_calc()
        keeps the record with the highest gc content (the first one wins on ties)
    output
        prints identifier and gc content of dna string which has highest gc content to console
        writes and saves answer in '<filename without its extension>_answer.txt'
    raises
        ValueError if sequence data appears before the first header line
    '''
    
    # local import keeps this cell runnable on its own
    import os
    
    # initiate dictionary to keep track of identifier:sequence
    sequence_dict = dict()
    identifier = None
    
    # open, read file and populate sequence_dict
    with open(filename, 'r') as file:
        for line in file:
            if line.startswith('>Rosalind'):
                identifier = line[1:].strip()
                sequence_dict[identifier] = ''
                continue
            fragment = line.strip().upper()
            if not fragment:
                # blank lines carry no sequence data
                continue
            if identifier is None:
                # fail with a clear message instead of an UnboundLocalError
                raise ValueError(f'{filename}: sequence data found before the first header line')
            sequence_dict[identifier] += fragment
    
    # initiate answer_variables to keep highest gc content info at any time
    answer_identifier = ''
    answer_gc_content = 0
    answer_found = False

    # loop over sequence_dict and define values for answer_variables
    for identifier, seq in sequence_dict.items():
        if not seq:
            # a header without any sequence has no gc content to compare
            continue
        gcContent = gc_calc(seq)
        # the first scored record is always accepted, so a dataset whose
        # sequences all have 0 gc content still reports an identifier
        if not answer_found or gcContent > answer_gc_content:
            answer_gc_content = gcContent
            answer_identifier = identifier
            answer_found = True
    
    # print answer to console
    print('\n\x1B[1mANSWER\x1B[0m\n______\n')
    print(f'{answer_identifier}\n{answer_gc_content}')
    
    # build the answer path from the name without its extension;
    # splitext also works for paths such as './data/input.txt'
    answer_path = f'{os.path.splitext(filename)[0]}_answer.txt'
    
    # open file and write answer (the context manager always closes the file)
    with open(answer_path, 'w') as answer_file:
        answer_file.write(f'{answer_identifier}\n{answer_gc_content}')
    print('\n\n#! The answer has been written into the file:',
          f'\x1B[1m./{answer_path}\x1B[0m\n')

# 5. Execution

In [37]:
# run the implementation on the file gc_test_1.txt
gc('gc_test_1.txt')


[1mANSWER[0m
______

Rosalind_0808
60.91954022988506


#! The answer has been written into the file: [1m./gc_test_1_answer.txt[0m



In [38]:
# run the implementation on the file rosalind_gc_9_dataset.txt
gc('rosalind_gc_9_dataset.txt')


[1mANSWER[0m
______

Rosalind_6265
52.060737527114966


#! The answer has been written into the file: [1m./rosalind_gc_9_dataset_answer.txt[0m



In [39]:
# run the implementation on the file rosalind_gc.txt
gc('rosalind_gc.txt')


[1mANSWER[0m
______

Rosalind_2512
52.090800477897254


#! The answer has been written into the file: [1m./rosalind_gc_answer.txt[0m



<p style='text-align: right;'>
    <!--<b><font size = '5'>Contact</font></b><br>-->
    <b>Orcun Tasar</b><br>
    <i>Bioinformatician / Data Scientist</i><br>
    orcuntasar |at@| ogr.iu.edu.tr<br>
    tasar.orcun |at@| gmail.com<br>
    <a href = 'https://www.linkedin.com/in/orçun-taşar-7b5992a1/'>Linkedin</a> | <a href = 'https://www.instagram.com/shatranuchor/'>Instagram</a>
</p>