### Write your own FASTQ summary script 
The script should take as input a fastq formatted text file.
The script should produce as output a text file summarizing the following metrics of the input file. 
For each sequence in the input file list the sequence identifier, 
the total counts and frequencies of each nucleotide, 
the complementary base ratios, and the mean quality score.

In [10]:
# Importing the os module to interact with the operating system
import os

# Printing the current working directory using os.getcwd()
print(os.getcwd())  # this will get the current working directory

/home/user


In [4]:
def calculate_complementary_ratios(sequence):
    # Dictionary to store complementary bases
    complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    # Generating the complementary sequence using list comprehension
    complementary_sequence = ''.join([complement_dict[base] for base in sequence])

    # Returning the complementary sequence
    return complementary_sequence

In [5]:
def calculate_mean_quality(quality_scores):
    # Calculating the total quality by summing up the ASCII values and adjusting for base quality offset
    total_quality = sum([ord(char) - 33 for char in quality_scores])

    # Calculating the mean quality by dividing the total quality by the number of quality scores
    mean_quality = total_quality / len(quality_scores)

    # Returning the calculated mean quality
    return mean_quality

In [6]:
def summarize_fastq(input_filename, output_filename):
    # Initialize counters and dictionaries
    seq_id = ''
    counts = {'A': 0, 'T': 0, 'C': 0, 'G': 0}
    total_sequences = 0
    total_quality = 0

    with open(input_filename, 'r') as input_file:
        with open(output_filename, 'w') as output_file:
            for line in input_file:
                line = line.strip()
                if line.startswith('@'):
                    # This line is a sequence identifier
                    seq_id = line[1:]
                    sequence = input_file.readline().strip()  # Read the sequence line
                    quality_scores = input_file.readline().strip()  # Read the quality scores line

                    total_sequences += 1
                    total_quality += calculate_mean_quality(quality_scores)

                    # Update nucleotide counts
                    for base in sequence:
                        if base in counts:
                            counts[base] += 1

            # Calculate complementary base ratios (based on the last sequence)
            complementary_sequence = calculate_complementary_ratios(sequence)

            # Write the summary to the output file
            output_file.write(f'Sequence Identifier: {seq_id}\n')
            output_file.write(f'Total Counts: {total_sequences}\n')
            output_file.write(f'Nucleotide Frequencies: {counts}\n')
            output_file.write(f'Complementary Base Ratios: {complementary_sequence}\n')
            output_file.write(f'Mean Quality Score: {total_quality / total_sequences:.2f}\n')

In [8]:
# Input and output filenames
input_filename = "SRR2584863.fastq"
output_filename = "FASTQ_summary_results.txt"

# Calling the function to summarize the FASTQ file
summarize_fastq(input_filename, output_filename)

In [9]:
# Opening the file "FASTQ_summary_results.txt" in read mode
result = open("FASTQ_summary_results.txt", 'r')

# Reading and printing the contents of the file
print(result.read())

# Closing the file to free up system resources
result.close()

Sequence Identifier: SRR2584863.225649 HWI-ST957:244:H73TDADXX:1:2105:11885:63531/1
Total Counts: 175000
Nucleotide Frequencies: {'A': 3912435, 'T': 3985752, 'C': 3953291, 'G': 3942437}
Complementary Base Ratios: CATGACTTGTCTACCTAGTCGCCCGACCACAGGACGTGGGCTAATTCGGCACGCTGTGCTCTTAGCGCTGGGACGCGGCAAAGTTATTAGGGCGCGGGCGCGTCGTGCCAGTCGGGAAGTGCGTCACGCCTCGCACTCTTCGGCTCGAAA
Mean Quality Score: 20.31

