In [6]:
!git commit -m "Initial commit: Created notebook"

[main (root-commit) 71a6479] Initial commit: Created notebook
 1 file changed, 166 insertions(+)
 create mode 100644 assignment_1.ipynb


In [8]:
class DNA:
    """
    A DNA sequence made only of the bases A, T, G and C.

    The sequence is stored upper-cased in ``self.sequence`` and is validated
    as soon as the object is created.
    """

    def __init__(self, sequence):
        """
        Store the upper-cased sequence and validate it.

        sequence - string of bases; lower-case letters are accepted
        Raises ValueError (from validate_sequence) if any other character
        is present.
        """
        # Normalise case so "atgc" and "ATGC" are treated the same.
        self.sequence = sequence.upper()
        # Bases that validate_sequence accepts.
        self.valid_nucleotides = ['A', 'T', 'G', 'C']
        self.validate_sequence()

    def validate_sequence(self):
        """
        Walk through the sequence and stop at the first invalid base.

        On an invalid base two explanatory lines are printed and a
        ValueError is raised; otherwise nothing is returned.
        """
        allowed = self.valid_nucleotides
        for nucleotide in self.sequence:
            if nucleotide in allowed:
                continue
            # First offender found: report it and abort.
            print("Error: Invalid nucleotide found:", nucleotide)
            print("DNA sequence must only have A, T, G, or C.")
            raise ValueError("Invalid DNA sequence!")

    def get_length(self):
        """
        Return the number of bases in the sequence.
        """
        return len(self.sequence)

    def count_nucleotides(self):
        """
        Count how often each of A, T, G and C occurs.

        Returns a dict with the keys 'A', 'T', 'G', 'C' (in that order).
        """
        tally = dict.fromkeys(('A', 'T', 'G', 'C'), 0)
        for nucleotide in self.sequence:
            if nucleotide in tally:
                tally[nucleotide] += 1
        return tally

    def get_statistics(self):
        """
        Summarise the sequence.

        Returns a dict with 'Length', 'GC_Content(%)' (share of G and C in
        percent, rounded to 2 decimals, 0 for an empty sequence) and
        'Counts' (the result of count_nucleotides).
        """
        total = self.get_length()
        tally = self.count_nucleotides()

        # Guard against dividing by zero for an empty sequence.
        gc_percent = 0 if total == 0 else ((tally['G'] + tally['C']) / total) * 100

        return {
            'Length': total,
            'GC_Content(%)': round(gc_percent, 2),
            'Counts': tally,
        }

    def __str__(self):
        """
        Return a one-line description with the sequence and its length.
        """
        return f"DNA Sequence: {self.sequence} (Length: {self.get_length()})"


In [10]:
# Testing the DNA class (basic style)
# All three tests share one try block; the ValueError raised by DNA() for the
# invalid sequence in Test 3 is handled by the except clause at the bottom.

try:
    # Test 1: call every public method on a valid upper-case sequence.
    print("=== Test 1: Valid sequence ===")
    dna1 = DNA("ATGCGTTA")
    print(dna1)
    print("Length:", dna1.get_length())
    print("Counts:", dna1.count_nucleotides())
    print("Statistics:", dna1.get_statistics())

    # Test 2: lower-case input is upper-cased by the constructor.
    print("\n=== Test 2: Lowercase sequence ===")
    dna2 = DNA("atgccgat")
    print(dna2)
    print("Statistics:", dna2.get_statistics())

    # Test 3: 'B' is not one of A, T, G, C, so construction fails.
    print("\n=== Test 3: Invalid character ===")
    dna3 = DNA("ATGBXTA")  # should trigger ValueError
except ValueError as e:
    # Show the exception message instead of a traceback.
    print("Caught error:", e)

=== Test 1: Valid sequence ===
DNA Sequence: ATGCGTTA (Length: 8)
Length: 8
Counts: {'A': 2, 'T': 3, 'G': 2, 'C': 1}
Statistics: {'Length': 8, 'GC_Content(%)': 37.5, 'Counts': {'A': 2, 'T': 3, 'G': 2, 'C': 1}}

=== Test 2: Lowercase sequence ===
DNA Sequence: ATGCCGAT (Length: 8)
Statistics: {'Length': 8, 'GC_Content(%)': 50.0, 'Counts': {'A': 2, 'T': 2, 'G': 2, 'C': 2}}

=== Test 3: Invalid character ===
Error: Invalid nucleotide found: B
DNA sequence must only have A, T, G, or C.
Caught error: Invalid DNA sequence!


In [14]:
!git commit -am "Completed part A question 1"

[main a289dc1] Completed part A question 1
 1 file changed, 89 insertions(+), 72 deletions(-)




In [52]:

def count_individual_nucleotides(dna_obj):
    """
    Return the nucleotide counts of a DNA object.

    dna_obj - object providing a count_nucleotides() method
    The dictionary produced by that method is handed back unchanged.
    """
    return dna_obj.count_nucleotides()


def calculate_nucleotide_frequencies(dna_obj):
    """
    Calculate the percentage of the sequence made up by each nucleotide.

    dna_obj - object providing count_nucleotides() and get_length()
    Returns a dict mapping each base to its share of the sequence in
    percent, rounded to 2 decimals. For an empty sequence all four bases
    map to 0.
    """
    tally = dna_obj.count_nucleotides()
    total = dna_obj.get_length()

    # An empty sequence would otherwise divide by zero.
    if total == 0:
        return {"A": 0, "T": 0, "G": 0, "C": 0}

    return {
        nucleotide: round((occurrences / total) * 100, 2)
        for nucleotide, occurrences in tally.items()
    }


def generate_analysis_report(dna_obj):
    """
    Print a report for one DNA object.

    The report shows the sequence, its length, the count of each
    nucleotide and the frequency of each nucleotide in percent, followed
    by a separator line. Nothing is returned.
    """
    print("=== DNA Nucleotide Analysis Report ===")
    print("Sequence:", dna_obj.sequence)
    print("Length:", dna_obj.get_length())

    # Both tables are computed before either one is printed.
    sections = (
        ("\nNucleotide Counts:", count_individual_nucleotides(dna_obj)),
        ("\nNucleotide Frequencies (%):", calculate_nucleotide_frequencies(dna_obj)),
    )
    for heading, table in sections:
        print(heading)
        for nucleotide, value in table.items():
            print(nucleotide, ":", value)

    print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")


def compare_two_sequences(dna1, dna2):
    """
    Print a base-by-base comparison of two DNA objects.

    For each of A, T, G and C the output states which sequence contains
    more of that base, or that both contain the same number. Nothing is
    returned.
    """
    first = dna1.count_nucleotides()
    second = dna2.count_nucleotides()

    print("=== Comparison of Two DNA Sequences ===")
    print("Seq1:", dna1.sequence)
    print("Seq2:", dna2.sequence)
    print("*******************************************")

    for nucleotide in ('A', 'T', 'G', 'C'):
        in_first, in_second = first[nucleotide], second[nucleotide]
        if in_first > in_second:
            print(f"Seq1 has more {nucleotide} ({in_first} vs {in_second})")
        elif in_first < in_second:
            print(f"Seq2 has more {nucleotide} ({in_second} vs {in_first})")
        else:
            print(f"Both have equal {nucleotide} ({in_first})")
    print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")


In [54]:
# Demo 1: print the full analysis report for a single sequence.
print("Single sequence analysis")
dna1 = DNA("ATGCGTTA")
generate_analysis_report(dna1)

# Demo 2: compare the nucleotide counts of dna1 against a second sequence.
print("Compare two sequences")
dna2 = DNA("AATTGGCC")
compare_two_sequences(dna1, dna2)

Single sequence analysis
=== DNA Nucleotide Analysis Report ===
Sequence: ATGCGTTA
Length: 8

Nucleotide Counts:
A : 2
T : 3
G : 2
C : 1

Nucleotide Frequencies (%):
A : 25.0
T : 37.5
G : 25.0
C : 12.5
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
Compare two sequences
=== Comparison of Two DNA Sequences ===
Seq1: ATGCGTTA
Seq2: AATTGGCC
*******************************************
Both have equal A (2)
Seq1 has more T (3 vs 2)
Both have equal G (2)
Seq2 has more C (2 vs 1)
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx


In [50]:
!git commit -am "Completed part A question 2"

[main 41cc2bc] Completed part A question 2
 1 file changed, 160 insertions(+), 2 deletions(-)




In [58]:

def convert_to_upper(sequence):
    """Return the sequence with every character converted to uppercase."""
    return "".join(character.upper() for character in sequence)


def remove_non_nucleotides(sequence):
    """Remove any character that is not A, T, G, or C (case-insensitive).

    Each character is upper-cased before it is checked, so lower-case
    bases are kept and the returned string is always upper case.

    sequence - string (or iterable of characters) to clean
    Returns the remaining bases joined into one upper-case string; an
    empty input gives an empty string.
    """
    # Only upper-case entries are needed: every character is upper-cased
    # before the membership test, so lower-case entries could never match.
    valid = ('A', 'T', 'G', 'C')
    return "".join(
        base
        for base in (character.upper() for character in sequence)
        if base in valid
    )


def split_into_codons(sequence):
    """Cut the sequence into consecutive 3-base pieces (codons).

    The last piece is shorter than 3 when the length of the sequence is
    not a multiple of 3. An empty sequence gives an empty list.
    """
    return [sequence[start:start + 3] for start in range(0, len(sequence), 3)]


def merge_fragments(fragments):
    """Concatenate the DNA fragments, in order, into a single sequence."""
    return "".join(fragments)


In [60]:
# Messy input: mixed case plus a space, a symbol and digits.
raw_seq = "atGc gT@aC12"
print("Original sequence:", raw_seq)
# Convert to upper case.
upper_seq = convert_to_upper(raw_seq)
print("Uppercase:", upper_seq)
# Remove everything that is not a nucleotide.
# remove_non_nucleotides upper-cases each character itself, so the cleaned
# result is the same whether or not convert_to_upper has been applied first.
clean_seq = remove_non_nucleotides(upper_seq)
print("After removing invalid:", clean_seq)
# Split into codons; the last one is shorter than 3 bases here because the
# cleaned sequence length is not a multiple of 3.
codons = split_into_codons(clean_seq)
print("Codons:", codons)
# Merge fragments into one sequence.
merged = merge_fragments(["ATGCG", "TAC", "GGA"])
print("Merged fragments:", merged)


Original sequence: atGc gT@aC12
Uppercase: ATGC GT@AC12
After removing invalid: ATGCGTAC
Codons: ['ATG', 'CGT', 'AC']
Merged fragments: ATGCGTACGGA


In [62]:
!git commit -am "Completed part A question 3"

[main a1376db] Completed part A question 3
 1 file changed, 89 insertions(+), 10 deletions(-)




In [66]:
class TranscriptionSystem:
    """
    Transcribe a list of DNA sequences into RNA.

    The strand type decides the rule: "coding" replaces T with U,
    "template" maps every base to its RNA complement.
    """

    def __init__(self, sequences, strand_type="coding"):
        """
        Store the DNA sequences and the strand type.

        sequences   - list of DNA sequence strings
        strand_type - "coding" or "template"; stored lower-cased
        """
        self.sequences = sequences
        self.strand_type = strand_type.lower()
        self.valid_bases = ['A', 'T', 'G', 'C']

    def validate_sequence(self, seq):
        """Return True when every base of seq (ignoring case) is A, T, G or C."""
        return all(base.upper() in self.valid_bases for base in seq)

    def transcribe_one(self, seq):
        """
        Transcribe one sequence according to the strand type.

        Returns the RNA string, "Invalid sequence" when seq contains a
        character other than A, T, G, C, or an explanatory message when
        the strand type is neither "coding" nor "template". The sequence
        is checked before the strand type.
        """
        dna = seq.upper()

        if not self.validate_sequence(dna):
            return "Invalid sequence"

        # Coding strand: the RNA equals the DNA with T replaced by U.
        if self.strand_type == "coding":
            return dna.replace('T', 'U')

        # Template strand: pair each base with its RNA complement,
        # position by position (the order of the bases is kept).
        if self.strand_type == "template":
            pairing = {'A': 'U', 'T': 'A', 'G': 'C', 'C': 'G'}
            return "".join(pairing[base] for base in dna)

        return "Invalid strand type! Use 'coding' or 'template'."

    def batch_transcribe(self):
        """Transcribe every stored sequence and return the results as a list."""
        return [self.transcribe_one(seq) for seq in self.sequences]

In [68]:
# The third sequence contains 'X', which is not a valid base, so its entry
# in the result is the string "Invalid sequence".
dna_samples = ["ATGCGTAC", "TACGGTTA", "ATGCXGT"] 
# Strand type: "template" or "coding" (TranscriptionSystem lower-cases it).
strand_choice = "template"

# Transcribe every sample with the chosen strand rule.
transcriber = TranscriptionSystem(dna_samples, strand_choice)
rna_output = transcriber.batch_transcribe()

print("Strand type:", strand_choice)
print("Transcribed RNA sequences:", rna_output)

Strand type: template
Transcribed RNA sequences: ['UACGCAUG', 'AUGCCAAU', 'Invalid sequence']


In [70]:
!git commit -am "Completed part B question 1"

[main 33df0b2] Completed part B question 1
 1 file changed, 144 insertions(+), 5 deletions(-)


