In [1]:
from Bio import SeqIO
from collections import Counter

def find_repeats(sequence, repeat_length):
    repeats = [sequence[i:i + repeat_length] for i in range(len(sequence) - repeat_length + 1)]
    return repeats

def find_orfs(sequence):
    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]

    orfs = []
    start_positions = [i for i in range(0, len(sequence), 3)]

    for start in start_positions:
        codons = [sequence[i:i + 3] for i in range(start, len(sequence), 3)]
        stop_index = next((i for i, codon in enumerate(codons) if codon in stop_codons), None)

        if stop_index is not None:
            orfs.append(sequence[start:start + 3 * (stop_index + 1)])

    return orfs

def main():
    # Load FASTA file
    fasta_file = "C:/Users/Administrator/Downloads/dna2.fasta"
    records = list(SeqIO.parse(fasta_file, "fasta"))

    # Question 1: Number of records in the file
    num_records = len(records)
    print(f"1. Number of records in the file: {num_records}")

    # Question 2: Lengths of sequences, longest, and shortest sequences
    lengths = [len(record.seq) for record in records]
    max_length = max(lengths)
    min_length = min(lengths)
    longest_sequences = [record.id for record in records if len(record.seq) == max_length]
    shortest_sequences = [record.id for record in records if len(record.seq) == min_length]

    print(f"2. Sequence lengths:")
    print(f"   Longest sequence(s) with length {max_length}: {', '.join(longest_sequences)}")
    print(f"   Shortest sequence(s) with length {min_length}: {', '.join(shortest_sequences)}")

    # Question 3: Identify all ORFs
    max_orf_length = 0
    max_orf_sequence_id = ""
    max_orf_start_position = 0

    for record in records:
        sequence_id = record.id
        sequence = str(record.seq)
        orfs = find_orfs(sequence)

        for orf in orfs:
            orf_length = len(orf)

            if orf_length > max_orf_length:
                max_orf_length = orf_length
                max_orf_sequence_id = sequence_id
                max_orf_start_position = sequence.find(orf) + 1

    print(f"\n3. Longest ORF in the file:")
    print(f"   Length: {max_orf_length}")
    print(f"   Sequence ID: {max_orf_sequence_id}")
    print(f"   Start position: {max_orf_start_position}")

    # Question 5: Most frequently occurring repeat of length 6
    repeats_length_6 = [repeat for record in records for repeat in find_repeats(str(record.seq), 6)]
    count_repeats_length_6 = Counter(tuple(repeat) for repeat in repeats_length_6)
    most_common_repeat_6, max_occurrences_repeat_6 = count_repeats_length_6.most_common(1)[0] if count_repeats_length_6 else ("N/A", 0)

    print("\n5. Most frequently occurring repeat of length 6:")
    print(f"   Sequence: {most_common_repeat_6}")
    print(f"   Occurrences: {max_occurrences_repeat_6}")

    # Question 6: Repeats of length 12
    repeats_length_12 = [repeat for record in records for repeat in find_repeats(str(record.seq), 12)]
    count_repeats_length_12 = Counter(tuple(repeat) for repeat in repeats_length_12)
    max_occurrences_repeat_12 = max(count_repeats_length_12.values()) if count_repeats_length_12 else 0
    most_common_repeats_12 = [repeat for repeat, count in count_repeats_length_12.items() if count == max_occurrences_repeat_12]

    print("\n6. Repeats of length 12:")
    print(f"   Number of different 12-base sequences: {len(most_common_repeats_12)}")
    print(f"   Max occurrences of a 12-base sequence: {max_occurrences_repeat_12}")

    # Question 7: Repeats of length 7 with maximum occurrences
    repeats_length_7_candidates = ["GCGCGCA", "CATCGCC", "CGCGCCG", "TGCGCGC"]
    repeats_length_7_counts = Counter(tuple(repeat) for record in records for repeat in find_repeats(str(record.seq), 7))

    max_occurrences_repeat_7 = 0
    most_common_repeat_7 = "N/A"
    for repeat in repeats_length_7_candidates:
        occurrences = repeats_length_7_counts[tuple(repeat)]
        if occurrences > max_occurrences_repeat_7:
            max_occurrences_repeat_7 = occurrences
            most_common_repeat_7 = repeat

    print("\n7. Repeat of length 7 with maximum occurrences:")
    print(f"   Sequence: {most_common_repeat_7}")
    print(f"   Occurrences: {max_occurrences_repeat_7}")

if __name__ == "__main__":
    main()


1. Number of records in the file: 18
2. Sequence lengths:
   Longest sequence(s) with length 4894: gi|142022655|gb|EQ086233.1|255
   Shortest sequence(s) with length 115: gi|142022655|gb|EQ086233.1|346

3. Longest ORF in the file:
   Length: 2694
   Sequence ID: gi|142022655|gb|EQ086233.1|45
   Start position: 85

5. Most frequently occurring repeat of length 6:
   Sequence: ('G', 'C', 'G', 'C', 'G', 'C')
   Occurrences: 153

6. Repeats of length 12:
   Number of different 12-base sequences: 4
   Max occurrences of a 12-base sequence: 10

7. Repeat of length 7 with maximum occurrences:
   Sequence: CGCGCCG
   Occurrences: 63


In [6]:
from Bio import SeqIO
from collections import Counter

def find_repeats(sequence, repeat_length):
    repeats = [sequence[i:i + repeat_length] for i in range(len(sequence) - repeat_length + 1)]
    return repeats

def find_orfs(sequence, frame):
    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]

    orfs = []
    start_positions = [i for i in range(frame, len(sequence), 3)]

    for start in start_positions:
        codons = [sequence[i:i + 3] for i in range(start, len(sequence), 3)]
        stop_index = next((i for i, codon in enumerate(codons) if codon in stop_codons), None)

        if stop_index is not None:
            orfs.append(sequence[start:start + 3 * (stop_index + 1)])

    return orfs

def main():
    # Load multi-FASTA file
    fasta_file = "C:/Users/Administrator/Downloads/dna2.fasta"
    records = list(SeqIO.parse(fasta_file, "fasta"))

    # Question 1: Number of records in the file
    num_records = len(records)
    print(f"1. Number of records in the file: {num_records}")

    # Question 2: Length of the longest sequence
    max_length = max(len(record.seq) for record in records)
    print(f"2. Length of the longest sequence: {max_length}")

    # Question 3: Length of the shortest sequence
    min_length = min(len(record.seq) for record in records)
    print(f"3. Length of the shortest sequence: {min_length}")

    # Question 4: Length of the longest ORF in reading frame 2
    max_orf_length_frame_2 = max(len(orf) for record in records for orf in find_orfs(str(record.seq), frame=1))
    print(f"4. Length of the longest ORF in reading frame 2: {max_orf_length_frame_2}")

    # Question 5: Starting position of the longest ORF in reading frame 3
    max_orf_frame_3 = max(((orf, start) for record in records for start, orf in enumerate(find_orfs(str(record.seq), frame=2))), key=lambda x: len(x[0]))
    print(f"5. Starting position of the longest ORF in reading frame 3: {max_orf_frame_3[1] + 1}")

    # Question 6: Length of the longest ORF in any forward reading frame
    max_orf_length_forward = max(len(orf) for record in records for frame in range(3) for orf in find_orfs(str(record.seq), frame))
    print(f"6. Length of the longest ORF in any forward reading frame: {max_orf_length_forward}")

    # Question 7: Length of the longest forward ORF in a specific sequence
    sequence_id_to_find = "gi|142022655|gb|EQ086233.1|16"
    max_orf_length_sequence = max(len(orf) for record in records if record.id == sequence_id_to_find for frame in range(3) for orf in find_orfs(str(record.seq), frame))
    print(f"7. Length of the longest forward ORF in the sequence with identifier {sequence_id_to_find}: {max_orf_length_sequence}")

    # Question 8: Most frequently occurring repeat of length 6
    repeats_length_6 = [repeat for record in records for repeat in find_repeats(str(record.seq), 6)]
    count_repeats_length_6 = Counter(repeats_length_6)
    most_common_repeat_6, max_occurrences_repeat_6 = count_repeats_length_6.most_common(1)[0] if count_repeats_length_6 else ("N/A", 0)
    print(f"8. Most frequently occurring repeat of length 6:")
    print(f"   Sequence: {most_common_repeat_6}")
    print(f"   Occurrences: {max_occurrences_repeat_6}")

    # Question 9: Most frequently occurring repeat of length 12
    repeats_length_12 = [repeat for record in records for repeat in find_repeats(str(record.seq), 12)]
    count_repeats_length_12 = Counter(repeats_length_12)
    most_common_repeat_12, max_occurrences_repeat_12 = count_repeats_length_12.most_common(1)[0] if count_repeats_length_12 else ("N/A", 0)
    print(f"9. Most frequently occurring repeat of length 12:")
    print(f"   Sequence: {most_common_repeat_12}")
    print(f"   Occurrences: {max_occurrences_repeat_12}")

    # Question 10: Most frequently occurring repeat of length 7 among candidates
    repeats_length_7_candidates = ["TGCGCGC", "GCGCGCA", "CATCGCC", "CGCGCCG"]
    repeats_length_7_counts = Counter(tuple(repeat) for record in records for repeat in find_repeats(str(record.seq), 7))

    max_occurrences_repeat_7 = 0
    most_common_repeat_7 = "N/A"
    for repeat in repeats_length_7_candidates:
        occurrences = repeats_length_7_counts[tuple(repeat)]
        if occurrences > max_occurrences_repeat_7:
            max_occurrences_repeat_7 = occurrences
            most_common_repeat_7 = repeat

    print("\n10. Repeat of length 7 with maximum occurrences:")
    print(f"    Sequence: {most_common_repeat_7}")
    print(f"    Occurrences: {max_occurrences_repeat_7}")

if __name__ == "__main__":
    main()

1. Number of records in the file: 18
2. Length of the longest sequence: 4894
3. Length of the shortest sequence: 115
4. Length of the longest ORF in reading frame 2: 1578
5. Starting position of the longest ORF in reading frame 3: 841
6. Length of the longest ORF in any forward reading frame: 2694
7. Length of the longest forward ORF in the sequence with identifier gi|142022655|gb|EQ086233.1|16: 1656
8. Most frequently occurring repeat of length 6:
   Sequence: GCGCGC
   Occurrences: 153
9. Most frequently occurring repeat of length 12:
   Sequence: CATTCGCCATTC
   Occurrences: 10

10. Repeat of length 7 with maximum occurrences:
    Sequence: CGCGCCG
    Occurrences: 63
