<a href="https://colab.research.google.com/github/Shalinid8/Exploring-Biopython/blob/main/Exploring_Biopython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Exploring Biopython**

**BLAST search using the DNA sequence**

In [1]:
!pip install biopython  # Install Biopython

from Bio.Blast import NCBIWWW, NCBIXML

def identify_species(dna_sequence):
    # Perform a BLAST search using the DNA sequence
    result_handle = NCBIWWW.qblast("blastn", "nt", dna_sequence)

    # Parse the BLAST result
    blast_record = NCBIXML.read(result_handle)

    # Extract information about the top hit (subject)
    top_hit = blast_record.descriptions[0]

    # Print information about the top hit
    print("Species:", top_hit.title)
    print("Accession:", top_hit.accession)


dna_sequence = """TGGGCCTCATATTTATCCTATATACCATGTTCGTATGGTGGCGCGATGTTCTACGTGAATCCACGTTCGAAGGACATCATACCAAAGTCGTAC
AATTAGGACCTCGATATGGTTTTATTCTGTTTATCGTATCGGAGGTTATGTTCTTTTTTGCTCTTTTTCGGGCTTCTTCTCATTCTTCTTTGGCAC
CTACGGTAGAG"""
identify_species(dna_sequence)

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83
Species: gi|1783584753|gb|MN651324.1| Nicotiana tabacum strain zhongyan90 cytoplasmic male sterility(CMS) line cultivar MSzhongyan90 mitochondrion, complete genome
Accession: MN651324


**DNA to protein Transaltion**

In [2]:
from Bio.Seq import Seq

# Nucleotide sequence
sequence = "TGGGCCTCATATTTATCCTATATACCATGTTCGTATGGTGGCGCGATGTTCTACGTGAATCCACGTTCGAAGGACATCATACCAAAGTCGTACAATTAGGACCTCGATATGGTTTTATTCTGTTTATCGTATCGGAGGTTATGTTCTTTTTTGCTCTTTTTCGGGCTTCTTCTCATTCTTCTTTGGCACCTACGGTAGAG"

# Create a Biopython Seq object
seq_object = Seq(sequence)

# Protein translation
protein_translation = seq_object.translate()

print("Nucleotide sequence:", sequence)
print("Protein translation:", protein_translation)


Nucleotide sequence: TGGGCCTCATATTTATCCTATATACCATGTTCGTATGGTGGCGCGATGTTCTACGTGAATCCACGTTCGAAGGACATCATACCAAAGTCGTACAATTAGGACCTCGATATGGTTTTATTCTGTTTATCGTATCGGAGGTTATGTTCTTTTTTGCTCTTTTTCGGGCTTCTTCTCATTCTTCTTTGGCACCTACGGTAGAG
Protein translation: WASYLSYIPCSYGGAMFYVNPRSKDIIPKSYN*DLDMVLFCLSYRRLCSFLLFFGLLLILLWHLR*




**Analysis records in Fasta File**

In [3]:
def count_fasta_records(fasta_file):
    with open(fasta_file, 'r') as file:
        record_count = 0
        for line in file:
            if line.startswith('>'):
                record_count += 1
        return record_count

# Replace 'your_fasta_file.fasta' with the actual path to your FASTA file
fasta_file_path = '/content/dna2.fasta'
num_records = count_fasta_records(fasta_file_path)

print(f'The number of records in the FASTA file is: {num_records}')

def find_longest_sequence_length(fasta_file):
    with open(fasta_file, 'r') as file:
        max_length = 0
        current_length = 0

        for line in file:
            if line.startswith('>'):
                # If a header line is encountered, reset the current length
                current_length = 0
            else:
                # Count the length of the sequence
                current_length += len(line.strip())

                # Update the maximum length if the current sequence is longer
                max_length = max(max_length, current_length)

    return max_length


max_sequence_length = find_longest_sequence_length(fasta_file_path)

print(f'The length of the longest sequence in the FASTA file is: {max_sequence_length}')


def find_shortest_sequence_length(fasta_file):
    with open(fasta_file, 'r') as file:
        # Set an initial value for the minimum length
        min_length = float('inf')
        current_length = 0

        for line in file:
            if line.startswith('>'):
                # If a header line is encountered, check and update the minimum length
                min_length = min(min_length, current_length) if current_length > 0 else min_length
                # Reset the current length
                current_length = 0
            else:
                # Count the length of the sequence
                current_length += len(line.strip())

        # Check and update the minimum length for the last sequence in the file
        min_length = min(min_length, current_length) if current_length > 0 else min_length

    return min_length

min_sequence_length = find_shortest_sequence_length(fasta_file_path)

print(f'The length of the shortest sequence in the FASTA file is: {min_sequence_length}')

The number of records in the FASTA file is: 18
The length of the longest sequence in the FASTA file is: 4894
The length of the shortest sequence in the FASTA file is: 115


**Finding ORF in DNA Sequence:**

In [4]:
from Bio import SeqIO

def find_frame_orf(fasta_file,frame_value):
    # Iterate through each sequence in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence = record.seq

        # Find ORF in the second reading frame
        orf_sequence, orf_length = find_orf_in_frame(sequence, frame=frame_value)

        # Print or do something with the identified ORF sequence and length
        print(f"Sequence ID: {record.id}")
        print(f"Second Frame ORF: {orf_sequence}")
        print(f"ORF Length: {orf_length}\n")

def find_orf_in_frame(sequence, frame):
    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]

    orf_sequence = ""
    orf_length = 0

    # Start reading the sequence from the specified frame
    reading_frame = sequence[frame - 1:]

    # Iterate through the sequence in codons
    for i in range(0, len(reading_frame) - 2, 3):
        codon = reading_frame[i:i + 3]

        # Check for start codon
        if codon == start_codon:
            orf_sequence += codon
            orf_length += 3  # Start codon length

            # Continue reading until a stop codon is encountered
            for j in range(i + 3, len(reading_frame) - 2, 3):
                codon = reading_frame[j:j + 3]
                orf_sequence += codon
                orf_length += 3

                if codon in stop_codons:
                    break

    return orf_sequence, orf_length

# Example usage
fasta_file_path = "/content/dna2.fasta"
find_frame_orf(fasta_file_path,2)


Sequence ID: gi|142022655|gb|EQ086233.1|91
Second Frame ORF: ATGCGGTCTTTCGGCTCGAAAGCCAGTTCCAGACCTCCGACGGCGCGCTGAATGCGTCGGCGGGCGGTGCGCTCGACAACCGCGTGTGGGGCGTCCAGGTGAATGCGGTGAATGTAAATGACGGCGGCTTGAATGCGGTCACGCCGGCGTTGCAGCTCGGCGGCGGCTTCCAGTACCAGCAGCGCGGCGGCGACATCGGCTCGGCCAACCAGGTCACGTTGAATGGCGCGCAGGTCGAGGCGGCGCTCGGCGGGGCGGCGTCCGGCTCGACGCAGACCGCGGTCCGGCTCGGGCTGCGGCATCAGTTCTGACGATGCGCGAGAAACACGGGCTGCCGCGTACGCCGCGCGCGAGCCCGTGTTTTTCCGCCGGATTCAGAACCGATGCATCATCCCGACGCGCAACGCCAGCTGGTTGCGGCCCGACGACTGCCCGGCCGTGCCGAGCACGTGCGCGTAGATGTACGTCGAGGTTCGCTTGCTCAGATCGTAGATGTTCTGCTGCGCGCGGCCGCTGAAGACCGTATCGTCGCTCGACAACGCACCGCCCGCGGTCGCGCCGCCGTTGTTGGCCTTCAGGTACGCGACGGCGGCGGAGAACGCGCCCGCCTGGTACTGCACGGCGGCGCTATAGATGCGGTAGATGTCGAGCGTGGGGTCGTACTGGCGGCCGAACGTGAGCGTGCCGAACCGGTCGGAATCGAGGCCGACGTAGATGGCGCTCGCGGCCGATACGGCCGCCATCATCTTCTTCATCGTCGATCTCCAGGTGTGGGCAGCCCACGCGGCGCGGTGCGGTTTCCGACGGCATACGTCAGCACCGGACGCGTGCAGCGAGTCCGTTTGTCGTTAGATGAGCGCGAGCAACGTATAGATGATCGCGAGCGCATACAGCAGGTAAATGGCCTGGTTGTCCATTTGAATGCCGATCGTCGACTTC

**Longest ORF in Fasta File**

In [5]:
from Bio import SeqIO

def find_longest_second_frame_orf(fasta_file):
    # Initialize variables to store the longest ORF information
    longest_orf_sequence = ""
    longest_orf_length = 0
    longest_orf_start_position = 0

    # Iterate through each sequence in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence = record.seq

        # Find the longest ORF in the second reading frame for the current sequence
        current_orf_sequence, current_orf_length, current_orf_start_position = find_longest_orf_in_frame(sequence, frame=2)

        # Check if the current sequence has a longer ORF
        if current_orf_length > longest_orf_length:
            longest_orf_sequence = current_orf_sequence
            longest_orf_length = current_orf_length
            longest_orf_start_position = current_orf_start_position
            long_Rec_id= record.id

    # Print or return the identified longest ORF information
    print(f"Longest Second Frame ORF: {longest_orf_sequence}")
    print(f"Longest ORF Length: {longest_orf_length}")
    print(f"Start Position: {longest_orf_start_position}")
    print(f"Recod ID: {long_Rec_id}")

# Function to find the longest ORF in a given reading frame
def find_longest_orf_in_frame(sequence, frame):
    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]

    current_orf_sequence = ""
    current_orf_length = 0
    current_orf_start_position = 0

    longest_orf_sequence = ""
    longest_orf_length = 0
    longest_orf_start_position = 0

    # Start reading the sequence from the specified frame
    reading_frame = sequence[frame - 1:]

    # Iterate through the sequence in codons
    for i in range(0, len(reading_frame) - 2, 3):
        codon = reading_frame[i:i + 3]

        # Check for start codon
        if codon == start_codon:
            current_orf_sequence = codon
            current_orf_length = 3  # Start codon length
            current_orf_start_position = i + 1  # Start position in 1-based indexing

            # Continue reading until a stop codon is encountered
            for j in range(i + 3, len(reading_frame) - 2, 3):
                codon = reading_frame[j:j + 3]
                current_orf_sequence += codon
                current_orf_length += 3

                if codon in stop_codons:
                    # Check if the current ORF is longer than the longest one
                    if current_orf_length > longest_orf_length:
                        longest_orf_sequence = current_orf_sequence
                        longest_orf_length = current_orf_length
                        longest_orf_start_position = current_orf_start_position
                    break

    return longest_orf_sequence, longest_orf_length, longest_orf_start_position

# Example usage
fasta_file_path = "/content/dna2.fasta"
find_longest_second_frame_orf(fasta_file_path)


Longest Second Frame ORF: ATGGCAATCCTGATTCGTGGCGGCACCGTGGTCGATGCGGACCGTTCCTACCGCGCGGACGTGCTCTGCGCAGCCCCGGAGGACGGCGGCACGATCCTGCAGATCGCCGGGCAGATCGATGCGCCGGCCGGCGCGACCGTCGTCGATGCGCACGACCAGTACGTGATGCCGGGCGGCATCGATCCGCATACGCACATGGAACTGCCGTTCATGGGCACGACCGCGAGCGACGATTTCTACTCGGGTACGGCCGCCGGGCTCGCGGGCGGCACGACGAGCATCATCGACTTCGTGATCCCGAGCCCGAAGCAGCCGCTGATGGACGCGTTCCATGCCTGGCGCGGCTGGGCCGAGAAGGCGGCGGCCGACTACGGCTTCCACGTGGCCGTGACGTGGTGGGACGAGAGTGTGCACCGCGACATGGGCACGCTCGTGCGCGAACACGGCGTGTCGAGCTTCAAGCACTTCATGGCGTACAAGAACGCGATCATGGCCGACGACGAGGTGCTCGTGAACAGCTTCTCGCGTTCGCTCGAACTCGGCGCGTTGCCGACCGTGCATGCGGAGAACGGCGAGCTCGTGTTCCAGTTGCAGAAGGCGCTGCTCGCGCGCGGGATGACGGGGCCGGAGGCGCATCCGCTGTCGCGGCCGCCGGAGGTCGAGGGTGAGGCGGCGAATCGTGCGATCCGCATTGCGCAGGTGCTCGGCGTGCCGGTGTATATCGTGCATGTGTCCGCGAAGGACGCGGTCGATGCGATCACGAAGGCGCGCAGCGAAGGGCTGCGCGTGTTCGGCGAGGTGCTGCCGGGCCATCTGGTGATCGACGAGGCCGTCTATCGCGATCCGGACTGGACACGTGCGGCCGCGCACGTGATGAGCCCGCCGTTCCGCTCGGCCGAGCACCGCGAGGCGCTGTGGCGCGGGCTGCAGGCAGGGCAGCTGCATACGACGGCAACCGACCACTGCGTGTTCTG

**Longest Forward ORF in Fasta File**

In [6]:
from Bio import SeqIO

def find_longest_forward_frame_orf_in_fasta(fasta_file):
    # Initialize variables to store the longest ORF information
    longest_orf_sequence = ""
    longest_orf_length = 0
    longest_orf_start_position = 0

    # Iterate through each sequence in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence = record.seq

        # Find ORFs in the forward reading frame for the current sequence
        forward_frame_orfs = find_forward_frame_orfs(sequence)

        # Check if any ORF in the current sequence is longer than the longest one
        for orf_sequence, orf_length, orf_start_position in forward_frame_orfs:
            if orf_length > longest_orf_length:
                longest_orf_sequence = orf_sequence
                longest_orf_length = orf_length
                longest_orf_start_position = orf_start_position

    # Print or return the identified longest ORF information
    print(f"Longest Forward Frame ORF: {longest_orf_sequence}")
    print(f"Longest ORF Length: {longest_orf_length}")
    print(f"Start Position: {longest_orf_start_position}")

# Function to find ORFs in the forward reading frame of a given DNA sequence
def find_forward_frame_orfs(sequence):
    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]

    orfs = []

    # Iterate through the sequence in codons
    for i in range(0, len(sequence) - 2, 3):
        codon = sequence[i:i + 3]

        # Check for start codon
        if codon == start_codon:
            orf_sequence = codon
            orf_length = 3  # Start codon length

            # Continue reading until a stop codon is encountered
            for j in range(i + 3, len(sequence) - 2, 3):
                codon = sequence[j:j + 3]
                orf_sequence += codon
                orf_length += 3

                if codon in stop_codons:
                    orfs.append((orf_sequence, orf_length, i + 1))  # Start position in 1-based indexing
                    break

    return orfs

# Example usage
fasta_file_path = "/content/dna2.fasta"
find_longest_forward_frame_orf_in_fasta(fasta_file_path)


Longest Forward Frame ORF: ATGGAGAAACAGTCTCGCGTTACGCGCGACGGTCGCGGGAGAGTTCTATGCGGTCATCGCTGCCGCGGTCGCGATTGGACTGGTCATGACGTTCGTTCATTTCGACCCGATTCGAGCGCTCTACTGGAGCGCCGTCATCAATGGGATCACGGCAGTGCCCATCATGGTGGTGATGATGCTGATGGCGCAGAGCCGGCGCGTGATGGGCGAGTTCGCAATCAGAGGACCGCTTGCGTGGGGAGGGTGGCTCGCGACGCTCGCCATGGCGCTCGCGGCGGCCGGAATGCTGCTGCCGGGATGAGCCGGCAATCCGGATGGAGAATGCGCATGCCCGCGACGCACCGGCGACGCCTCGCCGGACGGCGGGCGTCGCATTCGCCATTCGCCATTCGCCATTCGCCATTCGCCATTCGCCATTCGCCATTCGCCATTCGCCATTCGCCATTCGCCGAGCGCTCCATCGACGACGGTGGCGGCCACGCCCCGGAATTCGACATGCCTGCATCCTCCGATACGGCGAACCGGCGGGCGTCATCAATCGCGCGCATCCAGCGCGGGCTGAAGCGCGGGCTCGGCCGGCGCTGCCGGTTCATGGCCGCCGTGGCGCGCGGCGGTGGAATGGCCGGGCCGGATCCTGAACCAGATCGCATACATCGCGGGCAGGAACACGAGCGTGAGGACCGTCCCGGCGAACGTGCCGCCGATCAGCGTGTACGCGAGCGTGCCCCAGAACACCGAATGCGTGAGCGGAATGAACGCGAGCACGGCCGCCATCGCGGTAAGAATCACCGGGCGCGCCCGCTGCACGGTCGCTTCGACGACCGCGTGGAACGGATCGAGTCCCGCGTGTTCGTTCTGGTGGATCTGGCCGATCAGGATCAGCGTGTTGCGCATCAGGATCCCCGACAGCGCGATGAGGCCGACCAGCGCATTGATGCCGAACGGCTGCCCGAACAGGATCAGCGTCGGCACC

**Forward ORF's in Fasta File**

In [7]:
from Bio import SeqIO

def find_longest_forward_frame_orf_in_fasta(fasta_file):
    # Initialize variables to store the longest ORF information
    longest_orf_sequence = ""
    longest_orf_length = 0
    longest_orf_start_position = 0

    # Iterate through each sequence in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequence = record.seq

        # Find ORFs in the forward reading frame for the current sequence
        forward_frame_orfs = find_forward_frame_orfs(sequence)

        # Check if any ORF in the current sequence is longer than the longest one
        for orf_sequence, orf_length, orf_start_position in forward_frame_orfs:
            print(f" Forward Frame ORF: {orf_sequence}")
            print(f" ORF Length: {orf_length}")
            print(f"Start Position: {orf_start_position}")
            print(f"Sequence ID: {record.id}")


    # Print or return the identified longest ORF information
    # print(f"Longest Forward Frame ORF: {longest_orf_sequence}")
    # print(f"Longest ORF Length: {longest_orf_length}")
    # print(f"Start Position: {longest_orf_start_position}")

# Function to find ORFs in the forward reading frame of a given DNA sequence
def find_forward_frame_orfs(sequence):
    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]

    orfs = []

    # Iterate through the sequence in codons
    for i in range(0, len(sequence) - 2, 3):
        codon = sequence[i:i + 3]

        # Check for start codon
        if codon == start_codon:
            orf_sequence = codon
            orf_length = 3  # Start codon length

            # Continue reading until a stop codon is encountered
            for j in range(i + 3, len(sequence) - 2, 3):
                codon = sequence[j:j + 3]
                orf_sequence += codon
                orf_length += 3

                if codon in stop_codons:
                    orfs.append((orf_sequence, orf_length, i + 1))  # Start position in 1-based indexing
                    break

    return orfs

# Example usage
fasta_file_path = "/content/dna2.fasta"
find_longest_forward_frame_orf_in_fasta(fasta_file_path)


 Forward Frame ORF: ATGCCGGCTTTCGCGATCGGCGCGAACACGCCGGCCGGCCTGCTCGCGTGGGGCTTGCCGGCGAATGCGTCGGCGGGCGGTGCGCTCGACAACCGCGTGTGGGGCGTCCAGGTGAACAATGCGGTGAAGTACGTGAGCCCGACGTTCGGCGGATTGTCGTTCGGCGGCCTGTGGGGCTTCGGCAACGTGCCCGGCACGGTCGCGCGCAGCAGCGTGCAAAGCGCGATGCTGTCCTACACGCAAGGCGCGTTCAGCGCCGCGCTCGCTTATTTCGGCCAGCACGATGTAACTGCCGGTGGCAATCTGCGCAATTTCTCGGGCGGTGCAGGCTACAACGTCGGGCAGTTCCGCGTCTTCGGCATGGTGTCGGACGTGCGGATCAGCGCCGCCGCGCCGCTGCGGGCCACGACCTATGACGGCGGCTTGACCTATGCGGTCACGCCGGCGTTGCAGCTCGGCGGCGGCTTCCAGTACCAGCAGCGCGGCGGCGACATCGGCTCGGCCAACCAGGTCACGTTGAGCGCCGACTATTCGCTGTCGAAGCGTACCGGCCTTTACGTGGTATTCGCACGCGGGCACGACAGTGCGTATGGCGCGCAGGTCGAGGCGGCGCTCGGCGGGGCGGCGTCCGGCTCGACGCAGACCGCGGTCCGGCTCGGGCTGCGGCATCAGTTCTGA
 ORF Length: 678
Start Position: 229
Sequence ID: gi|142022655|gb|EQ086233.1|91
 Forward Frame ORF: ATGCTGTCCTACACGCAAGGCGCGTTCAGCGCCGCGCTCGCTTATTTCGGCCAGCACGATGTAACTGCCGGTGGCAATCTGCGCAATTTCTCGGGCGGTGCAGGCTACAACGTCGGGCAGTTCCGCGTCTTCGGCATGGTGTCGGACGTGCGGATCAGCGCCGCCGCGCCGCTGCGGGCCACGACCTATGACGGCGGCTTG

**Longest ORF in Fasta File specific toa. identifier**

In [8]:
from Bio import SeqIO

def find_longest_forward_orf(sequence):
    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]

    longest_orf_sequence = ""
    longest_orf_length = 0
    current_orf_sequence = ""
    current_orf_length = 0

    # Iterate through the sequence in codons
    for i in range(0, len(sequence) - 2, 3):
        codon = sequence[i:i + 3]

        # Check for start codon
        if codon == start_codon:
            current_orf_sequence = codon
            current_orf_length = 3  # Start codon length

            # Continue reading until a stop codon is encountered
            for j in range(i + 3, len(sequence) - 2, 3):
                codon = sequence[j:j + 3]
                current_orf_sequence += codon
                current_orf_length += 3

                if codon in stop_codons:
                    # Check if the current ORF is longer than the longest one
                    if current_orf_length > longest_orf_length:
                        longest_orf_sequence = current_orf_sequence
                        longest_orf_length = current_orf_length
                    break

    return longest_orf_sequence, longest_orf_length

# Example usage for the specific sequence ID
sequence_id = "gi|142022655|gb|EQ086233.1|16"
fasta_file_path = "/content/dna2.fasta"

# Load the sequences from the FASTA file
sequences = {record.id: record.seq for record in SeqIO.parse(fasta_file_path, "fasta")}

# Find the longest forward ORF for the given sequence ID
if sequence_id in sequences:
    sequence = sequences[sequence_id]
    longest_orf_sequence, longest_orf_length = find_longest_forward_orf(sequence)

    # Print or do something with the identified longest forward ORF
    print(f"Longest Forward ORF for {sequence_id}: {longest_orf_sequence}")
    print(f"Longest Forward ORF Length: {longest_orf_length}")
else:
    print(f"Sequence ID {sequence_id} not found in the FASTA file.")


Longest Forward ORF for gi|142022655|gb|EQ086233.1|16: ATGAATCACGCAGCGAATCCCGCCGATCCCGATCGCGCCGCGGCGCAGGGCGGCAGCCTGTACAACGACGATCTCGCGCCGACGACGCCGGCGCAGCGCACGTGGAAGTGGTATCACTTCGCGGCGCTGTGGGTCGGGATGGTGATGAACATCGCGTCGTACATGCTCGCGGCCGGGCTGATCCAGGAAGGCATGTCGCCGTGGCAGGCGGTGACGACGGTGCTGCTCGGCAACCTGATCGTGCTCGTGCCGATGCTGCTGATCGGCCATGCGGGCGCGAAGCACGGGATTCCGTACGCGGTGCTCGTGCGCGCGTCGTTCGGCACGCAGGGGGCGAAGCTGCCGGCGCTGCTGCGCGCGATCGTCGCGTGCGGCTGGTACGGGATCCAGACCTGGCTCGGCGGCAGCGCGATCTATACGCTGCTGAACATCCTGACCGGCAACGCGCTGCATGGCGCCGCGCTGCCGGTCATCGGCATCGGGTTCGGGCAGCTCGCATGCTTCCTCGTGTTCTGGGCGCTGCAGCTCTACTTCATCTGGCATGGCACCGATTCGATCCGCTGGCTCGAAAGCTGGTCGGCGCCGATCAAGGTCGTGATGTGCGTGGCGCTGGTGTGGTGGGCAACGTCGAAGGCGGGCGGCTTCGGCACGATGCTGTCGGCGCCGTCGCAGTTTGCCGCAGGCGGCAAGAAAGCCGGGCTGTTCTGGGCGACCTTCTGGCCGGGGCTGACCGCGATGGTCGGCTTCTGGGCGACGCTCGCGCTGAACATCCCCGACTTCACGCGCTTCGCGCATTCGCAGCGCGACCAGGTGATCGGCCAGTCGATCGGGCTGCCGTTGCCGATGGCGCTGCTGTCGGTGGTGTCGGTCGTCGTGACGTCGGCGACCGTCGTGATCTACGGCAACGCGATCTGGGATCCGATCGACCTGACGAGCCGGATGACG

**Maximum Occurance of Repeats in Fasta File**

In [9]:
from Bio import SeqIO
import re

def find_max_occurrences_in_fasta(fasta_file, repeat):
    max_occurrences = 0

    # Iterate through each sequence in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Get the DNA sequence
        dna_sequence = str(record.seq)

        # Count occurrences of the repeat in the sequence
        occurrences = len(re.findall(f'(?={repeat})', dna_sequence))

        # Update the maximum occurrences if needed
        max_occurrences = max(max_occurrences, occurrences)

    return max_occurrences

# Replace 'your_fasta_file.fasta' with the actual path to your FASTA file
fasta_file_path = '/content/dna2.fasta'

# List of repeats to check
repeats = ["CATCGCC", "GCGGCCG", "CGCGCCG", "TGCGCGC"]

# Find the repeat with the maximum number of occurrences across all sequences
max_repeat = max(repeats, key=lambda repeat: find_max_occurrences_in_fasta(fasta_file_path, repeat))

print(f"The repeat with the maximum number of occurrences across all sequences is: {max_repeat}")


The repeat with the maximum number of occurrences across all sequences is: CGCGCCG


**Maximum Occurance of 12-base sequences Repeats in Fasta File**

In [10]:
from Bio import SeqIO
from collections import Counter

def find_max_repeats_in_fasta(fasta_file, repeat_length):
    all_repeats = []

    # Iterate through each sequence in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Get the DNA sequence
        dna_sequence = str(record.seq)

        # Find all repeats of length 12 in the sequence
        repeats = [dna_sequence[i:i+repeat_length] for i in range(len(dna_sequence)-repeat_length+1)]
        all_repeats.extend(repeats)

    # Count occurrences of each repeat
    repeat_counts = Counter(all_repeats)

    # Find the maximum number of occurrences
    max_occurrences = max(repeat_counts.values(), default=0)

    # Count the number of different 12-base sequences that occur the maximum number of times
    max_occurrences_sequences = sum(1 for count in repeat_counts.values() if count == max_occurrences)

    return max_occurrences_sequences

# Replace 'your_fasta_file.fasta' with the actual path to your FASTA file
fasta_file_path = '/content/dna2.fasta'

# Specify the length of the repeat
repeat_length = 12

# Find the number of different 12-base sequences that occur the maximum number of times
max_occurrences_sequences = find_max_repeats_in_fasta(fasta_file_path, repeat_length)

print(f"The number of different {repeat_length}-base sequences that occur the maximum number of times is: {max_occurrences_sequences}")


The number of different 12-base sequences that occur the maximum number of times is: 4


**Most Frequent Occurance of Repeat in Fasta File**

In [11]:
from Bio import SeqIO
from collections import Counter

def find_most_frequent_repeat_in_fasta(fasta_file, repeat_length):
    all_repeats = []

    # Iterate through each sequence in the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Get the DNA sequence
        dna_sequence = str(record.seq)

        # Find all repeats of the specified length in the sequence
        repeats = [dna_sequence[i:i+repeat_length] for i in range(len(dna_sequence)-repeat_length+1)]
        all_repeats.extend(repeats)

    # Count occurrences of each repeat
    repeat_counts = Counter(all_repeats)

    # Find the most frequent repeat and its count
    most_frequent_repeat, most_frequent_count = repeat_counts.most_common(1)[0]

    return most_frequent_repeat, most_frequent_count

# Replace 'your_fasta_file.fasta' with the actual path to your FASTA file
fasta_file_path = '/content/dna2.fasta'

# Specify the length of the repeat
repeat_length = 6

# Find the most frequently occurring repeat and its count
most_frequent_repeat, most_frequent_count = find_most_frequent_repeat_in_fasta(fasta_file_path, repeat_length)

print(f"The most frequent repeat of length {repeat_length} is: {most_frequent_repeat}")
print(f"It occurs {most_frequent_count} times in all sequences.")


The most frequent repeat of length 6 is: GCGCGC
It occurs 153 times in all sequences.
