In [7]:
## program/script to count the numbers of sequences in the given file with multiple fasta sequences

def count_fasta_records(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
     
    # Count the number of lines that start with '>', indicating a header line
    record_count = sum(1 for line in lines if line.startswith('>'))
    return record_count

# usage
fasta_file_path = 'dna.example.fasta' # replace with the actual file path
record_count = count_fasta_records(fasta_file_path)
print(f"The number of records in the FASTA file is: {record_count}")

The number of records in the FASTA file is: 25


In [21]:
##
import csv

def parse_fasta(file_path):
    sequences = {}  # Initialize the dictionary to store sequences
    identifier = None

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                identifier = line[1:].split()[0]  # Get the identifier
                sequences[identifier] = ''  # Initialize the sequence string for this identifier
            else:
                if identifier:  # Ensure identifier is not None before adding sequence data
                    sequences[identifier] += line  # Append sequence data to the current identifier's sequence

    return sequences

def analyze_sequences(sequences):
    lengths = {identifier: len(seq) for identifier, seq in sequences.items()}
    max_length = max(lengths.values())
    min_length = min(lengths.values())

    longest_sequences = [identifier for identifier, length in lengths.items() if length == max_length]
    shortest_sequences = [identifier for identifier, length in lengths.items() if length == min_length]

    return lengths, longest_sequences, shortest_sequences, max_length, min_length

def save_to_csv(lengths, longest_sequences, shortest_sequences, max_length, min_length, output_file):
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        
        # Writing headers
        writer.writerow(['Identifier', 'Sequence Length'])
        
        # Writing all sequence lengths
        for identifier, length in lengths.items():
            writer.writerow([identifier, length])
        
        # Writing details about the longest and shortest sequences
        writer.writerow([])
        writer.writerow(['Longest Sequences', f'Length: {max_length}'])
        for identifier in longest_sequences:
            writer.writerow([identifier, max_length])
        
        writer.writerow([])
        writer.writerow(['Shortest Sequences', f'Length: {min_length}'])
        for identifier in shortest_sequences:
            writer.writerow([identifier, min_length])

# Example usage
fasta_file_path = 'dna.example.fasta'  # Replace with the path to your FASTA file
output_csv_path = 'sequence_analysis_output.csv'  # Replace with the desired output CSV file path

sequences = parse_fasta(fasta_file_path)

if sequences is None:
    print("Error: The sequences dictionary is None. Please check the input file.")
else:
    lengths, longest_sequences, shortest_sequences, max_length, min_length = analyze_sequences(sequences)
    
    # Save the results to a CSV file
    save_to_csv(lengths, longest_sequences, shortest_sequences, max_length, min_length, output_csv_path)
    print(f"Results saved to {output_csv_path}")


Results saved to sequence_analysis_output.csv


In [22]:
from Bio import SeqIO
import csv

def analyze_fasta(fasta_file_path, output_csv_path):
    lengths = {}
    max_length = 0
    min_length = float('inf')
    longest_sequences = []
    shortest_sequences = []

    # Parse the FASTA file
    for record in SeqIO.parse(fasta_file_path, "fasta"):
        seq_id = record.id
        seq_len = len(record.seq)
        lengths[seq_id] = seq_len

        if seq_len > max_length:
            max_length = seq_len
            longest_sequences = [seq_id]
        elif seq_len == max_length:
            longest_sequences.append(seq_id)

        if seq_len < min_length:
            min_length = seq_len
            shortest_sequences = [seq_id]
        elif seq_len == min_length:
            shortest_sequences.append(seq_id)

    # Save the results to a CSV file
    with open(output_csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Identifier', 'Sequence Length'])

        for seq_id, seq_len in lengths.items():
            writer.writerow([seq_id, seq_len])

        writer.writerow([])
        writer.writerow(['Longest Sequences', f'Length: {max_length}'])
        for seq_id in longest_sequences:
            writer.writerow([seq_id, max_length])

        writer.writerow([])
        writer.writerow(['Shortest Sequences', f'Length: {min_length}'])
        for seq_id in shortest_sequences:
            writer.writerow([seq_id, min_length])

# Example usage
fasta_file_path = 'dna.example.fasta'  # Replace with the path to your FASTA file
output_csv_path = 'sequence_analysis_output_biopython.csv'  # Replace with the desired output CSV file path

analyze_fasta(fasta_file_path, output_csv_path)
print(f"Results saved to {output_csv_path}")


Results saved to sequence_analysis_output_biopython.csv


In [23]:
from Bio import SeqIO

# Define the start and stop codons
START_CODON = "ATG"
STOP_CODONS = {"TAA", "TAG", "TGA"}

def find_orfs(sequence, reading_frame):
    orfs = []
    seq_len = len(sequence)
    start = None
    
    for i in range(reading_frame - 1, seq_len, 3):
        codon = sequence[i:i + 3]
        if len(codon) < 3:
            continue

        if codon == START_CODON and start is None:
            start = i + 1  # Adjust to 1-based index
        elif codon in STOP_CODONS and start is not None:
            orfs.append((start, i + 3, i + 3 - start))  # (start, end, length)
            start = None
    
    return orfs

def find_longest_orf_in_sequence(sequence, reading_frame):
    orfs = find_orfs(sequence, reading_frame)
    if not orfs:
        return None, 0
    
    longest_orf = max(orfs, key=lambda x: x[2])
    return longest_orf, longest_orf[2]

def find_longest_orf_in_fasta(fasta_file_path, reading_frame):
    longest_orf = None
    longest_orf_length = 0
    longest_orf_identifier = None

    for record in SeqIO.parse(fasta_file_path, "fasta"):
        sequence = str(record.seq)
        orf, length = find_longest_orf_in_sequence(sequence, reading_frame)
        if length > longest_orf_length:
            longest_orf = orf
            longest_orf_length = length
            longest_orf_identifier = record.id
    
    return longest_orf_identifier, longest_orf, longest_orf_length

# Example usage
fasta_file_path = 'dna.example.fasta'  # Replace with the path to your FASTA file
reading_frame = 1  # Choose the reading frame (1, 2, or 3)

# Find the longest ORF in the file
longest_orf_identifier, longest_orf, longest_orf_length = find_longest_orf_in_fasta(fasta_file_path, reading_frame)

if longest_orf:
    print(f"The longest ORF is in sequence: {longest_orf_identifier}")
    print(f"Starts at position: {longest_orf[0]} and ends at position: {longest_orf[1]}")
    print(f"Length of the longest ORF: {longest_orf_length}")
else:
    print("No ORFs found in the specified reading frame.")


The longest ORF is in sequence: gi|142022655|gb|EQ086233.1|323
Starts at position: 2824 and ends at position: 4509
Length of the longest ORF: 1685


In [24]:
from Bio import SeqIO
from collections import defaultdict

def find_repeats(sequence, n):
    repeat_counts = defaultdict(int)
    sequence_length = len(sequence)

    # Find all repeats of length n
    for i in range(sequence_length - n + 1):
        repeat = sequence[i:i + n]
        repeat_counts[repeat] += 1

    return repeat_counts

def analyze_repeats_in_fasta(fasta_file_path, n):
    total_repeat_counts = defaultdict(int)

    for record in SeqIO.parse(fasta_file_path, "fasta"):
        sequence = str(record.seq)
        repeat_counts = find_repeats(sequence, n)

        for repeat, count in repeat_counts.items():
            total_repeat_counts[repeat] += count

    return total_repeat_counts

def find_most_frequent_repeat(total_repeat_counts):
    if not total_repeat_counts:
        return None, 0

    most_frequent_repeat = max(total_repeat_counts.items(), key=lambda x: x[1])
    return most_frequent_repeat

# Example usage
fasta_file_path = 'dna.example.fasta'  # Replace with the path to your FASTA file
repeat_length = 3  # Set the length of the repeat to search for

# Analyze repeats in the FASTA file
total_repeat_counts = analyze_repeats_in_fasta(fasta_file_path, repeat_length)

# Find the most frequent repeat
most_frequent_repeat, count = find_most_frequent_repeat(total_repeat_counts)

# Output the results
print(f"Repeats of length {repeat_length} in the FASTA file:")
for repeat, count in total_repeat_counts.items():
    print(f"{repeat}: {count} times")

if most_frequent_repeat:
    print(f"\nThe most frequent repeat of length {repeat_length} is '{most_frequent_repeat}' occurring {count} times.")
else:
    print(f"\nNo repeats of length {repeat_length} found.")


Repeats of length 3 in the FASTA file:
TCG: 1987 times
CGG: 2111 times
GGG: 748 times
GGC: 1893 times
GCG: 2920 times
CGA: 2054 times
GAA: 873 times
AAG: 546 times
AGG: 565 times
GCA: 1393 times
CAG: 993 times
AGC: 1194 times
CAA: 546 times
AGT: 334 times
GTC: 1059 times
CGT: 1369 times
TCC: 686 times
CCA: 746 times
CAC: 915 times
ACG: 1421 times
CGC: 2810 times
ACC: 725 times
CCG: 2019 times
GCC: 1894 times
CCT: 540 times
CTC: 707 times
TCT: 397 times
CTG: 903 times
TGC: 1343 times
GTG: 891 times
GCT: 1035 times
CTT: 514 times
TTG: 567 times
TGG: 704 times
CAT: 864 times
ATG: 778 times
GAT: 1056 times
ATC: 1084 times
GGA: 706 times
TCA: 670 times
AAA: 393 times
TGA: 637 times
TTC: 894 times
ATA: 263 times
TAG: 191 times
GTT: 599 times
GAC: 1106 times
GTA: 389 times
TAC: 324 times
AAC: 593 times
GGT: 782 times
ATT: 442 times
GAG: 765 times
AAT: 419 times
CCC: 722 times
ACA: 506 times
TTA: 94 times
AGA: 401 times
TGT: 454 times
TAA: 139 times
TTT: 408 times
TAT: 228 times
ACT: 287 times