In [11]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Data import CodonTable

def _scan_strand_for_orfs(strand_seq, strand, total_length, record_id,
                          table_id, stop_codons, min_protein_length):
    """Collect ATG-to-stop ORFs from the three reading frames of one strand.

    Args:
        strand_seq: Sequence object of the strand to scan, read 5'->3'
            (the reverse complement for the minus strand).
        strand: "+" or "-"; stored in each result and used to map
            coordinates back onto the forward strand.
        total_length: Length of the full input sequence.
        record_id: Identifier copied into every result.
        table_id: Genetic code table passed to ``translate``.
        stop_codons: Collection of upper-case stop codons for ``table_id``.
        min_protein_length: Minimum translated length required to keep an
            ORF (the trailing "*" stop symbol is counted).

    Returns:
        List of ORF dicts in scan order (frame 1, 2, 3; left to right).
    """
    start_codon = "ATG"
    # Scan a plain upper-cased string: this avoids creating a Seq slice per
    # codon and lets soft-masked (lower-case) input match. The reported
    # nucleotide sequence is still sliced from the original object, so its
    # case is preserved.
    scan_text = str(strand_seq).upper()
    last_codon_start = len(scan_text) - 3
    if strand == "+":
        error_prefix = "Ошибка трансляции ORF"
    else:
        error_prefix = "Ошибка трансляции ORF на обратной цепи"

    orfs = []
    for frame in range(3):
        pos = frame
        while pos <= last_codon_start:
            if scan_text[pos:pos + 3] != start_codon:
                pos += 3
                continue

            # First in-frame stop codon downstream of this start codon.
            stop_at = next(
                (k for k in range(pos + 3, last_codon_start + 1, 3)
                 if scan_text[k:k + 3] in stop_codons),
                None,
            )
            if stop_at is None:
                # No in-frame stop remains, so no later ATG in this frame
                # can be closed either; open-ended ORFs are not reported.
                break

            orf_end = stop_at + 3  # exclusive end, stop codon included
            orf_nucleotides = strand_seq[pos:orf_end]
            try:
                polypeptide = orf_nucleotides.translate(table=table_id)
            except Exception as exc:
                # Best effort: report the failure and keep scanning.
                print(f"{error_prefix}: {exc}")
            else:
                if len(polypeptide) >= min_protein_length:
                    if strand == "+":
                        start_pos, end_pos = pos + 1, orf_end
                    else:
                        # Map reverse-complement offsets to 1-based
                        # forward-strand coordinates (start_pos < end_pos).
                        start_pos = total_length - orf_end + 1
                        end_pos = total_length - pos
                    orfs.append({
                        "record_id": record_id,
                        "strand": strand,
                        "frame": frame + 1,
                        "start_pos": start_pos,
                        "end_pos": end_pos,
                        "nucleotide_sequence": str(orf_nucleotides),
                        "polypeptide": str(polypeptide),
                    })
            # Resume after the stop codon: ATGs nested inside this ORF are
            # deliberately not reported as separate ORFs.
            pos = orf_end
    return orfs


def find_orfs_and_translate(seq_record, table_id=1, min_protein_length=1):
    """Find ATG-initiated ORFs on both strands of a record and translate them.

    Each of the six reading frames is scanned codon by codon. An ORF runs
    from an ``ATG`` to the first in-frame stop codon of the selected genetic
    code; ORFs without a stop codon are skipped, and scanning resumes after
    the stop codon, so nested start codons do not produce extra ORFs.

    Args:
        seq_record: Object exposing ``.seq`` (a Biopython sequence) and
            ``.id``.
        table_id: Numeric NCBI genetic code table id. It selects both the
            stop codons used for scanning and the translation table.
        min_protein_length: Minimum length of the translated product. The
            translation includes the terminal "*" stop symbol, which is
            counted towards this length.

    Returns:
        A list of dicts, plus-strand ORFs first, with the keys
        ``record_id``, ``strand`` ("+"/"-"), ``frame`` (1-3, relative to the
        scanned strand), ``start_pos`` and ``end_pos`` (1-based, inclusive,
        forward-strand coordinates), ``nucleotide_sequence`` (read 5'->3' on
        the ORF's own strand, stop codon included) and ``polypeptide``.

    Raises:
        KeyError: If ``table_id`` is not a known unambiguous DNA table id.
    """
    sequence = seq_record.seq
    record_id = seq_record.id

    # Take the stop codons from the requested table rather than hard-coding
    # the standard ones, so scanning and translation agree for any table_id.
    codon_table = CodonTable.unambiguous_dna_by_id[table_id]
    stop_codons = frozenset(codon_table.stop_codons)

    total_length = len(sequence)
    strands = (
        ("+", sequence),
        ("-", sequence.reverse_complement()),
    )

    orfs_found = []
    for strand, strand_seq in strands:
        orfs_found.extend(
            _scan_strand_for_orfs(
                strand_seq, strand, total_length, record_id,
                table_id, stop_codons, min_protein_length,
            )
        )
    return orfs_found

# Script entry point: find ORFs in every record of the input FASTA file,
# print a summary table and write the table plus per-ORF details to a file.
if __name__ == '__main__':
    input_fasta_file = r"D:\bioinformatics\level_1\Homo_sapiens_ENSP00000371040_3_sequence.fa"
    output_orf_file = r"D:\bioinformatics\level_1\founds_orfs"
    min_protein_length = 25

    all_orfs = []
    try:
        for record in SeqIO.parse(input_fasta_file, "fasta"):
            print(f"Обработка последовательности: {record.id}")
            all_orfs.extend(
                find_orfs_and_translate(
                    record, table_id=1, min_protein_length=min_protein_length
                )
            )
    except FileNotFoundError:
        print(f"Ошибка: Входной FASTA файл '{input_fasta_file}' не найден.")
        print("Пожалуйста, убедитесь, что файл существует и указан правильный путь.")
    else:
        # Build the table header once. Single quotes inside the replacement
        # fields keep these f-strings valid on Python versions before 3.12.
        summary_title = "\n--- Краткая информация по найденным ORF ---\n"
        header_line = (
            f"{'Label':<10} {'Strand':<7} {'Frame':<6} "
            f"{'Start':<8} {'Stop':<8} {'Length (nt | aa)':<20}"
        )
        rule_line = (
            f"{'-' * 10} {'-' * 7} {'-' * 6} "
            f"{'-' * 8} {'-' * 8} {'-' * 20}"
        )

        print(summary_title)
        print(header_line)
        print(rule_line)

        # The output file is opened (and truncated) only after the input was
        # read successfully, so a missing input no longer wipes old results.
        # UTF-8 is explicit because the report contains Cyrillic text.
        with open(output_orf_file, 'w', encoding='utf-8') as out_f:
            out_f.write(summary_title)
            out_f.write(header_line + "\n")
            out_f.write(rule_line + "\n")

            for orf_number, orf in enumerate(all_orfs, start=1):
                label = f"ORF{orf_number}"
                length_nt = len(orf["nucleotide_sequence"])
                length_aa = len(orf["polypeptide"])
                length_text = f"{length_nt} | {length_aa}"

                # Mark the stop position with ">" when the translation does
                # not end in a stop symbol.
                stop_pos_display = str(orf["end_pos"])
                if not orf["polypeptide"].endswith("*"):
                    stop_pos_display = ">" + stop_pos_display

                row_line = (
                    f"{label:<10} {orf['strand']:<7} {orf['frame']:<6} "
                    f"{orf['start_pos']:<8} {stop_pos_display:<8} {length_text:<20}"
                )
                print(row_line)
                out_f.write(row_line + "\n")

                out_f.write(f"\n--- Детальная информация для {label} (Последовательность: {orf['record_id']}) ---\n")
                out_f.write(f"  Рамка считывания: {orf['frame']} ({orf['strand']})\n")
                out_f.write(f"  Позиция (1-индекс): {orf['start_pos']} - {orf['end_pos']}\n")
                out_f.write(f"  Нуклеотидная последовательность ORF: {orf['nucleotide_sequence']}\n")
                out_f.write(f"  Полипептид: {orf['polypeptide']}\n")
                out_f.write("\n")

        print(f"\nПоиск ORF завершен. Результаты сохранены в файл: {output_orf_file}") 

Обработка последовательности: 9

--- Краткая информация по найденным ORF ---

Label      Strand  Frame  Start    Stop     Length (nt | aa)    
---------- ------- ------ -------- -------- --------------------
ORF1       +       1      238      336      99 | 33             
ORF2       +       1      1168     1281     114 | 38            
ORF3       +       1      1294     1371     78 | 26             
ORF4       +       1      3340     3471     132 | 44            
ORF5       +       1      4579     4668     90 | 30             
ORF6       +       2      137      577      441 | 147           
ORF7       +       2      2831     2926     96 | 32             
ORF8       +       2      3374     3490     117 | 39            
ORF9       +       3      318      497      180 | 60            
ORF10      +       3      1089     1208     120 | 40            
ORF11      +       3      2451     2561     111 | 37            
ORF12      +       3      3396     3512     117 | 39            
ORF13      +