In [4]:
# Import necessary libraries
import re

In [5]:
# Step 1: Load the Stockholm file
stk_file_path = '/content/result.stk'
with open(stk_file_path, 'r') as file:
    stockholm_data = file.readlines()

# Step 2: Extract all IDs from the file
# An alignment row is "<seqname><whitespace><aligned sequence>", so the ID is
# the first run of non-whitespace characters on the line. Matching \S+ (instead
# of a fixed character class) keeps names containing e.g. "_" or "|" intact
# rather than silently truncating them at the first unexpected character.
id_pattern = re.compile(r'^\S+')
ids = set()  # Using a set to store unique IDs

for line in stockholm_data:
    # Skip markup/header lines ("#...") and the "//" alignment terminator
    # before matching; blank or indented lines simply do not match.
    if line.startswith(("#", "//")):
        continue
    match = id_pattern.match(line)
    if match:
        ids.add(match.group())

# Convert the set to a sorted list for consistent processing
ids = sorted(ids)

# Step 3: Filter sequences for each ID and write to .fasta files
def to_fasta_file(id, sequences, output_folder):
    """Write the alignment rows of one sequence ID to its own FASTA file.

    Args:
        id: Sequence identifier. Written verbatim as the FASTA header; any
            "/" is replaced by "_" when building the file name.
        sequences: Iterable of alignment rows, each shaped like
            "<id><whitespace><sequence chunk>". Each chunk is written on its
            own line exactly as found (gap characters are kept).
        output_folder: Directory the file is written into; it must already
            exist.

    Returns:
        The path of the FASTA file that was written.
    """
    fasta_filename = f"{output_folder}/{id.replace('/', '_')}.fasta"
    with open(fasta_filename, 'w') as fasta_file:
        fasta_file.write(f">{id}\n")
        for seq in sequences:
            # Split on any whitespace run so tab-separated rows work too.
            fields = seq.split(None, 1)
            if len(fields) < 2:
                # Row carries only an ID and no sequence text: nothing to write.
                continue
            fasta_file.write(fields[1].strip() + "\n")
    return fasta_filename

# Create an output folder
import os

output_folder = "/content/fasta_files"
os.makedirs(output_folder, exist_ok=True)

# Group alignment rows by their exact ID (the first whitespace-delimited token)
# in a single pass over the file. A plain prefix test (startswith) would also
# pull in rows of any other ID that merely begins with the same characters.
wanted_ids = set(ids)
rows_by_id = {}
for line in stockholm_data:
    fields = line.split(None, 1)
    if fields and fields[0] in wanted_ids:
        rows_by_id.setdefault(fields[0], []).append(line.strip())

# Filter sequences and write to FASTA files
created_files = []
for seq_id in ids:  # named seq_id so the builtin id() is not shadowed
    id_sequences = rows_by_id.get(seq_id, [])
    if id_sequences:  # If sequences for the ID exist
        fasta_file = to_fasta_file(seq_id, id_sequences, output_folder)
        created_files.append(fasta_file)

# Step 4: Output the list of created FASTA files
print(f"FASTA files created for IDs: {', '.join(ids)}")
print(f"Files are saved in {output_folder}")


FASTA files created for IDs: AAKI02000041.1/24441-24302, AAKJ02000036.1/5658-5797, AAOE01000024.1/17701-17833, AAUR01000042.1/26458-26319, AAUU01000009.1/30696-30557, AAWE01000035.1/31834-31695, AAWF01000006.1/90475-90336, BX950851.1/3157560-3157412, CP000026.1/2029594-2029450, CP000247.1/832790-832934, CP000510.1/2652936-2652780, CP000627.1/583753-583892, CP000679.1/1094763-1094897, CP000698.1/4162311-4162191
Files are saved in /content/fasta_files


In [6]:
# Spot-check: display the alignment rows whose line starts with the ID
# CP000627.1/583753-583892 (an ID may appear on more than one line)
!grep "^CP000627.1/583753-583892" /content/result.stk

# Same spot-check for the ID CP000247.1/832790-832934
!grep "^CP000247.1/832790-832934" /content/result.stk

CP000627.1/583753-583892   GC--UUGGCCUUAACUCCGAGCUUACCGC-GCUAAGUUUAAACCU-------UUAAAUAUGCGUUGUAAGCCA-GUGACCG----------CUUGUCAC-AAGGGCAG-AA---UUGGAA
CP000627.1/583753-583892   AUGAUUUUGCCUCCCGUAUUUGGAAAGGUGUU-CUGUGGCGCAACAA
CP000247.1/832790-832934   UUAACCACUAAACACUCU-------------AGCCUCUGCACCUGGGUCA--ACUGAUACGGUGCUUUGGCC--GUGACAAUGCUCGUAAAGAUUGCCACCAGGGCGAAGGAA----GAA
CP000247.1/832790-832934   AUGACUUCGCCUCCCGUAUCUGGAAAGGUGUACAUGGCUU-CACAAC


In [10]:
# Sanity check: count the lines of result.stk that do NOT start with any of the
# extracted IDs (grep -v inverts the match, wc -l counts what is left).
# NOTE: the ID patterns below are hard-coded, not generated from `ids`; they
# must be updated by hand if the input file changes.
!grep -v -e "^CP000698.1/4162311-4162191" \
        -e "^CP000679.1/1094763-1094897" \
        -e "^BX950851.1/3157560-3157412" \
        -e "^CP000627.1/583753-583892" \
        -e "^AAKJ02000036.1/5658-5797" \
        -e "^AAKI02000041.1/24441-24302" \
        -e "^AAUR01000042.1/26458-26319" \
        -e "^AAWE01000035.1/31834-31695" \
        -e "^AAWF01000006.1/90475-90336" \
        -e "^AAUU01000009.1/30696-30557" \
        -e "^AAOE01000024.1/17701-17833" \
        -e "^CP000510.1/2652936-2652780" \
        -e "^CP000026.1/2029594-2029450" \
        -e "^CP000247.1/832790-832934" /content/result.stk | wc -l

6


In [11]:
# Concatenate every per-ID FASTA file in /content/fasta_files (all files
# matching *.fasta, in shell glob order) into a single combined file
!cat /content/fasta_files/*.fasta > /content/combined_sequences.fasta