In [3]:
def process_fasta(input_file, output_file):
    """Rewrite a FASTA file with one upper-cased sequence line per record.

    Header lines (those starting with '>') are copied through unchanged. The
    sequence lines that follow a header are stripped of surrounding
    whitespace, concatenated, upper-cased and written as a single line.
    Sequence text that appears before the first header is written the same
    way. Blank lines are ignored.

    ``input_file`` and ``output_file`` may name the same file. In that case
    the result is built in a temporary file in the same directory and moved
    into place only after the whole input has been read.

    Args:
        input_file: Path of the FASTA file to read.
        output_file: Path of the FASTA file to write (replaced if it exists).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import os
    import shutil
    import tempfile

    # open(..., 'w') truncates immediately, so writing straight onto the
    # input would empty it before a single line had been read.
    in_place = os.path.exists(output_file) and os.path.samefile(input_file, output_file)
    if in_place:
        final_path = os.path.realpath(output_file)
        fd, write_path = tempfile.mkstemp(dir=os.path.dirname(final_path), suffix='.tmp')
        os.close(fd)
    else:
        final_path = write_path = output_file

    try:
        with open(input_file, 'r') as f_in, open(write_path, 'w') as f_out:
            sequence_lines = []  # Pieces of the sequence currently being read
            for line in f_in:
                if line.startswith('>'):
                    # A new header closes the previous record: flush it first.
                    if sequence_lines:
                        f_out.write(''.join(sequence_lines).upper() + '\n')
                        sequence_lines.clear()
                    f_out.write(line)
                else:
                    residues = line.strip()
                    # Skip blank lines so they cannot produce an empty
                    # "sequence" line in the output.
                    if residues:
                        sequence_lines.append(residues)
            # The last record has no following header to trigger the flush.
            if sequence_lines:
                f_out.write(''.join(sequence_lines).upper() + '\n')
        if in_place:
            # mkstemp creates the file with restrictive permissions; keep the
            # original file's mode on the replacement.
            shutil.copymode(final_path, write_path)
            os.replace(write_path, final_path)
    except BaseException:
        # Never leave a half-written scratch file behind.
        if in_place and os.path.exists(write_path):
            os.remove(write_path)
        raise

In [6]:
# Normalise All_VF.fasta "in place" without ever opening it for reading and
# writing at the same time: opening a file with 'w' truncates it, which would
# wipe the input before it is read. Write to a scratch file, then swap it in.
import os

process_fasta('All_VF.fasta', 'All_VF.fasta.tmp')
os.replace('All_VF.fasta.tmp', 'All_VF.fasta')

In [7]:
def count_sequences_in_fasta(input_file):
    """Return the number of records in a FASTA file.

    A record is counted for every line of ``input_file`` that begins with
    '>', i.e. for every header line.
    """
    with open(input_file, 'r') as fasta:
        return sum(1 for row in fasta if row.startswith('>'))

# Example usage
# num_sequences = count_sequences_in_fasta('path_to_your_input_file.fasta')
# print('Number of sequences:', num_sequences)

# This cell only defines the function. To try it here, uncomment the two "Example usage" lines above and supply the path to your FASTA file.

In [14]:
# Count the header lines (one per record) in 'BVBRC(PATRIC_U).fasta' and
# report the total.
num_sequences = count_sequences_in_fasta('BVBRC(PATRIC_U).fasta')
print('Number of sequences:', num_sequences)

Number of sequences: 5259


In [12]:
def rename_sequences_in_fasta(input_file, output_file):
    """Renumber FASTA headers by their prefix and flatten each sequence.

    Headers are matched by prefix and replaced with '>{name}{n}', where n
    counts the headers seen so far for that prefix, starting at 1:

        '>fig' -> '>PATRIC{n}'
        '>VF'  -> '>VFDB{n}'
        '>gi'  -> '>Victors{n}'

    The rest of a renamed header is discarded. Headers with any other prefix
    are copied unchanged. The sequence lines of each record are stripped,
    concatenated, upper-cased and written as a single line. Blank lines are
    ignored.

    ``input_file`` and ``output_file`` may name the same file. In that case
    the result is built in a temporary file in the same directory and moved
    into place only after the whole input has been read.

    Args:
        input_file: Path of the FASTA file to read.
        output_file: Path of the FASTA file to write (replaced if it exists).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    import os
    import shutil
    import tempfile

    counter_dict = {'>fig': {'name': 'PATRIC', 'counter': 0},
                    '>VF': {'name': 'VFDB', 'counter': 0},
                    '>gi': {'name': 'Victors', 'counter': 0}}

    # open(..., 'w') truncates immediately, so writing straight onto the
    # input would empty it before a single line had been read.
    in_place = os.path.exists(output_file) and os.path.samefile(input_file, output_file)
    if in_place:
        final_path = os.path.realpath(output_file)
        fd, write_path = tempfile.mkstemp(dir=os.path.dirname(final_path), suffix='.tmp')
        os.close(fd)
    else:
        final_path = write_path = output_file

    try:
        with open(input_file, 'r') as f_in, open(write_path, 'w') as f_out:
            sequence_lines = []  # Pieces of the sequence currently being read
            for line in f_in:
                if line.startswith('>'):
                    # A new header closes the previous record: flush it first.
                    if sequence_lines:
                        f_out.write(''.join(sequence_lines).upper() + '\n')
                        sequence_lines.clear()

                    # Replace the header with '<name><running number>' for the
                    # first prefix that matches; unmatched headers pass through.
                    for prefix, info in counter_dict.items():
                        if line.startswith(prefix):
                            info['counter'] += 1
                            line = '>{}{}\n'.format(info['name'], info['counter'])
                            break

                    f_out.write(line)
                else:
                    residues = line.strip()
                    # Skip blank lines so they cannot produce an empty
                    # "sequence" line in the output.
                    if residues:
                        sequence_lines.append(residues)
            # The last record has no following header to trigger the flush.
            if sequence_lines:
                f_out.write(''.join(sequence_lines).upper() + '\n')
        if in_place:
            # mkstemp creates the file with restrictive permissions; keep the
            # original file's mode on the replacement.
            shutil.copymode(final_path, write_path)
            os.replace(write_path, final_path)
    except BaseException:
        # Never leave a half-written scratch file behind.
        if in_place and os.path.exists(write_path):
            os.remove(write_path)
        raise

# Example usage
# rename_sequences_in_fasta('input.fasta', 'output.fasta')

# This cell only defines the function. To try it here, uncomment the "Example usage" line above and supply the correct input and output paths.


In [13]:
# Read 'All_VF.fasta', renumber its headers by prefix and write the result to
# the separate file 'All_VF(Re).fasta' (the input file is left as it is).
rename_sequences_in_fasta('All_VF.fasta', 'All_VF(Re).fasta')

In [None]:
import random
from Bio import SeqIO

def parse_clusters(cluster_file):
    """Read a cluster listing and return the member lines of every cluster.

    The file is split on the marker '>Cluster '. For each resulting block the
    first line (the remainder of the '>Cluster ...' header) is dropped and
    every remaining non-blank line is kept verbatim as one member.
    NOTE(review): this looks like the CD-HIT ``.clstr`` layout — confirm
    against the tool that produced the file.

    Args:
        cluster_file: Path of the cluster file to read.

    Returns:
        A list with one entry per cluster, each a list of member lines
        (without the trailing newline), in file order.
    """
    with open(cluster_file, 'r') as file:
        blocks = file.read().split('>Cluster ')[1:]

    clusters = []
    for block in blocks:
        # Filter blank lines instead of slicing off the last element: the
        # slice silently dropped the final member when the file did not end
        # with a newline.
        members = [line for line in block.split('\n')[1:] if line.strip()]
        clusters.append(members)
    return clusters

def _cluster_member_id(member_line):
    """Return the sequence id embedded in one cluster member line.

    The id is the text between the first '>' and the last '...' that follows
    it (e.g. '0\t100aa, >DEG1001... *' gives 'DEG1001'). Lines without that
    shape fall back to slicing the second whitespace-separated token.
    """
    start = member_line.find('>')
    end = member_line.rfind('...')
    if start != -1 and end > start:
        return member_line[start + 1:end]
    return member_line.split()[1][1:-3]


def classify_clusters(clusters, seed=None):
    """Pick one sequence id per cluster and sort it into DEG or VF.

    A member counts as DEG when its sequence id starts with 'DEG'; every
    other member counts as VF.

    * If a cluster contains at least one VF member, one of them is chosen at
      random and its id is added to the VF set.
    * Otherwise (DEG-only cluster) the id of the member whose line ends with
      '*' is added to the DEG set; if no line is marked, the first member is
      used instead.
    * Empty clusters are skipped.

    Args:
        clusters: Iterable of clusters, each a list of member lines such as
            those returned by ``parse_clusters``.
        seed: Optional seed for a private random generator, making the VF
            choice reproducible. With the default ``None`` the module-level
            ``random`` state is used.

    Returns:
        A ``(degs, vfs)`` tuple of sets of sequence ids.
    """
    chooser = random if seed is None else random.Random(seed)
    degs = set()
    vfs = set()
    for cluster in clusters:
        if not cluster:
            continue  # Nothing to pick from; avoids an IndexError below.

        # Decide DEG vs. VF on the extracted id: the raw line begins with the
        # member index, so testing the line itself never matches 'DEG'.
        vf_in_cluster = [seq for seq in cluster
                         if not _cluster_member_id(seq).startswith('DEG')]
        if vf_in_cluster:
            random_vf = chooser.choice(vf_in_cluster)
            vfs.add(_cluster_member_id(random_vf))
        else:
            marked = [seq for seq in cluster if seq.rstrip().endswith('*')]
            ref_seq = marked[0] if marked else cluster[0]
            degs.add(_cluster_member_id(ref_seq))
    return degs, vfs

def write_selected_sequences_to_fasta(output_file, seq_dict, id_set):
    """Write the records of ``seq_dict`` whose key is in ``id_set`` as FASTA.

    Records are passed to ``SeqIO.write`` in the iteration order of
    ``seq_dict``; keys that are not in ``id_set`` are skipped.
    """
    chosen = []
    for seq_id, record in seq_dict.items():
        if seq_id in id_set:
            chosen.append(record)
    SeqIO.write(chosen, output_file, 'fasta')

# Parse clusters and classify them.
# NOTE(review): the cluster-file path is an empty string, so the open() call
# inside parse_clusters will fail; fill in the real path before running.
clusters = parse_clusters('')
degs, vfs = classify_clusters(clusters)

# Create sequence dictionaries from the fasta files.
# NOTE(review): create_dict_from_fasta is not defined in the cells shown here;
# confirm it is defined or imported elsewhere in the notebook. It presumably
# returns a {sequence id: record} mapping, since the results are filtered by
# id below — verify.
vf_dict = create_dict_from_fasta('All_VF.fasta')
deg_dict = create_dict_from_fasta('DEG10.fasta')

# Write selected sequences to output files: one FASTA for the chosen DEG ids,
# one for the chosen VF ids.
write_selected_sequences_to_fasta('DEG_output.fasta', deg_dict, degs)
write_selected_sequences_to_fasta('VF_output.fasta', vf_dict, vfs)