# In [1]:
import os
import random
from tqdm import tqdm

def process_and_sample_sequences(input_file, output_file, sample_rate=0.05):
    """
    Extract the central 147 bp of every 203 bp sequence and save a random sample.

    Each non-empty line of ``input_file`` is treated as one sequence (surrounding
    whitespace is stripped). Lines whose length is not exactly 203 are reported
    with a printed warning and skipped. From every remaining sequence the
    147-character window centered on the middle character is extracted, a
    random subset of those windows is drawn without replacement, and the subset
    is written to ``output_file``, one segment per line.

    Parameters:
    input_file (str): Path to the input file containing one sequence per line.
    output_file (str): Path to the output file where the sampled segments will
        be saved. The file is overwritten if it already exists.
    sample_rate (float): Fraction of the extracted segments to keep
        (default is 0.05, i.e. 5%). At least one segment is kept whenever any
        segment was extracted, and never more than were extracted.

    Returns:
    None. If no valid sequence is found, a warning is printed and an empty
    output file is written.
    """
    expected_length = 203
    segment_length = 147
    # Window centered on the middle character: 203 // 2 - 147 // 2 = 101 - 73 = 28.
    start = expected_length // 2 - segment_length // 2
    end = start + segment_length

    extracted_sequences = []

    # Step 1: Extract the central segment of every well-formed line.
    with open(input_file, 'r') as infile:
        for line_number, line in enumerate(infile, 1):
            sequence = line.strip()
            if not sequence:
                continue
            if len(sequence) != expected_length:
                print(f"Warning: Line {line_number} sequence length is not 203bp, skipping this sequence: {sequence}")
                continue
            # The slice is always exactly `segment_length` long here because the
            # input length was just validated, so no further length check is needed.
            extracted_sequences.append(sequence[start:end])

    # Step 2: Randomly sample `sample_rate` of the extracted segments.
    # Keep at least one when possible, but never ask random.sample for more
    # items than exist (it raises ValueError), which also covers the case of
    # an input file with no valid sequences.
    total = len(extracted_sequences)
    num_samples = min(total, max(1, int(total * sample_rate)))
    if total == 0:
        print(f"Warning: no valid sequences found in {input_file}; writing empty output file.")
    sampled_sequences = random.sample(extracted_sequences, num_samples)

    # Step 3: Save the sampled segments to the output file, one per line.
    with open(output_file, 'w') as outfile:
        outfile.writelines(segment + '\n' for segment in sampled_sequences)

# Input/output file pairs, generated from the regular naming scheme
# "<sign><NNFR|SNFR><sign><1-3>N-ALL.txt". Every file lives in the same
# directory, and each output name is the input name prefixed with "processed_".
# The loop nesting reproduces the listing order: NNFR before SNFR, leading
# '-' before '+', then inner '-' before '+', then 1N, 2N, 3N.
_sequence_dir = 'TSS-TTS Sequences'
_dataset_names = [
    f'{lead_sign}{region}{inner_sign}{index}N-ALL.txt'
    for region in ('NNFR', 'SNFR')
    for lead_sign in ('-', '+')
    for inner_sign in ('-', '+')
    for index in (1, 2, 3)
]
file_pairs = [
    (f'{_sequence_dir}/{name}', f'{_sequence_dir}/processed_{name}')
    for name in _dataset_names
]

# Run the extraction/sampling step on every pair, tracked by one progress bar.
_progress = tqdm(file_pairs, desc="Processing and sampling files")
for input_file, output_file in _progress:
    process_and_sample_sequences(input_file, output_file)

# Output:
# Processing and sampling files: 100%|██████████████████████████████████████████████████| 24/24 [00:00<00:00, 428.75it/s]
