In [1]:
import glob
import numpy as np
import os
import time
from collections import Counter
# Function to read and process only sequences of length 6001 from a FASTA-like file
def read_fasta_sequences_6001(filename, sequence_length=6001):
    """Read a FASTA-like file and keep only records of one exact length.

    Lines starting with '>' are treated as record headers. Every other line
    is stripped of surrounding whitespace and appended to the record that is
    currently being assembled.

    Args:
        filename: Path of the text file to read.
        sequence_length: Exact length a record must have to be kept.
            Defaults to 6001.

    Returns:
        A list of the matching sequence strings, in file order.
    """
    sequences = []
    chunks = []  # line fragments of the record currently being assembled

    def _finish_record():
        # Join once per record instead of growing a string line by line.
        sequence = "".join(chunks)
        chunks.clear()
        if sequence and len(sequence) == sequence_length:
            sequences.append(sequence)

    with open(filename, 'r') as file:
        for line in file:
            if line.startswith('>'):
                # A header closes the previous record (if any).
                _finish_record()
            else:
                chunks.append(line.strip())
    # The last record has no trailing header to close it.
    _finish_record()
    return sequences
# Function to calculate dinucleotide perplexity and GC content in a single pass
def calculate_perplexity_and_gc(seq, window_size=10):
    """Compute sliding-window dinucleotide perplexity and GC percentage.

    For every window of ``window_size`` consecutive characters, the
    ``window_size - 1`` overlapping dinucleotides inside it are counted, the
    Shannon entropy (base 2) of their frequency distribution is computed,
    and the perplexity is reported as ``2 ** entropy``. The GC percentage is
    the share of 'G' and 'C' characters in the same window.

    Args:
        seq: Nucleotide sequence as a string. Letter case is ignored.
        window_size: Number of characters per window. Must be at least 1.

    Returns:
        A tuple ``(perplexities, gc_percentages)`` of 1-D NumPy arrays with
        one entry per window (``len(seq) - window_size + 1`` entries). Both
        arrays are empty when the sequence is shorter than the window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    # Normalise case so lowercase bases are counted as G/C and so that
    # "ac" and "AC" are treated as the same dinucleotide.
    seq = seq.upper()
    seq_len = len(seq)
    if seq_len < window_size:
        return np.array([]), np.array([])

    num_windows = seq_len - window_size + 1
    pairs_per_window = window_size - 1
    perplexities = np.zeros(num_windows)
    gc_percentages = np.zeros(num_windows)

    # All overlapping dinucleotides of the sequence, built once and sliced
    # per window below.
    dinucleotides = [seq[i:i + 2] for i in range(seq_len - 1)]

    for i in range(num_windows):
        # Frequency distribution of the dinucleotides inside this window.
        counts = Counter(dinucleotides[i:i + pairs_per_window]).values()
        probabilities = np.array(list(counts)) / pairs_per_window

        # Only observed dinucleotides are present, so every probability is
        # strictly positive and log2 is well defined.
        entropy = -np.sum(probabilities * np.log2(probabilities))
        perplexities[i] = 2 ** entropy

        window = seq[i:i + window_size]
        gc_count = window.count('G') + window.count('C')
        gc_percentages[i] = (gc_count / window_size) * 100

    return perplexities, gc_percentages

# Function to save results to a text file
def save_results_to_text(filename, x_values, avg_perplexity, avg_gc_content):
    """Write per-position averages to a tab-separated text file.

    The output path is ``filename`` with its extension removed and
    ``_results.txt`` appended. The file starts with a header line and then
    holds one row per position: the position as an integer, the perplexity
    with four decimals and the GC content with two decimals.

    Args:
        filename: Path of the input file the results belong to.
        x_values: Position values, one per row.
        avg_perplexity: Perplexity value for each position.
        avg_gc_content: GC percentage for each position.
    """
    stem, _extension = os.path.splitext(filename)
    output_filename = stem + "_results.txt"

    # One row per position, three columns in the order announced by the header.
    table = np.column_stack((x_values, avg_perplexity, avg_gc_content))
    np.savetxt(
        output_filename,
        table,
        fmt="%d\t%.4f\t%.2f",
        header="Position\tPerplexity\tGC Content (%)",
        comments="",  # write the header verbatim, without a leading '# '
    )
    print(f"Results saved to {output_filename}")
# Function to process multiple .txt files and save results
def process_txt_files_6001(pattern, window_size=10):
    """Average sliding-window perplexity and GC content for each matching file.

    For every file matching ``pattern``, the sequences of length 6001 are
    read, the per-window dinucleotide perplexity and GC percentage are
    computed for each sequence, and the values are averaged position by
    position across the sequences of that file. The averages are written
    next to the input file by ``save_results_to_text``. The total elapsed
    time is printed at the end.

    Args:
        pattern: Glob pattern selecting the input files.
        window_size: Sliding-window width in characters. Defaults to 10.

    Raises:
        ValueError: If ``window_size`` is not between 1 and 6001.
    """
    sequence_length = 6001  # the only length read_fasta_sequences_6001 keeps
    if not 1 <= window_size <= sequence_length:
        raise ValueError(
            f"window_size must be between 1 and {sequence_length}, got {window_size}"
        )

    # glob returns matches in arbitrary order; sort for reproducible runs.
    files = sorted(glob.glob(pattern))
    if not files:
        print(f"No files found matching pattern: {pattern}")
        return

    # Every kept sequence has the same length, so the number of windows and
    # the position axis are identical for all files.
    num_positions = sequence_length - window_size + 1
    x_values = np.arange(1, num_positions + 1)

    start_time = time.time()
    for file in files:
        sequences = read_fasta_sequences_6001(file)
        if not sequences:
            print(f"No valid sequences of length 6001 found in {file}.")
            continue

        # Running per-position sums across all sequences of this file.
        total_perplexities = np.zeros(num_positions)
        total_gc_content = np.zeros(num_positions)
        for sequence in sequences:
            perplexities, gc_content = calculate_perplexity_and_gc(sequence, window_size)
            total_perplexities += perplexities
            total_gc_content += gc_content

        # Turn the sums into positional averages.
        sequence_count = len(sequences)
        avg_perplexity = total_perplexities / sequence_count
        avg_gc_content = total_gc_content / sequence_count

        save_results_to_text(file, x_values, avg_perplexity, avg_gc_content)

    elapsed_time = time.time() - start_time
    minutes, seconds = divmod(elapsed_time, 60)
    print(f"Time taken: {int(minutes)} minutes and {seconds:.2f} seconds")
# Run the analysis on every file matching "*.txt" (relative to the working directory)
process_txt_files_6001(pattern="*.txt")

Results saved to p.falciparum_cleaned_results.txt
Results saved to r.norvegicus_cleaned_results.txt
Results saved to s.cerevisiae_cleaned_results.txt
Results saved to s.pombe_cleaned_results.txt
Results saved to z.mays_cleaned_results.txt
Time taken: 129 minutes and 40.83 seconds
