In [1]:
import os
import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

def process_seq_file(filename):
    """Strip digits from the sequence line of a SEQ file, rewriting it in place.

    The file is rewritten as its first two lines (unchanged) followed by its
    last non-blank line with every digit removed. Lines between the second
    line and that last line are not written back.

    Trailing blank lines are ignored when locating the sequence line, so a
    file that merely ends with extra newlines is not rewritten with an empty
    sequence.

    Args:
        filename: Path of the SEQ file to read and overwrite.

    Returns:
        The cleaned sequence: the last non-blank line, stripped of
        surrounding whitespace and of all digits.

    Raises:
        ValueError: If fewer than three lines remain once trailing blank
            lines are discarded.
    """
    with open(filename, 'r') as f:
        lines = f.readlines()

    # Discard whitespace-only lines at the end of the file; otherwise the
    # "last line" would be blank and the real sequence would be lost on rewrite.
    while lines and not lines[-1].strip():
        lines.pop()

    if len(lines) < 3:
        raise ValueError(f"SEQ file {filename} has insufficient lines")

    # Remove all digits from the sequence line.
    sequence = re.sub(r'\d', '', lines[-1].strip())

    with open(filename, 'w') as f:
        f.writelines(lines[:2])  # First two lines are kept verbatim.
        f.write(sequence + '\n')

    return sequence

def correct_ct_file(seq_filename, ct_filename):
    """Rewrite a CT file so its rows line up with the cleaned SEQ sequence.

    The SEQ file is first cleaned (and rewritten) by ``process_seq_file``.
    The CT file is then rewritten as:

    * a header made of the sequence length, a tab, and everything after the
      second whitespace-separated field of the original header line;
    * one row per base of the sequence. Where the CT file already has a row
      at that position, columns 1, 2, 3, 4 and 6 are overwritten with the
      index, the base, index - 1, index + 1 (0 for the last base) and the
      index again; column 5 and any columns past the sixth are kept as read.
      Where the CT file has no row, a new row is written with 0 in column 5.

    CT rows beyond the sequence length are dropped.

    Args:
        seq_filename: Path of the SEQ file supplying the sequence.
        ct_filename: Path of the CT file to rewrite in place.

    Returns:
        A "Processed: ..." message on success, or an "Error processing ..."
        message containing the exception text on failure. Exceptions are
        reported through the return value and never propagate to the caller.
    """
    try:
        sequence = process_seq_file(seq_filename)

        with open(ct_filename, 'r') as f:
            ct_lines = f.readlines()

        if not ct_lines:
            raise ValueError(f"CT file {ct_filename} is empty")

        # The first two header fields are discarded and replaced by the new
        # length; the remainder of the line is carried over.
        header_fields = ct_lines[0].split(maxsplit=2)
        if len(header_fields) < 3:
            raise ValueError(
                f"CT file {ct_filename} has a malformed header line"
            )
        header_rest = header_fields[2]
        if not header_rest.endswith('\n'):
            # A header without a line ending would otherwise be fused with
            # the first data row.
            header_rest += '\n'

        corrected_lines = [f"{len(sequence)}\t{header_rest}"]

        last_index = len(sequence)
        for seq_index, base in enumerate(sequence, start=1):
            # The last base has no successor, which is written as 0.
            next_index = seq_index + 1 if seq_index < last_index else 0

            if seq_index < len(ct_lines):
                ct_line = ct_lines[seq_index].split()
                if len(ct_line) < 6:
                    raise ValueError(f"CT file line {seq_index} has insufficient columns")
                ct_line[0] = str(seq_index)
                ct_line[1] = base
                ct_line[2] = str(seq_index - 1)
                ct_line[3] = str(next_index)
                ct_line[5] = str(seq_index)
                corrected_lines.append('\t'.join(ct_line) + '\n')
            else:
                # The CT file is shorter than the sequence: synthesize a row
                # with 0 in the fifth column.
                corrected_lines.append(
                    f"{seq_index}\t{base}\t{seq_index - 1}\t{next_index}\t0\t{seq_index}\n"
                )

        with open(ct_filename, 'w') as f:
            f.writelines(corrected_lines)

        return f"Processed: {seq_filename} and {ct_filename}"
    except Exception as e:
        # Worker boundary: report the failure as a message so one bad pair
        # does not abort the whole batch.
        return f"Error processing {seq_filename} and {ct_filename}: {str(e)}"

def _correct_ct_files_for_seq(seq_file, ct_files):
    """Run correct_ct_file for one SEQ file against each of its CT files in turn."""
    return [correct_ct_file(seq_file, ct_file) for ct_file in ct_files]


def process_folder(folder_path):
    """Correct every CT file in a folder against its matching SEQ file.

    For each ``*.seq`` file in ``folder_path``, all ``*.ct`` files whose name
    starts with the SEQ file's base name are corrected with
    ``correct_ct_file``. Work is distributed over a process pool, one task
    per SEQ file; the CT files of a single SEQ file are handled sequentially
    inside that task because each correction rewrites the SEQ file, and
    concurrent rewrites of the same file would race.

    A message is printed for every SEQ file without a CT file and for every
    processed (SEQ, CT) pair.

    Args:
        folder_path: Directory containing the ``.seq`` and ``.ct`` files.
    """
    seq_files = glob.glob(os.path.join(folder_path, "*.seq"))

    with ProcessPoolExecutor() as executor:
        futures = []
        pair_count = 0
        for seq_file in seq_files:
            base_name = os.path.splitext(os.path.basename(seq_file))[0]
            # Escape the base name so characters such as "[" or "?" in a file
            # name are matched literally rather than as glob syntax.
            # NOTE(review): this is a prefix match, so "a.seq" also picks up
            # "ab.ct" — confirm that is intended.
            ct_pattern = os.path.join(folder_path, f"{glob.escape(base_name)}*.ct")
            ct_files = glob.glob(ct_pattern)

            if not ct_files:
                print(f"No CT file found for {seq_file}")
                continue

            futures.append(
                executor.submit(_correct_ct_files_for_seq, seq_file, ct_files)
            )
            pair_count += len(ct_files)

        # Progress is still counted per (SEQ, CT) pair even though tasks are
        # grouped per SEQ file.
        with tqdm(total=pair_count, desc="Processing files") as progress:
            for future in as_completed(futures):
                for message in future.result():
                    print(message)
                    progress.update(1)

if __name__ == "__main__":
    # Entry point: run process_folder on a hard-coded dataset directory.
    # NOTE(review): the path below is an absolute, machine-specific location;
    # adjust it before running on another machine.
    folder_path = "/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq"
    process_folder(folder_path)

KeyboardInterrupt: 