In [1]:
import os
import glob

def bpseq_to_ct_seq(bpseq_file):
    # Read bpseq file
    with open(bpseq_file, 'r') as f:
        lines = f.readlines()
    
    # Extract sequence and pairing information
    sequence = ''
    pairing = {}
    for line in lines:
        parts = line.strip().split()
        if len(parts) == 3:
            index, nucleotide, paired_index = parts
            sequence += nucleotide
            pairing[int(index)] = int(paired_index)
    
    # Create ct file content
    ct_content = f"{len(sequence)}\t{os.path.basename(bpseq_file)}\n"
    for i, nucleotide in enumerate(sequence, 1):
        next_index = i + 1 if i < len(sequence) else 0
        paired_index = pairing.get(i, 0)
        ct_content += f"{i}\t{nucleotide}\t{i-1}\t{next_index}\t{paired_index}\t{i}\n"
    
    # Write ct file
    ct_file = bpseq_file.rsplit('.', 1)[0] + '.ct'
    with open(ct_file, 'w') as f:
        f.write(ct_content)
    
    # Write seq file
    seq_file = bpseq_file.rsplit('.', 1)[0] + '.seq'
    with open(seq_file, 'w') as f:
        f.write(sequence)

def process_folder(folder_path):
    # Process all bpseq files in the folder
    for bpseq_file in glob.glob(os.path.join(folder_path, '*.bpseq')):
        bpseq_to_ct_seq(bpseq_file)
        print(f"Processed: {bpseq_file}")

# Usage
folder_path = '/home/ke/Documents/RNA_parser/RNA_parser/data/TAB'  # Replace with your folder path
process_folder(folder_path)

Processed: /home/ke/Documents/RNA_parser/RNA_parser/data/TAB/0.bpseq


In [5]:
import os
import glob

def bpseq_to_ct_seq(bpseq_file, output_folder):
    # Read bpseq file
    with open(bpseq_file, 'r') as f:
        lines = f.readlines()
    
    # Extract sequence and pairing information
    sequence = ''
    pairing = {}
    for line in lines:
        parts = line.strip().split()
        if len(parts) == 3:
            index, nucleotide, paired_index = parts
            sequence += nucleotide
            pairing[int(index)] = int(paired_index)
    
    # Create ct file content
    ct_content = f"{len(sequence)}\t{os.path.basename(bpseq_file)}\n"
    for i, nucleotide in enumerate(sequence, 1):
        next_index = i + 1 if i < len(sequence) else 0
        paired_index = pairing.get(i, 0)
        ct_content += f"{i}\t{nucleotide}\t{i-1}\t{next_index}\t{paired_index}\t{i}\n"
    
    # Write ct file
    base_name = os.path.splitext(os.path.basename(bpseq_file))[0]
    ct_file = os.path.join(output_folder, base_name + '.ct')
    with open(ct_file, 'w') as f:
        f.write(ct_content)
    
    # Write seq file
    seq_file = os.path.join(output_folder, base_name + '.seq')
    with open(seq_file, 'w') as f:
        f.write(sequence)

def process_folder(input_folder, output_folder):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Process all bpseq files in the input folder
    for bpseq_file in glob.glob(os.path.join(input_folder, '*.bpseq')):
        bpseq_to_ct_seq(bpseq_file, output_folder)
        print(f"Processed: {bpseq_file}")

# Usage
input_folder = '/home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA'  # Replace with your input folder path
output_folder = '/home/ke/Documents/RNA_parser/RNA_parser/data/TAB/TestSetA'  # Replace with your desired output folder path
process_folder(input_folder, output_folder)

Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/535.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/367.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/170.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/181.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/362.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/391.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/400.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/259.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/203.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/248.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/380.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/342.bpseq
Processed: /home/ke/Documents/RNA_biaff/mxfold2-data/data/TestSetA/244.bpseq

In [9]:
import os

def count_bpseq_files(directory):
    count = 0
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.seq'):
                count += 1
    return count

# 使用示例
directory_path = '/home/ke/Documents/RNA/archiveII/'  # 替换为你想要搜索的目录路径
result = count_bpseq_files(directory_path)
print(f"在 {directory_path} 及其子目录中共有 {result} 个bpseq文件。")

在 /home/ke/Documents/RNA/archiveII/ 及其子目录中共有 3992 个bpseq文件。


In [None]:
import os
import re
import shutil

# 定义输入文件和目标目录
INPUT_FILE = "/home/ke/Documents/RNA_parser/RNA_parser/data/stra/rnastralign_test_no_redundant.seq"  # 替换为你的输入文件路径
TARGET_DIR = "path/to/your/target_directory"  # 替换为你的目标目录路径

def extract_file_paths(content):
    # 使用正则表达式匹配 ">" 后面的文件路径
    pattern = r'>([^\n]+)'
    matches = re.findall(pattern, content)
    return matches

def copy_files(file_paths, target_dir):
    for file_path in file_paths:
        # 确保源文件存在
        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found")
            continue

        # 获取文件名和目录
        filename = os.path.basename(file_path)
        dirname = os.path.dirname(file_path)

        # 在目标目录中创建相同的目录结构
        relative_path = os.path.relpath(dirname, start=os.path.commonprefix([dirname, target_dir]))
        new_dir = os.path.join(target_dir, relative_path)
        os.makedirs(new_dir, exist_ok=True)

        # 复制文件
        shutil.copy2(file_path, new_dir)
        print(f"Copied: {file_path} to {new_dir}")

        # 检查并复制对应的 .seq 文件
        seq_file_path = os.path.splitext(file_path)[0] + '.seq'
        if os.path.exists(seq_file_path):
            shutil.copy2(seq_file_path, new_dir)
            print(f"Copied: {seq_file_path} to {new_dir}")
        else:
            print(f"Note: {seq_file_path} not found")

# 读取输入文件
with open(INPUT_FILE, 'r') as f:
    content = f.read()

# 提取文件路径
file_paths = extract_file_paths(content)

# 确保目标目录存在
os.makedirs(TARGET_DIR, exist_ok=True)

# 复制文件
copy_files(file_paths, TARGET_DIR)

print("File organization completed.")

In [10]:
import os
import re
import shutil

# 定义输入文件和目标目录
INPUT_FILE =    "/home/ke/Documents/RNA_parser/RNA_parser/data/stra/rnastralign_test_no_redundant.seq"  # 替换为你的输入文件路径
TARGET_DIR = "/home/ke/Documents/RNA_parser/RNA_parser/data/stra/"  # 替换为你的目标目录路径
PREFIX = "/home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/"  # 添加到每个文件路径前的前缀

def extract_file_paths(content):
    # 使用正则表达式匹配 ">" 后面的文件路径
    pattern = r'>([^\n]+)'
    matches = re.findall(pattern, content)
    # 在每个匹配的路径前添加前缀
    return [os.path.join(PREFIX, match) for match in matches]

def copy_files(file_paths, target_dir):
    for file_path in file_paths:
        # 确保源文件存在
        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found")
            continue

        # 获取文件名和目录
        filename = os.path.basename(file_path)
        dirname = os.path.dirname(file_path)

        # 在目标目录中创建相同的目录结构
        relative_path = os.path.relpath(dirname, start=os.path.commonprefix([dirname, target_dir]))
        new_dir = os.path.join(target_dir, relative_path)
        os.makedirs(new_dir, exist_ok=True)

        # 复制文件
        shutil.copy2(file_path, new_dir)
        print(f"Copied: {file_path} to {new_dir}")

        # 检查并复制对应的 .seq 文件
        seq_file_path = os.path.splitext(file_path)[0] + '.seq'
        if os.path.exists(seq_file_path):
            shutil.copy2(seq_file_path, new_dir)
            print(f"Copied: {seq_file_path} to {new_dir}")
        else:
            print(f"Note: {seq_file_path} not found")

# 读取输入文件
with open(INPUT_FILE, 'r') as f:
    content = f.read()

# 提取文件路径
file_paths = extract_file_paths(content)

# 确保目标目录存在
os.makedirs(TARGET_DIR, exist_ok=True)

# 复制文件
copy_files(file_paths, TARGET_DIR)

print("File organization completed.")

Copied: /home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/./RNAStrAlign/16S_rRNA_database/Actinobacteria/DQ645600.ct to /home/ke/Documents/RNA_parser/RNA_parser/data/stra/e2efold_dataset_download/raw_data/RNAStralign/RNAStrAlign/16S_rRNA_database/Actinobacteria
Copied: /home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/./RNAStrAlign/16S_rRNA_database/Actinobacteria/DQ645600.seq to /home/ke/Documents/RNA_parser/RNA_parser/data/stra/e2efold_dataset_download/raw_data/RNAStralign/RNAStrAlign/16S_rRNA_database/Actinobacteria
Copied: /home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/./RNAStrAlign/tRNA_database/tdbD00006626.ct to /home/ke/Documents/RNA_parser/RNA_parser/data/stra/e2efold_dataset_download/raw_data/RNAStralign/RNAStrAlign/tRNA_database
Copied: /home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/./RNAStrAlign/tRNA_database/tdbD00006626.seq to /home/ke/Documents/RNA_parser/RNA_parser/data/stra/e2efold_datase

In [17]:
import os
import re
import shutil


# 定义输入文件和目标目录
INPUT_FILE =    "/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/rnastralign_test_no_redundant.seq"  # 替换为你的输入文件路径
TARGET_DIR = "/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/test/ct_seq/"  # 替换为你的目标目录路径
PREFIX = "/home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/"  # 添加到每个文件路径前的前缀

def extract_file_paths(content):
    # 使用正则表达式匹配 ">" 后面的文件路径
    pattern = r'>([^\n]+)'
    matches = re.findall(pattern, content)
    # 在每个匹配的路径前添加前缀
    return [os.path.join(PREFIX, match) for match in matches]

def copy_files(file_paths, target_dir):
    for file_path in file_paths:
        # 确保源文件存在
        if not os.path.exists(file_path):
            print(f"Warning: {file_path} not found")
            continue

        # 获取文件名
        filename = os.path.basename(file_path)

        # 复制 .ct 文件
        shutil.copy2(file_path, target_dir)
        print(f"Copied: {file_path} to {target_dir}")

        # 检查并复制对应的 .seq 文件
        seq_file_path = os.path.splitext(file_path)[0] + '.seq'
        if os.path.exists(seq_file_path):
            shutil.copy2(seq_file_path, target_dir)
            print(f"Copied: {seq_file_path} to {target_dir}")
        else:
            print(f"Note: {seq_file_path} not found")

# 读取输入文件
with open(INPUT_FILE, 'r') as f:
    content = f.read()

# 提取文件路径
file_paths = extract_file_paths(content)

# 确保目标目录存在
os.makedirs(TARGET_DIR, exist_ok=True)

# 复制文件
copy_files(file_paths, TARGET_DIR)

print("File organization completed.")

Copied: /home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/./RNAStrAlign/16S_rRNA_database/Actinobacteria/DQ645600.ct to /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/test/ct_seq/
Copied: /home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/./RNAStrAlign/16S_rRNA_database/Actinobacteria/DQ645600.seq to /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/test/ct_seq/
Copied: /home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/./RNAStrAlign/tRNA_database/tdbD00006626.ct to /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/test/ct_seq/
Copied: /home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/./RNAStrAlign/tRNA_database/tdbD00006626.seq to /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/test/ct_seq/
Copied: /home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/./RNAStrAlign/5S_rRNA_database/Bacteria/B03696.ct to /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/

In [20]:
import os
import glob
import shutil

def process_folder(folder_path):
    # Get all .seq files in the folder
    seq_files = glob.glob(os.path.join(folder_path, "*.seq"))

    for seq_file in seq_files:
        # Get the base name of the seq file (without extension)
        base_name = os.path.splitext(os.path.basename(seq_file))[0]
        
        # Find all corresponding .ct files
        ct_files = glob.glob(os.path.join(folder_path, f"{base_name}*.ct"))
        
        if len(ct_files) > 1:
            print(f"Multiple CT files found for {seq_file}:")
            for ct_file in ct_files:
                # Get the suffix of the ct file (e.g., '_p1', '_p10')
                ct_basename = os.path.basename(ct_file)
                ct_suffix = ct_basename[len(base_name):-3]  # remove .ct extension
                
                # Create a new seq file name with the same suffix
                new_seq_filename = f"{base_name}{ct_suffix}.seq"
                new_seq_file = os.path.join(folder_path, new_seq_filename)
                
                # Copy the original seq file to the new name
                shutil.copy2(seq_file, new_seq_file)
                print(f"  Created: {new_seq_filename}")
            
            # Optionally, remove the original seq file
            # os.remove(seq_file)
            # print(f"  Removed original: {os.path.basename(seq_file)}")
        else:
            print(f"No multiple CT files found for {os.path.basename(seq_file)}")

# Usage
folder_path = "/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq"
process_folder(folder_path)

No multiple CT files found for B01601.seq
No multiple CT files found for AJ781723.seq
No multiple CT files found for B00284.seq
No multiple CT files found for P00385.seq
No multiple CT files found for envi.sequ._TRW-259291.seq
No multiple CT files found for Cho_sp-3.S1046.seq
No multiple CT files found for tdbD00007006.seq
No multiple CT files found for E02337.seq
No multiple CT files found for AY436807.seq
No multiple CT files found for tdbD00001496.seq
No multiple CT files found for AJ841838.seq
No multiple CT files found for CP000061.seq
No multiple CT files found for X60632.seq
No multiple CT files found for B02294.seq
No multiple CT files found for DQ663152.seq
No multiple CT files found for AB249926.seq
No multiple CT files found for tdbD00009542.seq
No multiple CT files found for DQ834173.seq
No multiple CT files found for E01901.seq
No multiple CT files found for AF409022.seq
No multiple CT files found for DQ170922.seq
No multiple CT files found for AY461758.seq
No multiple CT 

SameFileError: '/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/Medi.trun._AC136507.seq' and '/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/Medi.trun._AC136507.seq' are the same file

In [21]:
import os
import glob

def read_seq_file(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
        if len(lines) < 3:
            raise ValueError(f"SEQ file {filename} has insufficient lines")
        seq_name = lines[1].strip()
        sequence = lines[2].strip()
    return seq_name, sequence

def read_ct_file(filename):
    with open(filename, 'r') as f:
        return f.readlines()

def write_corrected_ct_file(filename, corrected_lines):
    with open(filename, 'w') as f:
        f.writelines(corrected_lines)

def correct_ct_file(seq_filename, ct_filename):
    try:
        # Read the sequence file
        seq_name, sequence = read_seq_file(seq_filename)

        # Read the CT file
        ct_lines = read_ct_file(ct_filename)

        # Prepare the corrected lines
        corrected_lines = [f"{len(sequence)}\tENERGY = 0\t{seq_name}\n"]

        ct_index = 1
        for seq_index, base in enumerate(sequence, start=1):
            if base == 'N':
                # Insert a new line for 'N' with connections set to 0
                new_line = f"{seq_index}\tN\t{seq_index-1}\t{seq_index+1}\t0\t{seq_index}\n"
                corrected_lines.append(new_line)
            else:
                # Check if we have a corresponding line in the CT file
                if ct_index < len(ct_lines):
                    ct_line = ct_lines[ct_index].split()
                    if len(ct_line) >= 6:
                        ct_line[0] = str(seq_index)
                        ct_line[2] = str(seq_index - 1)
                        ct_line[3] = str(seq_index + 1 if seq_index < len(sequence) else 0)
                        ct_line[5] = str(seq_index)
                        corrected_lines.append('\t'.join(ct_line) + '\n')
                    else:
                        raise ValueError(f"CT file line {ct_index} has insufficient columns")
                    ct_index += 1
                else:
                    # If CT file is shorter than sequence, add a new line
                    new_line = f"{seq_index}\t{base}\t{seq_index-1}\t{seq_index+1}\t0\t{seq_index}\n"
                    corrected_lines.append(new_line)

        # Write the corrected CT file
        write_corrected_ct_file(ct_filename, corrected_lines)
        print(f"Corrected CT file: {ct_filename}")
    except Exception as e:
        print(f"Error processing {seq_filename} and {ct_filename}: {str(e)}")

def process_folder(folder_path):
    # Get all .seq files in the folder
    seq_files = glob.glob(os.path.join(folder_path, "*.seq"))

    for seq_file in seq_files:
        # Construct the corresponding .ct filename
        base_name = os.path.splitext(os.path.basename(seq_file))[0]
        ct_files = glob.glob(os.path.join(folder_path, f"{base_name}*.ct"))

        if not ct_files:
            print(f"No CT file found for {seq_file}")
        else:
            for ct_file in ct_files:
                correct_ct_file(seq_file, ct_file)

# Usage example
folder_path = "/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq"
process_folder(folder_path)

Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/B01601.ct
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/AJ781723.ct
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/B00284.ct
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/P00385.ct
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/envi.sequ._TRW-259291.ct
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/Cho_sp-3.S1046.ct
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/tdbD00007006.ct
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/E02337.ct
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/AY436807.ct
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RN

KeyboardInterrupt: 

In [26]:
import os
import glob
import re

def process_seq_file(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
        if len(lines) < 3:
            raise ValueError(f"SEQ file {filename} has insufficient lines")
        
    sequence = re.sub(r'\d', '', lines[-1].strip())  # Remove all digits from the last line
    
    # Write the modified content back to the file
    with open(filename, 'w') as f:
        f.writelines(lines[:2])  # Write the first two lines unchanged
        f.write(sequence + '\n')  # Write the modified sequence
    
    return sequence

def read_ct_file(filename):
    with open(filename, 'r') as f:
        return f.readlines()

def write_corrected_ct_file(filename, corrected_lines):
    with open(filename, 'w') as f:
        f.writelines(corrected_lines)

def correct_ct_file(seq_filename, ct_filename):
    try:
        # Process the sequence file and get the modified sequence
        sequence = process_seq_file(seq_filename)

        # Read the CT file
        ct_lines = read_ct_file(ct_filename)

        # Prepare the corrected lines
        corrected_lines = [f"{len(sequence)}\t{ct_lines[0].split(maxsplit=2)[2]}"]  # Keep the original header

        for seq_index, base in enumerate(sequence, start=1):
            if seq_index < len(ct_lines):
                ct_line = ct_lines[seq_index].split()
                if len(ct_line) >= 6:
                    ct_line[0] = str(seq_index)
                    ct_line[1] = base
                    ct_line[2] = str(seq_index - 1)
                    ct_line[3] = str(seq_index + 1 if seq_index < len(sequence) else 0)
                    ct_line[5] = str(seq_index)
                    corrected_lines.append('\t'.join(ct_line) + '\n')
                else:
                    raise ValueError(f"CT file line {seq_index} has insufficient columns")
            else:
                # If CT file is shorter than sequence, add a new line
                new_line = f"{seq_index}\t{base}\t{seq_index-1}\t{seq_index+1}\t0\t{seq_index}\n"
                corrected_lines.append(new_line)

        # Write the corrected CT file
        write_corrected_ct_file(ct_filename, corrected_lines)
        print(f"Processed SEQ file: {seq_filename}")
        print(f"Corrected CT file: {ct_filename}")
    except Exception as e:
        print(f"Error processing {seq_filename} and {ct_filename}: {str(e)}")

def process_folder(folder_path):
    # Get all .seq files in the folder
    seq_files = glob.glob(os.path.join(folder_path, "*.seq"))

    for seq_file in seq_files:
        # Construct the corresponding .ct filename
        base_name = os.path.splitext(os.path.basename(seq_file))[0]
        ct_files = glob.glob(os.path.join(folder_path, f"{base_name}*.ct"))

        if not ct_files:
            print(f"No CT file found for {seq_file}")
        else:
            for ct_file in ct_files:
                correct_ct_file(seq_file, ct_file)

# Usage example
folder_path = "/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq"
process_folder(folder_path)

Processed SEQ file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/B01601.seq
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/B01601.ct
Processed SEQ file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/AJ781723.seq
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/AJ781723.ct
Processed SEQ file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/B00284.seq
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/B00284.ct
Processed SEQ file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/P00385.seq
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/P00385.ct
Processed SEQ file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/envi.sequ._TRW-259291.seq
Corrected CT file: /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstr

In [27]:
import os

def extract_rna_sequence(file_path):
    sequence = ""
    try:
        with open(file_path, 'r') as file:
            # Skip the header line
            next(file)
            for line in file:
                parts = line.split()
                if len(parts) >= 2:
                    sequence += parts[1]
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    
    return sequence

# 假设 CT 文件名为 'input.ct' 并位于脚本所在的同一目录

file_path = '/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/Ceh.S1506-14.ct'

rna_sequence = extract_rna_sequence(file_path)

if rna_sequence:
    print(f"Extracted RNA sequence: {rna_sequence}")
    print(f"Sequence length: {len(rna_sequence)}")
else:
    print("Failed to extract RNA sequence.")

Extracted RNA sequence: AAGGUUGCUUGUCGUGCACAAACCCUUCCGCGGGGCUCACGACACGGAAGCCUGAGCGGCCCCUUUGCGGGGUGGGUUUCGAUACUACGAAGCCGUAACGCUAGUCCUUGGAUCCGUUCCAGGGCGACACCGUCCAAUUGCGGGGAGUUCUUGAGAGCCUUUGCUACCAAGCGACGGGGCAGUAAUCGAGCAAUGCUCGAACACAGCCCCUCGUGGCCGAGCUCGCCAUUUCGAAAUGCGAGUGAGGGUAUGGUGAUAAUGCACUGGCUUUAAGCAAGAUCAUUCGCAGCCAACUCGAAUUUCAACCAUUCGAGGCAGUUCACAGACUGCACGGCGGUGGGUUGCUGCGCCUUGUGCGUUGCGGCCUAAGAUACAGUCGGCGAGUGAGGGAAACCUUUCUCGAGCAUAUGCCAAAGCAGAGCUUCACCGUUCCGUAGGU
Sequence length: 439


In [47]:
import os

def extract_rna_sequence_from_ct(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    
    sequence = ''
    for line in lines[1:]:  # Skip the header line
        parts = line.split()
        if len(parts) >= 2:
            sequence += parts[1]
    return sequence

def read_seq_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        if len(lines) < 3:
            raise ValueError(f"SEQ file {file_path} has insufficient lines")
        return lines[2].strip()  # Return the third line (index 2)

# Paths to your files
ct_file_path =  '/home/ke/Documents/RNA_parser/RNA_parser/data/test/ct_seq/UNP00465.ct'
seq_file_path = '/home/ke/Documents/RNA_parser/RNA_parser/data/test/ct_seq/UNP00465.seq'

# Extract RNA sequence from CT file
ct_sequence = extract_rna_sequence_from_ct(ct_file_path)
print("RNA sequence extracted from CT file:")
print(ct_sequence)
print(f"CT sequence length: {len(ct_sequence)}")

# Read SEQ file content
seq_content = read_seq_file(seq_file_path)
print("\nContent from SEQ file:")
print(seq_content)
print(f"SEQ content length: {len(seq_content)}")

# Compare the sequences
if ct_sequence == seq_content:
    print("\nThe sequences from CT and SEQ files are identical.")
else:
    print("\nThe sequences from CT and SEQ files are different.")
    # You might want to print the differences here

RNA sequence extracted from CT file:
AACGGAGAGUUNGAUCCUGGCUCAGGACGAACGCUNGCGGCGUGCUUAACACAUGCAAGUCGGACGGUNAGGCCUUUNNGGGGGUNCUCGAGUGGCGAACGGGUGAGUAACACGUGAGGAACGUGCCCUUGACUUCGGGAUAGCUCCAGGAAACUGGUGGUAAUCCCGAAUAUGAGCCUGGCCUGCAUNGGUCGGGUUGGAAAGCUUUAUGCGGUNAGGGAUCGUCUCGCGGCCUAUCAGCUNGUUGGUGGGGUAAUGGCCUACCAAGGCAGCGACGGGUAGCCGGCCUGAGAGGGUGACCGGCCACAUUGGGACUGAGAUACGGCNCAGACUCCUACGGGAGGCAGCAGUGNGGAAUAUUGCACAAUGGGCGCAAGCCNGAUGCAGCNACGCCGCGUGCGGGAUGACGGCCUUCGGGUUGUGAACCGCUUUCAUCCAUGACGAAGCGCAAGUNACGGUAGUNGGAGAAGAAGCACCGGCUAACUACGUGCCAGCAGCCGCGGUGAUACGUAGGGUGCGAGCGUUGUCCGGAAUUAUUGGGCGUAAAGAGCUNGUAGGCGGUUGAUCACGUCGGAAGUCAAAUUCCAGGGCUUAACUCUGGGCUUGCUUUCGAUACGGGUUGACUUGAGGAAUGUAGGGGAGAAUGGAACUCUCGGUGGAGCGGUGGAAUGCGCAGAUAUCGGGAAGAACACCAGUGGCGAAGGCGGUUCUCUGGACAUUUCCUGACGCUGAGAAGCGAAAGCGUGGGGAGCAAACAGGCUUAGAUACCCUGGUAGUCCACGCCGUAAACGGUGGGUACUAGGUGUNGGUCCCUUCCACGGGGUCCGUGCCGUAGCUNACGCAUUAAGUACCCCGCCUGGGGAGUACGGCCGCAAGGCUAAAACUCAAAGGAAUUGACGGGGGCCCGCACAAGCGGCGGAGCAUGCGGAUUAAUUCGAUGNAACGCGAAGAACCUUACCUGGG

In [29]:
import os

def extract_rna_sequence(ct_file_path):
    sequence = ""
    try:
        with open(ct_file_path, 'r') as file:
            next(file)  # Skip the header line
            for line in file:
                parts = line.split()
                if len(parts) >= 2:
                    sequence += parts[1]
    except FileNotFoundError:
        print(f"Error: CT file not found at {ct_file_path}")
    except Exception as e:
        print(f"An error occurred while reading CT file: {str(e)}")
    return sequence

def process_seq_file(seq_file_path, new_sequence):
    try:
        with open(seq_file_path, 'r') as file:
            lines = file.readlines()
        
        if len(lines) >= 3:
            # Replace the third line with the new sequence
            lines[2] = new_sequence + '\n'
        else:
            print(f"Warning: SEQ file {seq_file_path} has less than 3 lines")
            return False

        with open(seq_file_path, 'w') as file:
            file.writelines(lines)
        return True
    except FileNotFoundError:
        print(f"Error: SEQ file not found at {seq_file_path}")
    except Exception as e:
        print(f"An error occurred while processing SEQ file: {str(e)}")
    return False

def main(ct_file_path, seq_file_path):
    # Extract sequence from CT file
    rna_sequence = extract_rna_sequence(ct_file_path)
    if not rna_sequence:
        print("Failed to extract RNA sequence from CT file.")
        return

    # Replace sequence in SEQ file
    if process_seq_file(seq_file_path, rna_sequence):
        print(f"Successfully replaced sequence in {seq_file_path}")
        print(f"New sequence length: {len(rna_sequence)}")
    else:
        print(f"Failed to replace sequence in {seq_file_path}")

# Usage
# ct_file_path = 'path/to/your/ct/file.ct'
# seq_file_path = 'path/to/your/seq/file.seq'
main(ct_file_path, seq_file_path)

Successfully replaced sequence in /home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq/Ceh.S1506-14.seq
New sequence length: 439


In [33]:
import os
import shutil

def get_seq_length(seq_file_path):
    try:
        with open(seq_file_path, 'r') as file:
            lines = file.readlines()
            if len(lines) >= 3:
                return len(lines[2].strip())
    except Exception as e:
        print(f"Error reading SEQ file {seq_file_path}: {str(e)}")
    return None

def get_ct_length(ct_file_path):
    try:
        with open(ct_file_path, 'r') as file:
            first_line = file.readline().strip()
            return int(first_line.split()[0])
    except Exception as e:
        print(f"Error reading CT file {ct_file_path}: {str(e)}")
    return None

def check_and_move_files(source_folder, destination_folder):
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    for filename in os.listdir(source_folder):
        if filename.endswith('.seq'):
            seq_file = os.path.join(source_folder, filename)
            ct_file = os.path.join(source_folder, filename[:-4] + '.ct')
            
            if not os.path.exists(ct_file):
                print(f"CT file not found for {filename}")
                continue

            seq_length = get_seq_length(seq_file)
            ct_length = get_ct_length(ct_file)

            if seq_length is None or ct_length is None:
                continue

            if seq_length != ct_length:
                print(f"Length mismatch in {filename}: SEQ={seq_length}, CT={ct_length}")
                shutil.move(seq_file, os.path.join(destination_folder, filename))
                shutil.move(ct_file, os.path.join(destination_folder, filename[:-4] + '.ct'))
                print(f"Moved {filename} and its CT file to {destination_folder}")

# Usage
source_folder = '/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/val/ct_seq'
destination_folder = '/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/stralign_delete'

check_and_move_files(source_folder, destination_folder)
print("Process completed.")

Process completed.


In [46]:
import os
import re

def extract_sequence_from_ct(ct_content):
    # Extract the sequence from the CT file content
    lines = ct_content.split('\n')[2:]  # Skip the first two lines
    sequence = ''.join(line.split()[1] for line in lines if line.strip())
    return sequence

def create_seq_file(ct_file_path, sequence):
    # Create a .seq file with the same name as the .ct file
    seq_file_path = os.path.splitext(ct_file_path)[0] + '.seq'
    file_name = os.path.basename(ct_file_path).split('.')[0]
    
    with open(seq_file_path, 'w') as seq_file:
        seq_file.write(f';\n{file_name}\n{sequence}\n')

def process_folder(folder_path):
    print(folder_path)
    for file_name in os.listdir(folder_path):
        print(file_name)
        if file_name.endswith('.ct'):
            print(file_name)
            ct_file_path = os.path.join(folder_path, file_name)
            seq_file_path = os.path.splitext(ct_file_path)[0] + '.seq'
            
            # Check if corresponding .seq file exists
            if not os.path.exists(seq_file_path):
                with open(ct_file_path, 'r') as ct_file:
                    ct_content = ct_file.read()
                
                sequence = extract_sequence_from_ct(ct_content)
                create_seq_file(ct_file_path, sequence)
                print(f"Created {os.path.basename(seq_file_path)}")

folder_path= '/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/train/ct_seq'
process_folder(folder_path)
# print("Processing complete.")

/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/test/ct_seq
E00605.ct
E00605.ct
B03934.ct
B03934.ct
B05382.ct
B05382.ct
D38577.ct
D38577.ct
AB094641.seq
tdbD00009158.seq
AY162132.ct
AY162132.ct
B05070.ct
B05070.ct
tdbD00005613.seq
tdbD00007992.seq
tdbD00010591.ct
tdbD00010591.ct
DQ899126.ct
DQ899126.ct
DQ677551.ct
DQ677551.ct
DQ978254.ct
DQ978254.ct
AB094652.ct
AB094652.ct
B01189.seq
AB184659.ct
AB184659.ct
D37806.ct
D37806.ct
AJ746156.seq
AB242651.seq
B01788.ct
B01788.ct
tdbD00004709.ct
tdbD00004709.ct
B05656.seq
Borr.burg._AE001118.ct
Borr.burg._AE001118.ct
B04558.seq
DQ139343.seq
X92492.ct
X92492.ct
tdbD00005720.ct
tdbD00005720.ct
AB184389.seq
AB184281.ct
AB184281.ct
AY864650.seq
B04496.ct
B04496.ct
DQ012961.ct
DQ012961.ct
AY859683.seq
AJ968631.seq
Mci.L798-1.seq
B03400.seq
X90810.ct
X90810.ct
tdbD00003728.seq
tdbD00008221.seq
tdbD00003879.ct
tdbD00003879.ct
B06287.ct
B06287.ct
Rr368.ct
Rr368.ct
B02322.seq
B00427.ct
B00427.ct
tdbD00004297.seq
AY743699.seq
B04597.ct
B04597.

In [43]:
process_folder(folder_path)

/home/ke/Documents/RNA_parser/RNA_parser/data/RNAstrAlign/val


In [3]:
import os
import shutil

def copy_small_ct_files(source_folder, target_folder, max_size_kb=1):
    # 创建目标文件夹，如果不存在则创建
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    # 使用 os.walk 递归遍历所有子文件夹
    for root, dirs, files in os.walk(source_folder):
        for filename in files:
            # 检查是否为 .ct 文件
            if filename.endswith('.ct'):
                file_path = os.path.join(root, filename)
                # 获取文件大小（以KB为单位）
                file_size_kb = os.path.getsize(file_path) / 1024  # 转换为KB
                # 如果文件大小小于指定值，将其复制到目标文件夹
                if file_size_kb < max_size_kb:
                    shutil.copy(file_path, target_folder)
                    print(f"文件 {filename} 已复制到 {target_folder}")

# 示例调用
source_folder = "/home/ke/Documents/RNA/e2efold_dataset_download/raw_data/RNAStralign/RNAStrAlign/"  # 替换为你的源文件夹路径
target_folder = "/home/ke/Documents/RNA_parser/RNA_parser/data/vis"  # 替换为你的目标文件夹路径
copy_small_ct_files(source_folder, target_folder)


文件 Lact.sali._CP000233.ct 已复制到 /home/ke/Documents/RNA_parser/RNA_parser/data/vis
文件 Baci.amyl._CP000560.ct 已复制到 /home/ke/Documents/RNA_parser/RNA_parser/data/vis
文件 List.wels._AM263198.ct 已复制到 /home/ke/Documents/RNA_parser/RNA_parser/data/vis


In [7]:
def check_pseudoknot(structure):
    stack = []
    for i, char in enumerate(structure):
        if char == '(':
            stack.append(i)  # 左括号入栈，记录位置
        elif char == ')':
            if stack:
                opening_index = stack.pop()
                # 检查是否有交叉：判断是否有一个右括号的匹配落在另一个未关闭的左括号之前
                for j in stack:
                    if j > opening_index:  # 如果左括号还未关闭，且在该右括号后面，说明有假结
                        return True
    return False

def find_pseudoknot_structures_from_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()  # 读取文件中的每一行
        for i, line in enumerate(lines):
            structure = line.strip()  # 去除行尾的换行符或空格
            if check_pseudoknot(structure):
                print(f"第 {i+1} 行有假结：{structure}")
            else:
                print(f"第 {i+1} 行没有假结：{structure}")

# 文件路径
file_path = "/home/ke/Documents/RNA_parser/RNA_parser/STRoutput/Roberta_experiment/AR/gold.txt"  # 将此路径替换为你的txt文件路径

# 运行检查函数
find_pseudoknot_structures_from_file(file_path)


第 1 行没有假结：........(.(((.....(.((...(((.......(((.(((((((((((((.((((....))))....))))))))..)))))))).......((((.(((((...(((((((......)))))))....)))))))))................................................................................................................................((((((..(...((((..((..((((((((...(((......))).......))))))))..)).......((....))...))))..)..)).))))...)))...)).)....((((((....(...((((.........))))...).))))))..........((((((.(((..(.((((((.(((((....))))))))))).)..)))...((..))))...)))....))).).)..(((.....((((....))))....)))..
