In [3]:
import pandas as pd

# Input CSV files to combine, and the file the combined table is written to.
file1 = 'data_cn.csv'
file2 = 'data_lpb.csv'
output_file = 'data_plus.csv'

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# Stack the rows of both tables; ignore_index renumbers the result from 0.
merged_df = pd.concat([df1, df2], ignore_index=True)

# Write the combined table without the index column.
merged_df.to_csv(output_file, index=False)

# Report the name of the file that was actually written.
print(f"文件合并完成，已保存为 '{output_file}'")


文件合并完成，已保存为 'merged_file.csv'


In [1]:
#对数据进行去重处理，去除八条引物完全相同的数据。

import pandas as pd

# The eight primer columns whose joint contents identify a duplicate row.
columns_to_check = [
    "F1_reverse_sequence",
    "F2_sequence",
    "F3_sequence",
    "B1_sequence",
    "B2_reverse_sequence",
    "B3_reverse_sequence",
    "LB_sequence",
    "LF_reverse_sequence",
]

# Load the merged table and drop every row whose values in all eight primer
# columns repeat those of an earlier row.
df = pd.read_csv('data_plus.csv')
filtered_df = df.drop_duplicates(subset=columns_to_check)

# Save the de-duplicated table and report how many rows remain.
filtered_df.to_csv('7_primer_data_filtered.csv', index=False)
print(f"新文件共有 {filtered_df.shape[0]} 行")


新文件共有 469 行


In [2]:
import pandas as pd

# Read the de-duplicated table.
file_path = '7_primer_data_filtered.csv'
df = pd.read_csv(file_path)

# Length of every ori_sequence entry. The vectorised .str.len() yields NaN for
# a missing value, whereas apply(len) raises TypeError on the float NaN that
# read_csv produces for an empty cell.
df['ori_sequence_length'] = df['ori_sequence'].str.len()

# Overwrite the same file with the extra column added.
df.to_csv(file_path, index=False)

print("长度计算完成，已保存为 '7_primer_data_filtered.csv'")


长度计算完成，已保存为 '7_primer_data_filtered.csv'


In [1]:
# Extract the primer segment (F3 through B3) and compute its length; drop rows whose segment length is 0 or greater than 300.

import pandas as pd
import re

# Load the filtered primer table from CSV.
df = pd.read_csv('7_primer_data_filtered.csv')

# Extract the stretch of ori_sequence that runs from F3 through B3.
def extract_primer_segment(row):
    """Return the part of ``row['ori_sequence']`` spanning F3 to B3, inclusive.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'ori_sequence'``, ``'F3_sequence'`` and
        ``'B3_reverse_sequence'``.

    Returns
    -------
    str or None
        The shortest substring that starts with the first usable occurrence
        of ``F3_sequence`` and ends with the next ``B3_reverse_sequence``
        (both primers included), or ``None`` when no such substring exists
        or when any of the three values is not a string.
    """
    ori_sequence = row['ori_sequence']
    F3_sequence = row['F3_sequence']
    B3_reverse_sequence = row['B3_reverse_sequence']

    # Empty CSV cells come back from read_csv as float NaN; re.escape and
    # re.search raise TypeError on them, so treat such rows as "no match".
    values = (ori_sequence, F3_sequence, B3_reverse_sequence)
    if not all(isinstance(value, str) for value in values):
        return None

    # Primers are escaped so they are matched literally; the lazy group makes
    # the match stop at the first B3 that follows F3.
    pattern = re.escape(F3_sequence) + '(.*?)' + re.escape(B3_reverse_sequence)
    match = re.search(pattern, ori_sequence)

    # group(0) is the whole match, i.e. F3 + the sequence in between + B3.
    return match.group(0) if match else None

# Pull the F3..B3 segment out of every row (None where it cannot be found).
df['primer_segment'] = df.apply(extract_primer_segment, axis=1)

# Segment length per row; rows without a segment count as length 0.
df['primer_segment_length'] = df['primer_segment'].map(
    lambda segment: 0 if not segment else len(segment)
)

# Keep rows whose segment length lies in 1..300 (both ends inclusive), which
# removes the length-0 rows and everything longer than 300.
length_in_range = df['primer_segment_length'].between(1, 300)
df_filtered = df[length_in_range]

# Save the surviving rows to a new CSV file.
df_filtered.to_csv('8_primer_data.csv', index=False)

In [2]:
# 划分数据集

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the length-filtered primer table.
df = pd.read_csv('8_primer_data.csv')

# Two-stage random split. Stage 1 sets aside 10% of all rows as the test set.
train_val, test = train_test_split(df, test_size=0.1, random_state=42)

# Stage 2 takes the fraction 0.1 / 0.9 of the remaining rows (10% of the
# original table) as the validation set; the rest is the training set.
train, val = train_test_split(train_val, test_size=0.1 / 0.9, random_state=42)

# Write each split to its own CSV file.
for split_df, split_path in (
    (train, '9_train_data.csv'),
    (val, '9_val_data.csv'),
    (test, '9_test_data.csv'),
):
    split_df.to_csv(split_path, index=False)

In [1]:
#首先进行数据增强300长度。
# 生成标签序列。
#one-hot编码。

import pandas as pd
import random

# Input CSV files (one per split) and, position by position, the output file
# each one is written to by this cell.
input_files = ["9_train_data.csv", "9_val_data.csv", "9_test_data.csv"]
output_files = ["train_data_300.csv", "val_data_300.csv", "test_data_300.csv"]

# Label digit written into the label sequence for each primer column.
primer_to_label = {
    "F3_sequence": 4,
    "F2_sequence": 3,
    "LF_reverse_sequence": 2,
    "F1_reverse_sequence": 1,
    "B1_sequence": 5,
    "LB_sequence": 6,
    "B2_reverse_sequence": 7,
    "B3_reverse_sequence": 8
}

# Four-character one-hot code for each base.
base_to_one_hot = {
    'A': '1000',
    'T': '0100',
    'C': '0010',
    'G': '0001'
}

# Build the per-position label string for one augmented row.
def generate_label_sequence(row, length=300):
    """Label the positions of ``row['enhanced_sequence']`` covered by primers.

    Every primer column listed in ``primer_to_label`` is looked up in the
    row. When its value is a non-empty string that occurs in
    ``enhanced_sequence``, the positions of its first occurrence receive
    that primer's label. Primers are processed in the order of
    ``primer_to_label``, so a later primer overwrites an earlier one where
    they overlap. Positions not covered by any primer stay 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'enhanced_sequence'`` and support ``.get`` for the
        primer columns.
    length : int, optional
        Number of label positions to emit (default 300). Positions of
        ``enhanced_sequence`` at or beyond ``length`` are ignored instead of
        raising ``IndexError``.

    Returns
    -------
    str
        ``length`` label digits concatenated into one string.
    """
    enhanced_sequence = row['enhanced_sequence']
    label_sequence = [0] * length

    for primer, label in primer_to_label.items():
        primer_sequence = row.get(primer, "")
        # Missing cells are float NaN after read_csv, hence the str check.
        if not (primer_sequence and isinstance(primer_sequence, str)):
            continue

        # One search yields both "is it present" and "where".
        start = enhanced_sequence.find(primer_sequence)
        if start == -1:
            continue

        # Clip to the label length so an over-long sequence cannot index past
        # the end of label_sequence.
        end = min(start + len(primer_sequence), length)
        label_sequence[start:end] = [label] * max(end - start, 0)

    return ''.join(map(str, label_sequence))

# Encode a sequence string as concatenated four-character one-hot codes.
def convert_to_one_hot(sequence):
    """Return the one-hot encoding of ``sequence`` as a single string.

    Each character is replaced by its entry in ``base_to_one_hot``; any
    character without an entry becomes ``'0000'``. A non-string input
    yields the empty string.
    """
    if isinstance(sequence, str):
        return ''.join(base_to_one_hot.get(base, '0000') for base in sequence)
    return ''

def find_all_occurrences(sequence, sub_sequence):
    """Return the start index of every (possibly overlapping) occurrence of
    ``sub_sequence`` in ``sequence``, in ascending order."""
    hits = []
    cursor = sequence.find(sub_sequence)
    while cursor != -1:
        hits.append(cursor)
        # Resume one character further on so overlapping matches are found.
        cursor = sequence.find(sub_sequence, cursor + 1)
    return hits

# Sliding-window augmentation of one split file.
def process_file(input_file, output_file, min_per_segment=25, random_state=42):
    """Cut 300-character windows around each primer set and save them as CSV.

    For every row of ``input_file`` the positions of ``F3_sequence`` and
    ``B3_reverse_sequence`` inside ``ori_sequence`` delimit region "A"
    (start of F3 to end of B3). The first window starts at the start of F3,
    or covers the last 300 characters when that would run past the end of
    ``ori_sequence``. It is then moved left 5 characters at a time for as
    long as its start is not negative and it still reaches the end of B3.
    Each window becomes one output row carrying ``enhanced_sequence``,
    ``label_sequence`` and ``one_hot_encoded``; ``ori_sequence`` is dropped.

    A row is skipped, with a printed message, when F3 or B3 is absent or
    occurs more than once, when region A is longer than the window, or when
    ``ori_sequence`` is shorter than the window. Any exception raised while
    handling a row is printed and the row is skipped.

    Afterwards every ``primer_segment`` value with fewer than
    ``min_per_segment`` rows is topped up to that count by sampling its own
    rows with replacement.
    NOTE(review): the top-up is applied to whichever file is passed in,
    including validation and test splits - confirm that this is intended.

    Parameters
    ----------
    input_file : str
        CSV with at least ``ori_sequence``, ``F3_sequence``,
        ``B3_reverse_sequence`` and ``primer_segment`` columns.
    output_file : str
        Destination CSV path.
    min_per_segment : int, optional
        Minimum number of rows per ``primer_segment`` after the top-up
        (default 25).
    random_state : int or None, optional
        Seed for the top-up sampling so that repeated runs give the same
        file (default 42). ``None`` draws a fresh seed on every call.
    """
    # The window size must equal the label length produced by
    # generate_label_sequence, which is why it is not a parameter here.
    window_size = 300
    step = 5

    data = pd.read_csv(input_file)
    enhanced_data = []

    for index, row in data.iterrows():
        ori_sequence = row['ori_sequence']
        F3_sequence = row['F3_sequence']
        B3_reverse_sequence = row['B3_reverse_sequence']

        try:
            # Both outer primers must occur exactly once so that region A is
            # unambiguous.
            f3_positions = find_all_occurrences(ori_sequence, F3_sequence)
            b3_positions = find_all_occurrences(ori_sequence, B3_reverse_sequence)

            if len(f3_positions) > 1 or len(b3_positions) > 1:
                print(f"Skipping row {index} due to duplicate primer sequences.")
                continue

            if len(f3_positions) == 0 or len(b3_positions) == 0:
                print(f"Skipping row {index} due to missing primer sequences.")
                continue

            start_A = f3_positions[0]
            end_A = b3_positions[0] + len(B3_reverse_sequence)

            # A region longer than the window can never fit into one window;
            # report it instead of silently producing no rows.
            if end_A - start_A > window_size:
                print(f"Skipping row {index} due to sequence A length > {window_size}.")
                continue

            # First window: left edge on F3, pulled back when it would run
            # past the end of the sequence.
            start_window = start_A
            end_window = start_window + window_size
            if end_window > len(ori_sequence):
                start_window = max(0, len(ori_sequence) - window_size)
                end_window = start_window + window_size

            # Slide left while the window is inside the sequence and still
            # contains the end of B3.
            while start_window >= 0 and end_window >= end_A:
                window_sequence = ori_sequence[start_window:end_window]

                # Only happens when ori_sequence is shorter than the window.
                if len(window_sequence) != window_size:
                    print(f"Skipping window in row {index} due to incorrect window length.")
                    break

                new_row = row.copy()
                new_row['enhanced_sequence'] = window_sequence
                new_row['label_sequence'] = generate_label_sequence(new_row)
                new_row['one_hot_encoded'] = convert_to_one_hot(window_sequence)
                new_row = new_row.drop(labels=['ori_sequence'])
                enhanced_data.append(new_row)

                start_window -= step
                end_window = start_window + window_size

        except Exception as e:
            # Best effort: a malformed row (e.g. NaN instead of a sequence)
            # is reported and skipped rather than aborting the whole file.
            print(f"Error processing row {index}: {str(e)}")
            continue

    if enhanced_data:
        enhanced_df = pd.DataFrame(enhanced_data)
    else:
        # No window survived: keep the expected header so the column lookup
        # below and later readers of the file still work.
        kept_columns = [column for column in data.columns if column != 'ori_sequence']
        enhanced_df = pd.DataFrame(
            columns=kept_columns + ['enhanced_sequence', 'label_sequence', 'one_hot_encoded']
        )

    # Top up under-represented primer segments by resampling their own rows.
    primer_segment_counts = enhanced_df['primer_segment'].value_counts()
    seed_source = random.Random(random_state)
    top_ups = []
    for primer_segment, count in primer_segment_counts.items():
        if count < min_per_segment:
            rows_to_copy = enhanced_df[enhanced_df['primer_segment'] == primer_segment]
            top_ups.append(
                rows_to_copy.sample(
                    min_per_segment - count,
                    replace=True,
                    # A distinct but reproducible seed for every segment.
                    random_state=seed_source.randrange(2 ** 32),
                )
            )

    # Concatenate once instead of rebuilding the frame for every segment.
    if top_ups:
        enhanced_df = pd.concat([enhanced_df, *top_ups], ignore_index=True)

    enhanced_df.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")

# Run the augmentation on every input/output file pair.
for input_file, output_file in zip(input_files, output_files):
    process_file(input_file, output_file)


Skipping window in row 1 due to incorrect window length.
Skipping window in row 5 due to incorrect window length.
Skipping row 9 due to duplicate primer sequences.
Skipping window in row 12 due to incorrect window length.
Skipping window in row 14 due to incorrect window length.
Skipping window in row 23 due to incorrect window length.
Skipping window in row 35 due to incorrect window length.
Skipping window in row 36 due to incorrect window length.
Skipping window in row 51 due to incorrect window length.
Skipping window in row 52 due to incorrect window length.
Skipping window in row 93 due to incorrect window length.
Skipping window in row 114 due to incorrect window length.
Skipping row 117 due to duplicate primer sequences.
Skipping window in row 124 due to incorrect window length.
Skipping window in row 126 due to incorrect window length.
Skipping window in row 137 due to incorrect window length.
Skipping window in row 141 due to incorrect window length.
Skipping window in row 14

In [1]:
#首先进行数据增强500长度。
# 生成标签序列。
#one-hot编码。

import pandas as pd
import random

# Input CSV files (one per split) and, position by position, the output file
# each one is written to by this cell.
input_files = ["9_train_data.csv", "9_val_data.csv", "9_test_data.csv"]
output_files = ["train_data_500.csv", "val_data_500.csv", "test_data_500.csv"]

# Label digit written into the label sequence for each primer column.
primer_to_label = {
    "F3_sequence": 4,
    "F2_sequence": 3,
    "LF_reverse_sequence": 2,
    "F1_reverse_sequence": 1,
    "B1_sequence": 5,
    "LB_sequence": 6,
    "B2_reverse_sequence": 7,
    "B3_reverse_sequence": 8
}

# Four-character one-hot code for each base.
base_to_one_hot = {
    'A': '1000',
    'T': '0100',
    'C': '0010',
    'G': '0001'
}

# Build the per-position label string for one augmented row.
def generate_label_sequence(row, length=500):
    """Label the positions of ``row['enhanced_sequence']`` covered by primers.

    Every primer column listed in ``primer_to_label`` is looked up in the
    row. When its value is a non-empty string that occurs in
    ``enhanced_sequence``, the positions of its first occurrence receive
    that primer's label. Primers are processed in the order of
    ``primer_to_label``, so a later primer overwrites an earlier one where
    they overlap. Positions not covered by any primer stay 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'enhanced_sequence'`` and support ``.get`` for the
        primer columns.
    length : int, optional
        Number of label positions to emit (default 500). Positions of
        ``enhanced_sequence`` at or beyond ``length`` are ignored instead of
        raising ``IndexError``.

    Returns
    -------
    str
        ``length`` label digits concatenated into one string.
    """
    enhanced_sequence = row['enhanced_sequence']
    label_sequence = [0] * length

    for primer, label in primer_to_label.items():
        primer_sequence = row.get(primer, "")
        # Missing cells are float NaN after read_csv, hence the str check.
        if not (primer_sequence and isinstance(primer_sequence, str)):
            continue

        # One search yields both "is it present" and "where".
        start = enhanced_sequence.find(primer_sequence)
        if start == -1:
            continue

        # Clip to the label length so an over-long sequence cannot index past
        # the end of label_sequence.
        end = min(start + len(primer_sequence), length)
        label_sequence[start:end] = [label] * max(end - start, 0)

    return ''.join(map(str, label_sequence))

# Encode a sequence string as concatenated four-character one-hot codes.
def convert_to_one_hot(sequence):
    """Return the one-hot encoding of ``sequence`` as a single string.

    Each character is replaced by its entry in ``base_to_one_hot``; any
    character without an entry becomes ``'0000'``. A non-string input
    yields the empty string.
    """
    if isinstance(sequence, str):
        return ''.join(base_to_one_hot.get(base, '0000') for base in sequence)
    return ''

def find_all_occurrences(sequence, sub_sequence):
    """Return the start index of every (possibly overlapping) occurrence of
    ``sub_sequence`` in ``sequence``, in ascending order."""
    hits = []
    cursor = sequence.find(sub_sequence)
    while cursor != -1:
        hits.append(cursor)
        # Resume one character further on so overlapping matches are found.
        cursor = sequence.find(sub_sequence, cursor + 1)
    return hits

# Sliding-window augmentation of one split file.
def process_file(input_file, output_file, min_per_segment=25, random_state=42):
    """Cut 500-character windows around each primer set and save them as CSV.

    For every row of ``input_file`` the positions of ``F3_sequence`` and
    ``B3_reverse_sequence`` inside ``ori_sequence`` delimit region "A"
    (start of F3 to end of B3). The first window starts at the start of F3,
    or covers the last 500 characters when that would run past the end of
    ``ori_sequence``. It is then moved left 10 characters at a time for as
    long as its start is not negative and it still reaches the end of B3.
    Each window becomes one output row carrying ``enhanced_sequence``,
    ``label_sequence`` and ``one_hot_encoded``; ``ori_sequence`` is dropped.

    A row is skipped, with a printed message, when F3 or B3 is absent or
    occurs more than once, when region A is longer than the window, or when
    ``ori_sequence`` is shorter than the window. Any exception raised while
    handling a row is printed and the row is skipped.

    Afterwards every ``primer_segment`` value with fewer than
    ``min_per_segment`` rows is topped up to that count by sampling its own
    rows with replacement.
    NOTE(review): the top-up is applied to whichever file is passed in,
    including validation and test splits - confirm that this is intended.

    Parameters
    ----------
    input_file : str
        CSV with at least ``ori_sequence``, ``F3_sequence``,
        ``B3_reverse_sequence`` and ``primer_segment`` columns.
    output_file : str
        Destination CSV path.
    min_per_segment : int, optional
        Minimum number of rows per ``primer_segment`` after the top-up
        (default 25).
    random_state : int or None, optional
        Seed for the top-up sampling so that repeated runs give the same
        file (default 42). ``None`` draws a fresh seed on every call.
    """
    # The window size must equal the label length produced by
    # generate_label_sequence, which is why it is not a parameter here.
    window_size = 500
    step = 10

    data = pd.read_csv(input_file)
    enhanced_data = []

    for index, row in data.iterrows():
        ori_sequence = row['ori_sequence']
        F3_sequence = row['F3_sequence']
        B3_reverse_sequence = row['B3_reverse_sequence']

        try:
            # Both outer primers must occur exactly once so that region A is
            # unambiguous.
            f3_positions = find_all_occurrences(ori_sequence, F3_sequence)
            b3_positions = find_all_occurrences(ori_sequence, B3_reverse_sequence)

            if len(f3_positions) > 1 or len(b3_positions) > 1:
                print(f"Skipping row {index} due to duplicate primer sequences.")
                continue

            if len(f3_positions) == 0 or len(b3_positions) == 0:
                print(f"Skipping row {index} due to missing primer sequences.")
                continue

            start_A = f3_positions[0]
            end_A = b3_positions[0] + len(B3_reverse_sequence)

            # A region longer than the window can never fit into one window.
            if end_A - start_A > window_size:
                print(f"Skipping row {index} due to sequence A length > {window_size}.")
                continue

            # First window: left edge on F3, pulled back when it would run
            # past the end of the sequence.
            start_window = start_A
            end_window = start_window + window_size
            if end_window > len(ori_sequence):
                start_window = max(0, len(ori_sequence) - window_size)
                end_window = start_window + window_size

            # Slide left while the window is inside the sequence and still
            # contains the end of B3.
            while start_window >= 0 and end_window >= end_A:
                window_sequence = ori_sequence[start_window:end_window]

                # Only happens when ori_sequence is shorter than the window.
                if len(window_sequence) != window_size:
                    print(f"Skipping window in row {index} due to incorrect window length.")
                    break

                new_row = row.copy()
                new_row['enhanced_sequence'] = window_sequence
                new_row['label_sequence'] = generate_label_sequence(new_row)
                new_row['one_hot_encoded'] = convert_to_one_hot(window_sequence)
                new_row = new_row.drop(labels=['ori_sequence'])
                enhanced_data.append(new_row)

                start_window -= step
                end_window = start_window + window_size

        except Exception as e:
            # Best effort: a malformed row (e.g. NaN instead of a sequence)
            # is reported and skipped rather than aborting the whole file.
            print(f"Error processing row {index}: {str(e)}")
            continue

    if enhanced_data:
        enhanced_df = pd.DataFrame(enhanced_data)
    else:
        # No window survived: keep the expected header so the column lookup
        # below and later readers of the file still work.
        kept_columns = [column for column in data.columns if column != 'ori_sequence']
        enhanced_df = pd.DataFrame(
            columns=kept_columns + ['enhanced_sequence', 'label_sequence', 'one_hot_encoded']
        )

    # Top up under-represented primer segments by resampling their own rows.
    primer_segment_counts = enhanced_df['primer_segment'].value_counts()
    seed_source = random.Random(random_state)
    top_ups = []
    for primer_segment, count in primer_segment_counts.items():
        if count < min_per_segment:
            rows_to_copy = enhanced_df[enhanced_df['primer_segment'] == primer_segment]
            top_ups.append(
                rows_to_copy.sample(
                    min_per_segment - count,
                    replace=True,
                    # A distinct but reproducible seed for every segment.
                    random_state=seed_source.randrange(2 ** 32),
                )
            )

    # Concatenate once instead of rebuilding the frame for every segment.
    if top_ups:
        enhanced_df = pd.concat([enhanced_df, *top_ups], ignore_index=True)

    enhanced_df.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")

# Run the augmentation on every input/output file pair.
for input_file, output_file in zip(input_files, output_files):
    process_file(input_file, output_file)


Skipping window in row 1 due to incorrect window length.
Skipping window in row 5 due to incorrect window length.
Skipping row 9 due to duplicate primer sequences.
Skipping window in row 11 due to incorrect window length.
Skipping window in row 12 due to incorrect window length.
Skipping window in row 14 due to incorrect window length.
Skipping window in row 18 due to incorrect window length.
Skipping window in row 19 due to incorrect window length.
Skipping window in row 23 due to incorrect window length.
Skipping window in row 34 due to incorrect window length.
Skipping window in row 35 due to incorrect window length.
Skipping window in row 36 due to incorrect window length.
Skipping window in row 51 due to incorrect window length.
Skipping window in row 52 due to incorrect window length.
Skipping window in row 54 due to incorrect window length.
Skipping window in row 72 due to incorrect window length.
Skipping window in row 75 due to incorrect window length.
Skipping window in row 9

In [2]:
# Drop rows with an incomplete label sequence: every label_sequence must contain each of the digits 4, 3, 1, 5, 7 and 8.
import pandas as pd

# Filter one augmented CSV in place, keeping only fully labelled rows.
def process_file(file_path):
    """Rewrite ``file_path`` keeping only rows whose label is complete.

    A row is kept when its ``label_sequence`` contains every digit in
    ``{'4', '3', '1', '5', '7', '8'}``. Rows whose label is missing are
    dropped. The file is overwritten with the surviving rows.

    NOTE: this definition shares its name with the two-argument
    ``process_file`` of the augmentation cells; running this cell rebinds
    the name.

    Parameters
    ----------
    file_path : str
        Path of a CSV file that has a ``label_sequence`` column.
    """
    # Read the label column as text. Left to type inference, an all-digit
    # value that fits in an integer is parsed as a number, which drops leading
    # zeros and makes the ``in`` test below raise TypeError.
    df = pd.read_csv(file_path, dtype={'label_sequence': str})

    required_digits = {'4', '3', '1', '5', '7', '8'}

    def contains_all_required_digits(label_sequence):
        # An empty cell is read as float NaN and cannot satisfy the check.
        if not isinstance(label_sequence, str):
            return False
        return required_digits.issubset(label_sequence)

    # astype(bool) keeps the mask boolean even when the file has no rows.
    keep_mask = df['label_sequence'].apply(contains_all_required_digits).astype(bool)
    filtered_df = df[keep_mask]

    # Overwrite the original file with the filtered rows.
    filtered_df.to_csv(file_path, index=False)

# Augmented files to filter in place.
file_paths = [
    'test_data_500.csv',
    'train_data_300.csv',
    'train_data_500.csv',
    'val_data_300.csv',
    'val_data_500.csv',
    'test_data_300.csv'
]

# Apply the label filter to each file.
for file_path in file_paths:
    process_file(file_path)

print("所有文件已更新。")


所有文件已更新。
