## Extract PTM from raw data

In [17]:
import pandas as pd

PTM = "Phosphoserine_0.7"
targetAA = 'S'

file_path = r'C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\raw_data_processed.csv'
data = pd.read_csv(file_path)

filter_condition = data['Modified residue'].str.contains(PTM, na=False)


filtered_data = data[filter_condition]

output_path = rf'C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_extracted.csv'
filtered_data.to_csv(output_path, index=False)

## Extract PTM positions

In [10]:
import pandas as pd
import re


#file_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_extracted.csv"
file_path = r"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\non-histone acetylation.csv"
df = pd.read_csv(file_path)


#pattern_PTM = rf"MOD_RES (\d+); /note=\"{PTM}.*?\""
pattern_PTM = r"MOD_RES (\d+); /note=\"N6-acetyllysine\""


def extract_positions(cell):
    PTM_numbers = re.findall(pattern_PTM, cell)
    PTM_numbers.sort(key=int)
    return ', '.join(PTM_numbers) if PTM_numbers else pd.NA



df['Position'] = df['Modified residue'].apply(extract_positions)


df.dropna(subset=['Position'], inplace=True)

#new_file_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_with_positions.csv"
new_file_path = r"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\non-histone acetylation_with_positions.csv"
df.to_csv(new_file_path, index=False)





TypeError: expected string or bytes-like object

## Check if all positions are extracted from the sequences

In [244]:
import pandas as pd

file1 = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_extracted.csv"
file2 = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_with_positions.csv"

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)


entries_file1 = set(df1['Entry'])
entries_file2 = set(df2['Entry'])

unique_entries = entries_file1 - entries_file2

print("第一个文件中有而第二个文件中没有的Entry：")
for entry in unique_entries:
    print(entry)


第一个文件中有而第二个文件中没有的Entry：


## Add labels, masks, possition numbers, and tragart amino acid numbers

In [245]:
import pandas as pd
import numpy as np

# 加载数据
file_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_with_positions.csv"
df = pd.read_csv(file_path)

def create_label_and_mask(df):
    labels = []
    masks = []
    pos_nums = []  
    targetAA_nums = []
    
    for index, row in df.iterrows():
        length = row['Length']
        sequence = row['Sequence']
        
        # 创建标签数组，初始全为0
        label = np.zeros(length, dtype=int)
        
        # 如果Position列不为空，则将对应位置设为1
        if pd.notnull(row['Position']):
            positions = [int(pos) - 1 for pos in str(row['Position']).split(', ')]  # 从1开始的位置转换为0开始
            label[positions] = 1
        
        labels.append(label)
        pos_nums.append(np.sum(label))
        
        # 创建mask数组，Y氨基酸位置为1，其他为0
        mask = np.array([1 if amino_acid == targetAA else 0 for amino_acid in sequence], dtype=int)
        masks.append(mask)
        targetAA_nums.append(np.sum(mask))
    
    # 将标签和mask转换为逗号分隔的字符串形式，并添加到DataFrame中
    df['Label'] = [','.join(map(str, label)) for label in labels]
    df['Mask'] = [','.join(map(str, mask)) for mask in masks]
    df['Pos_num'] = pos_nums
    df['TargetAA_num'] = targetAA_nums
    
    return df

# 调用函数处理DataFrame
df = create_label_and_mask(df)

# 保存修改后的DataFrame到CSV文件
output_file_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_with_labels.csv"  
df.to_csv(output_file_path, index=False)



## Check if labels and mask consistent with positions and targetAA

In [246]:
import pandas as pd

# 加载CSV文件
file_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_with_labels.csv"  
df = pd.read_csv(file_path)



def verify_positions_and_count(sequence, positions, target_count):
    # 分解位置为整数列表，考虑到可能有多个位置
    pos_list = [int(pos) for pos in str(positions).split(',') if pos.strip().isdigit()]
    
    # 验证指定位置上的氨基酸是否为'Y'
    positions_correct = all(sequence[pos - 1].upper() == targetAA for pos in pos_list)
    
    # 计算序列中'Y'的数量
    targetAA_count = sequence.upper().count(targetAA)
    
    # 验证数量是否与目标数量匹配
    count_correct = (targetAA_count == target_count)
    
    return positions_correct, count_correct, targetAA_count

# 应用验证函数到每行，并将结果添加到DataFrame
df[['Positions_Correct', 'Count_Correct', 'Actual_targetAA_Count']] = df.apply(
    lambda row: verify_positions_and_count(row['Sequence'], row['Position'], row['TargetAA_num']), 
    axis=1, result_type='expand'
)

# 筛选出不符合条件的行
incorrect_rows = df[~(df['Positions_Correct'] & df['Count_Correct'])]

# 打印不符合条件的行
if not incorrect_rows.empty:
    print("不符合条件的行：")
    print(incorrect_rows)
else:
    print("所有行都符合条件。")


所有行都符合条件。


## csv to fasta for CD-HIT preparation

In [247]:
import pandas as pd

def csv_to_fasta_with_mapping(df, fasta_file_path, mapping_file_path):
    with open(fasta_file_path, 'w') as fasta_file, open(mapping_file_path, 'w') as mapping_file:
        # 写入映射文件的表头
        mapping_file.write("Entry,Modified residue,Position,Label,Mask,Pos_num,TargetAA_num\n")
        for _, row in df.iterrows():
            # 构建FASTA描述行，仅包含唯一标识符
            description = f">{row['Entry']}"
            sequence = row['Sequence']
            # 写入FASTA文件
            fasta_file.write(f"{description}\n{sequence}\n\n")
            # 写入映射文件
            mapping_info = f"{row['Entry']},{row['Modified residue']},{row['Position']},{row['Label']},{row['Mask']},{row['Pos_num']},{row['TargetAA_num']}\n"
            mapping_file.write(mapping_info)

# 指定输出文件路径
fasta_file_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_with_labels.fasta"  
mapping_file_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_mapping.csv"  

# 加载CSV数据
df = pd.read_csv(rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_with_labels.csv"  )

# 调用函数，生成FASTA和映射文件
csv_to_fasta_with_mapping(df, fasta_file_path, mapping_file_path)


## fasta to csv 

In [18]:
from Bio import SeqIO
import pandas as pd

# 动态构建原始CSV文件路径
original_csv_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_with_labels.csv"
df = pd.read_csv(original_csv_path)

# 动态构建去冗余后的FASTA文件路径
clustered_fasta_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_with_labels_clustered.fasta"
clustered_ids = [record.id for record in SeqIO.parse(clustered_fasta_path, 'fasta')]

# 筛选出原始CSV中存在于去冗余FASTA文件中的记录
filtered_df = df[df['Entry'].isin(clustered_ids)]

# 计算Pos_num和TargetAA_num列的总和
pos_num_sum = filtered_df['Pos_num'].sum()
target_aa_num_sum = filtered_df['TargetAA_num'].sum()

# 获取总行数（不包括表头）
total_rows = filtered_df.shape[0]

# 打印计算结果
print(f"Total sum of Pos_num: {pos_num_sum}")
print(f"Total sum of TargetAA_num: {target_aa_num_sum}")
print(f"Total number of rows (excluding header): {total_rows}")

# 动态构建保存筛选后记录的CSV文件路径，并保存
filtered_csv_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_clustered.csv"
filtered_df.to_csv(filtered_csv_path, index=False)


Total sum of Pos_num: 14347
Total sum of TargetAA_num: 168907
Total number of rows (excluding header): 3970


## Train test split

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# 加载数据集
data_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_clustered.csv"
data = pd.read_csv(data_path)

# 将Number of Positive Samples大于5的情况合并为一个组，并确保所有分组标签为字符串类型
data['Pos_num_grouped'] = data['Pos_num'].apply(lambda x: '>5' if x > 5 else str(x))

# 初始化StratifiedShuffleSplit，用于保证每个分组的测试集比例为0.2
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# 使用StratifiedShuffleSplit进行初步划分
for train_index, test_index in sss.split(np.zeros(data.shape[0]), data['Pos_num_grouped']):
    data['Set'] = 'train'  # 先将所有样本标记为训练集
    data.loc[test_index, 'Set'] = 'test'  # 然后将分配到测试集的样本标记为测试集

# 检查训练集和测试集的实际比例
test_ratio = data['Set'].value_counts(normalize=True)['test']

# 如果实际测试集比例偏离0.2，进行调整
desired_test_ratio = 0.2
if test_ratio > desired_test_ratio:
    # 测试集过多，需要从测试集移动到训练集
    excess_test_samples = int((test_ratio - desired_test_ratio) * data.shape[0])
    test_indices = data[data['Set'] == 'test'].index
    move_to_train = np.random.choice(test_indices, size=excess_test_samples, replace=False)
    data.loc[move_to_train, 'Set'] = 'train'
elif test_ratio < desired_test_ratio:
    # 测试集过少，需要从训练集移动到测试集
    excess_train_samples = int((desired_test_ratio - test_ratio) * data.shape[0])
    train_indices = data[data['Set'] == 'train'].index
    move_to_test = np.random.choice(train_indices, size=excess_train_samples, replace=False)
    data.loc[move_to_test, 'Set'] = 'test'

# 删除临时列
data.drop('Pos_num_grouped', axis=1, inplace=True)

data = data.sort_values(by='Set', ascending=False)

# 保存修改后的数据集
output_path = rf"C:\Users\mengl\OneDrive - City University of Hong Kong - Student\Desktop\PTM2_data\{PTM}_clustered_splited.csv"
data.to_csv(output_path, index=False)

output_path

# 分别获取训练集和测试集的子集
train_data = data[data['Set'] == 'train']
test_data = data[data['Set'] == 'test']

# 对训练集进行统计
train_pos_num_sum = train_data['Pos_num'].sum()
train_target_aa_num_sum = train_data['TargetAA_num'].sum()
train_total_rows = train_data.shape[0]

# 打印训练集的统计结果
print("Training Set Statistics:")
print(f"Total sum of Pos_num: {train_pos_num_sum}")
print(f"Total sum of TargetAA_num: {train_target_aa_num_sum}")
print(f"Total number of rows: {train_total_rows}")

# 对测试集进行统计
test_pos_num_sum = test_data['Pos_num'].sum()
test_target_aa_num_sum = test_data['TargetAA_num'].sum()
test_total_rows = test_data.shape[0]

# 打印测试集的统计结果
print("\nTest Set Statistics:")
print(f"Total sum of Pos_num: {test_pos_num_sum}")
print(f"Total sum of TargetAA_num: {test_target_aa_num_sum}")
print(f"Total number of rows: {test_total_rows}")




Training Set Statistics:
Total sum of Pos_num: 11436
Total sum of TargetAA_num: 135563
Total number of rows: 3176

Test Set Statistics:
Total sum of Pos_num: 2911
Total sum of TargetAA_num: 33344
Total number of rows: 794
