In [12]:
import os
import pandas as pd
import sys
# Switch the working directory to the project folder so that relative paths used
# later in the notebook (e.g. the "./for_Jing/..." CSV output) resolve against it.
# NOTE(review): this is an absolute, machine-specific path — the notebook will fail
# here on any host where it does not exist.
os.chdir("/home2/s439906/project/CASP16/monomer")

In [None]:
# Directory that is scanned for "*.nr_interface.results" files.
results_dir = "/data/data1/conglab/jzhan6/CASP16/targetPDBs/Targets_oligo_interfaces_20240917/nr_interfaces/"
# Selects which stage-specific prefix set below is added to the exclusion list.
stage = "1"

# File-name prefixes that are excluded regardless of the selected stage.
bad_targets = [
    "T1246",
    "T1269v1o_",
    "T1295o.",
    "H1265_",
    "T2270o",
]

# Additional prefixes to exclude for each supported stage value.
# NOTE(review): presumably the digit after "T"/"H" identifies the stage a target
# belongs to, so each stage drops the files of the other two — confirm.
stage_excluded_prefixes = {
    "0": ["T1", "T2", "H1", "H2"],
    "1": ["T0", "T2", "H0", "H2"],
    "2": ["T0", "T1", "H0", "H1"],
}
if stage not in stage_excluded_prefixes:
    # Raise instead of print + sys.exit so the failing cell reports why it stopped.
    raise ValueError(
        f"Invalid stage: {stage!r} (expected one of {sorted(stage_excluded_prefixes)})"
    )
bad_targets.extend(stage_excluded_prefixes[stage])

# os.listdir returns entries in arbitrary order; sort so that everything derived
# from this list comes out in the same order on every run.
candidate_files = sorted(
    name for name in os.listdir(results_dir) if name.endswith(".nr_interface.results")
)

# str.startswith accepts a tuple of prefixes, so each file is classified in one pass.
excluded_prefixes = tuple(bad_targets)
to_remove = [name for name in candidate_files if name.startswith(excluded_prefixes)]
result_files = [name for name in candidate_files if not name.startswith(excluded_prefixes)]
result_files

In [None]:

# Substring used to select rows via the 'model' column. It also labels the single
# row of the per-interface tables and appears in the output file name.
# NOTE(review): presumably a CASP predictor/group code — confirm.
model_tag = 'TS145'
output_csv = f"./for_Jing/{model_tag}_dockq.csv"


def _as_single_row(column_stats, row_label):
    """Turn a Series of per-column statistics into a one-row DataFrame labelled `row_label`."""
    row = pd.DataFrame(column_stats).transpose()
    row.index = [row_label]
    return row


def _concat_or_empty(frames, axis=0):
    """Concatenate `frames` along `axis`; return an empty DataFrame when there is nothing to join."""
    return pd.concat(frames, axis=axis) if frames else pd.DataFrame()


# Per-file pieces are collected in lists and assembled once after the loop, instead
# of re-concatenating a growing DataFrame on every iteration.
min_rows = []
max_rows = []
mean_rows = []
final_max_rows = []

# Read each tab-separated results file in turn.
for result_file in result_files:
    target = result_file.split(".")[0]  # text before the first '.' is the target name
    data = pd.read_csv(os.path.join(results_dir, result_file), sep="\t")

    # Both columns are required below; skip the file with a message rather than
    # stopping the whole run with a KeyError.
    missing_columns = [column for column in ('model', 'dockq_mean') if column not in data.columns]
    if missing_columns:
        print(f"Skipping {result_file}: missing column(s) {missing_columns}.")
        continue

    # Keep only the rows whose 'model' value contains the tag (missing values never match).
    filtered_data = data[data['model'].str.contains(model_tag, na=False, regex=False)]
    if filtered_data.empty:
        print(f"No models containing '{model_tag}' found in {result_file}.")
        continue

    # Index by model (without mutating the filtered slice in place) and work on the
    # string form of dockq_mean so the string checks below apply to every value.
    dockq_mean_filtered = filtered_data.set_index('model')['dockq_mean'].astype(str)

    # Only warn about 'None' entries; they become NaN in the numeric conversion below.
    if dockq_mean_filtered.str.contains('None', regex=False).any():
        print(f"Warning: 'dockq_mean' column in {result_file} contains 'None' as a string value.")

    # Drop every row whose dockq_mean text contains a '-' character.
    # NOTE(review): this also drops negative numbers and exponent notation such as
    # '1e-05' — confirm that '-' only ever appears as a placeholder in these files.
    has_dash = dockq_mean_filtered.str.contains('-', regex=False)
    if has_dash.any():
        print(f"Warning: 'dockq_mean' column in {result_file} contains '-' as a character. Removing these rows.")
        dockq_mean_filtered = dockq_mean_filtered[~has_dash]

    # Split the ';'-separated values so that each position becomes its own column.
    dockq_mean_split = dockq_mean_filtered.str.split(';', expand=True)
    dockq_mean_split.columns = [
        f'{target}_dockq_mean_interface_{i+1}' for i in range(dockq_mean_split.shape[1])
    ]

    # Convert to numbers for the statistics; anything unparsable becomes NaN.
    interface_scores = dockq_mean_split.apply(pd.to_numeric, errors='coerce')

    # Per-column min / max / mean, each as a single row labelled with the model tag.
    min_row = _as_single_row(interface_scores.min(), model_tag)
    max_row = _as_single_row(interface_scores.max(), model_tag)
    mean_row = _as_single_row(interface_scores.mean(), model_tag)
    min_rows.append(min_row)
    max_rows.append(max_row)
    mean_rows.append(mean_row)

    # For this target, take the largest value across its columns for each statistic.
    final_max_row = pd.DataFrame({
        'min_max': min_row.max(axis=1),
        'max_max': max_row.max(axis=1),
        'mean_max': mean_row.max(axis=1)
    })
    final_max_row.index = [target]
    final_max_rows.append(final_max_row)

# Per-interface tables: one row (the model tag), columns from every processed file.
min_df = _concat_or_empty(min_rows, axis=1)
max_df = _concat_or_empty(max_rows, axis=1)
mean_df = _concat_or_empty(mean_rows, axis=1)
# Per-target table: one row per processed file.
final_max_dockq_df = _concat_or_empty(final_max_rows)

# Print each summary table.
print("Minimum values for each interface:")
print(min_df)
print("\nMaximum values for each interface:")
print(max_df)
print("\nMean values for each interface:")
print(mean_df)
print("\nFinal max dockq values for each target:")
print(final_max_dockq_df)

# Save the per-target table, creating the output directory first if needed.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
final_max_dockq_df.to_csv(output_csv)