# Chromothripsis 状态统计


In [1]:
import pandas as pd

# Load the results table (tab-separated).
df = pd.read_csv('SV_graph.simulated/all_samples_results.tsv', sep='\t')

# Keep only rows that have a plot: plot_path is neither missing nor the literal string 'NA'.
valid_plots = df[df['plot_path'].notna() & (df['plot_path'] != 'NA')]
total_valid = len(valid_plots)

# Tally every chromothripsis status in a single pass instead of one boolean scan per status.
status_counts = valid_plots['chromothripsis_status'].value_counts()
high_confidence = int(status_counts.get('High Confidence', 0))
low_confidence = int(status_counts.get('Low Confidence', 0))
not_significant = int(status_counts.get('Not Significant', 0))


def _percent(count, total):
    """Return `count` as a percentage of `total`; 0.0 when `total` is zero."""
    # Guard against ZeroDivisionError when no row has a plot.
    return count / total * 100 if total else 0.0


# Report the counts and their share of the rows that have a plot.
print(f"plot_path不为NA的行数: {total_valid}")
print("\n在这些行中:")
print(f"High Confidence: {high_confidence} ({_percent(high_confidence, total_valid):.2f}%)")
print(f"Low Confidence: {low_confidence} ({_percent(low_confidence, total_valid):.2f}%)")
print(f"Not Significant: {not_significant} ({_percent(not_significant, total_valid):.2f}%)")

# Anything left over has a status outside the three above (including a missing status).
other = total_valid - (high_confidence + low_confidence + not_significant)
if other > 0:
    print(f"其他状态: {other} ({_percent(other, total_valid):.2f}%)")

plot_path不为NA的行数: 22287

在这些行中:
High Confidence: 637 (2.86%)
Low Confidence: 326 (1.46%)
Not Significant: 21324 (95.68%)


# High Confidence Std 失败的统计

In [3]:
def advanced_filter(df, criteria_dict, match_type='exact', regex=True):
    """
    Filter a DataFrame by several column conditions, all of which must hold.

    Parameters:
    df: pandas DataFrame - the frame to filter (it is not modified)
    criteria_dict: dict - maps a column name to the value that column must match
    match_type: str - 'exact' (equality) or 'contains' (the cell's string form
        contains the value's string form)
    regex: bool - only used for 'contains'; when True (the default, matching the
        previous behaviour) the value is interpreted as a regular expression,
        when False it is treated as a literal substring

    Returns:
    pandas DataFrame - a new frame holding the rows that satisfy every condition

    Raises:
    ValueError - if match_type is not 'exact' or 'contains', or if a column
        named in criteria_dict does not exist in df

    Side effects:
    Prints the number of matching rows.
    """
    # Validate the mode once, up front, so a bad value is reported even when
    # criteria_dict is empty.
    if match_type not in ('exact', 'contains'):
        raise ValueError("match_type必须是'exact'或'contains'")

    result = df.copy()

    for column, value in criteria_dict.items():
        # Reject unknown columns explicitly instead of surfacing a KeyError.
        if column not in df.columns:
            raise ValueError(f"列名 '{column}' 在数据框中不存在")

        col = result[column]
        if match_type == 'exact':
            mask = col == value
        else:
            # astype(str) can turn a missing value into the text 'nan', which would
            # then match searches such as 'an'; exclude missing cells explicitly.
            mask = col.notna() & col.astype(str).str.contains(
                str(value), na=False, regex=regex
            )
        result = result[mask]

    # Report how many rows matched.
    print(f"根据指定条件共找到 {len(result)} 行匹配")

    return result


In [10]:
# Usage example: keep the rows whose HC_standard equals 'intra_chr_num_6'
# and show the plot paths of those rows.
import pandas as pd

df = pd.read_csv('SV_graph.simulated/all_samples_results.tsv', sep='\t')
multiple_criteria = advanced_filter(
    df,
    dict(HC_standard='intra_chr_num_6'),
    match_type='exact',
)
print(multiple_criteria.loc[:, 'plot_path'])

根据指定条件共找到 489 行匹配
23        TCGA-OR-A5J7_chr1_SV_plot.png
24        TCGA-OR-A5J7_chr2_SV_plot.png
31        TCGA-OR-A5J7_chr9_SV_plot.png
34       TCGA-OR-A5J7_chr12_SV_plot.png
79       TCGA-OR-A5LO_chr11_SV_plot.png
                      ...              
32500     TCGA-N8-A56S_chr2_SV_plot.png
32536    TCGA-N6-A4VE_chr15_SV_plot.png
32591     TCGA-VD-A8KH_chr1_SV_plot.png
33258     TCGA-V4-A9F0_chr1_SV_plot.png
33273    TCGA-V4-A9F0_chr16_SV_plot.png
Name: plot_path, Length: 489, dtype: object
