In [8]:

"""
Analyse pathway intersections in All_Pathway_table.csv
Outputs:
  pathway_intersections.csv
  (console) pathway counts + intersection sizes
"""
import pandas as pd
from itertools import combinations

# ---------------- 用户可修改 ----------------
CSV_FILE   = r"D:\111\CellChat\All_Pathway_table.csv"
DATASET_COL = "dataset"
PATH_COL    = "pathway_name"

MG_TAGS = ["M_MG", "R_MG", "S_MG"]      # 确认与 CSV 完全一致
AG_TAGS = ["R_AG", "R_CG", "S_AG"]

INTER_SYM = "&"   # 列名连接符；纯 ASCII 避免乱码
# -------------------------------------------

def safe_intersection(list_of_sets):
    return set.intersection(*list_of_sets) if list_of_sets else set()

# 1) 读取
df = pd.read_csv(CSV_FILE)
need = {DATASET_COL, PATH_COL}
if not need.issubset(df.columns):
    raise ValueError(f"CSV 缺列：{need - set(df.columns)}")

# 2) 每个 dataset -> pathway 集合
path_sets = {d: set(sub[PATH_COL]) for d, sub in df.groupby(DATASET_COL)}

print("\n--- 单个数据集包含的通路数量 ---")
for d, s in path_sets.items():
    print(f"{d}: {len(s)} pathways")

# 3) 任意两数据集交集
result = {}
for d1, d2 in combinations(path_sets.keys(), 2):
    inter = path_sets[d1] & path_sets[d2]
    key = f"{d1}{INTER_SYM}{d2}"
    result[key] = sorted(inter)
    print(f"{key}: {len(inter)} pathways")

# 4) 乳腺 / 顶泌腺 / 六腺体 100% 交集
mg_inter  = safe_intersection([path_sets[d] for d in MG_TAGS if d in path_sets])
ag_inter  = safe_intersection([path_sets[d] for d in AG_TAGS if d in path_sets])
six_tags  = MG_TAGS + AG_TAGS
six_inter = safe_intersection([path_sets[d] for d in six_tags if d in path_sets])

result["MG_core"]  = sorted(mg_inter)
result["AG_core"]  = sorted(ag_inter)
result["Six_core"] = sorted(six_inter)

print(f"MG_core: {len(mg_inter)} pathways")
print(f"AG_core: {len(ag_inter)} pathways")
print(f"Six_core: {len(six_inter)} pathways")

# ---------- （可选）更宽松的交集：出现 ≥2 次 ----------
# count 所有 MG 出现次数
from collections import Counter
mg_counter = Counter(p for d in MG_TAGS for p in path_sets.get(d, []))
ag_counter = Counter(p for d in AG_TAGS for p in path_sets.get(d, []))

mg_core2 = [p for p, c in mg_counter.items() if c >= 2]
ag_core2 = [p for p, c in ag_counter.items() if c >= 2]
result["MG_core2of3"] = sorted(mg_core2)
result["AG_core2of3"] = sorted(ag_core2)
print(f"MG_core2of3 (≥2/3 MG): {len(mg_core2)} pathways")
print(f"AG_core2of3 (≥2/3 AG): {len(ag_core2)} pathways")
# ------------------------------------------------------

# 5) 导出 CSV — 列对齐
max_len = max(len(v) for v in result.values())
aligned = {k: v + [""]*(max_len - len(v)) for k, v in result.items()}
pd.DataFrame(aligned).to_csv("pathway_intersections.csv", index=False, encoding="utf-8")

print("\n✔ 已写入 pathway_intersections.csv   (UTF‑8 编码)")



--- 单个数据集包含的通路数量 ---
M-MG: 61 pathways
R-AG: 37 pathways
R-CG: 29 pathways
R-MG: 45 pathways
S-AG: 37 pathways
S-MG: 30 pathways
M-MG&R-AG: 31 pathways
M-MG&R-CG: 26 pathways
M-MG&R-MG: 35 pathways
M-MG&S-AG: 29 pathways
M-MG&S-MG: 23 pathways
R-AG&R-CG: 26 pathways
R-AG&R-MG: 31 pathways
R-AG&S-AG: 20 pathways
R-AG&S-MG: 16 pathways
R-CG&R-MG: 27 pathways
R-CG&S-AG: 16 pathways
R-CG&S-MG: 14 pathways
R-MG&S-AG: 21 pathways
R-MG&S-MG: 18 pathways
S-AG&S-MG: 23 pathways
MG_core: 0 pathways
AG_core: 0 pathways
Six_core: 0 pathways
MG_core2of3 (≥2/3 MG): 0 pathways
AG_core2of3 (≥2/3 AG): 0 pathways

✔ 已写入 pathway_intersections.csv   (UTF‑8 编码)


In [10]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Compare pathway sets between MG and AG datasets.

Outputs printed:
  1) Pathways unique to (M_MG ∩ S_MG) over (R_AG ∩ S_AG)
  2) Pathways unique to (M_MG ∩ S_MG) over (R_AG ∪ S_AG)
  3) Pathways unique to (R_AG ∩ S_AG) over (M_MG ∩ S_MG)
"""

import pandas as pd

# ---------- 文件名 ----------
INT_CSV = r"D:\111\CellChat\All_Pathway_table.csv"  # 交集文件
RAW_CSV = "All_Pathway_table.csv"       # 原始路径文件
# -----------------------------

# 1) 读交集文件
int_df = pd.read_csv(INT_CSV)

# 抽取两列交集 —— 列名请与您的文件保持一致
mg_int_col = "M-MG&S-MG"
ag_int_col = "R-AG&S-AG"

if mg_int_col not in int_df.columns or ag_int_col not in int_df.columns:
    raise ValueError("请确认交集文件中存在列: "
                     f"{mg_int_col!r} 与 {ag_int_col!r}")

mg_int_set = set(int_df[mg_int_col].dropna().loc[lambda s: s != ""].unique())
ag_int_set = set(int_df[ag_int_col].dropna().loc[lambda s: s != ""].unique())

# 2) 读原始数据，用于取 R_AG ∪ S_AG 并集
raw_df = pd.read_csv(RAW_CSV)
ag_union_set = set(
    raw_df.loc[raw_df["dataset"].isin(["R-AG", "S-AG"]), "pathway_name"]
)

# ------------------ 比对 ------------------
# (1) MG交集特有 vs AG交集
uniq_mg_over_agint = mg_int_set - ag_int_set

# (2) MG交集特有 vs AG并集
uniq_mg_over_aguni = mg_int_set - ag_union_set

# (3) AG交集特有 vs MG交集
uniq_ag_over_mgint = ag_int_set - mg_int_set

# ------------------ 输出 ------------------
print(f"共有 {len(mg_int_set)} 条通路在 M_MG∩S_MG")
print(f"共有 {len(ag_int_set)} 条通路在 R_AG∩S_AG")
print(f"R_AG∪S_AG 并集共有 {len(ag_union_set)} 条通路\n")

print(f"(1) M_MG∩S_MG 独有 (相对 R_AG∩S_AG)：{len(uniq_mg_over_agint)}")
print(sorted(uniq_mg_over_agint), "\n")

print(f"(2) M_MG∩S_MG 独有 (相对 R_AG∪S_AG)：{len(uniq_mg_over_aguni)}")
print(sorted(uniq_mg_over_aguni), "\n")

print(f"(3) R_AG∩S_AG 独有 (相对 M_MG∩S_MG)：{len(uniq_ag_over_mgint)}")
print(sorted(uniq_ag_over_mgint))


ValueError: 请确认交集文件中存在列: 'M-MG&S-MG' 与 'R-AG&S-AG'