In [17]:
import pandas as pd
from pathlib import Path

# 1. 根目录（请确保文件存在）
base = Path(r"D:\111\GO_KEGG_Results\intersections")

files = {
    "MG_GO" : "M.MG...R.MG...S.MG_GO_Result.csv",
    "MG_KEGG": "M.MG...R.MG...S.MG_KEGG_Result.csv",
    "ALL_GO"  : "All.Samples.Intersection_GO_Result.csv",
    "ALL_KEGG": "All.Samples.Intersection_KEGG_Result.csv",
    "AG_GO"  : "R.AG...S.AG_GO_Result.csv",
    "AG_KEGG": "R.AG...S.AG_KEGG_Result.csv",
}

def top50(path):
    df = pd.read_csv(base / path)
    if "p.adjust" in df.columns:
        df = df.sort_values("p.adjust").head(20)
    else:                           # 若无 p.adjust 列，则直接取前 50 行
        df = df.head(50)
    return df[["ID", "Description"]].dropna()

GO_all  = top50(files["ALL_GO"])
GO_mg   = top50(files["MG_GO"])
GO_ag   = top50(files["AG_GO"])
KEGG_all = top50(files["ALL_KEGG"])
KEGG_mg  = top50(files["MG_KEGG"])
KEGG_ag  = top50(files["AG_KEGG"])

In [18]:
KEGG_all

Unnamed: 0,ID,Description
0,mmu04820,Cytoskeleton in muscle cells - Mus musculus (h...
1,mmu04510,Focal adhesion - Mus musculus (house mouse)
2,mmu04512,ECM-receptor interaction - Mus musculus (house...
3,mmu05414,Dilated cardiomyopathy - Mus musculus (house m...
4,mmu04974,Protein digestion and absorption - Mus musculu...
5,mmu04270,Vascular smooth muscle contraction - Mus muscu...
6,mmu05410,Hypertrophic cardiomyopathy - Mus musculus (ho...
7,mmu05205,Proteoglycans in cancer - Mus musculus (house ...
8,mmu05412,Arrhythmogenic right ventricular cardiomyopath...
9,mmu04933,AGE-RAGE signaling pathway in diabetic complic...


In [19]:
KEGG_mg

Unnamed: 0,ID,Description
0,mmu04820,Cytoskeleton in muscle cells - Mus musculus (h...
1,mmu04512,ECM-receptor interaction - Mus musculus (house...
2,mmu04510,Focal adhesion - Mus musculus (house mouse)
3,mmu04151,PI3K-Akt signaling pathway - Mus musculus (hou...
4,mmu05414,Dilated cardiomyopathy - Mus musculus (house m...
5,mmu05205,Proteoglycans in cancer - Mus musculus (house ...
6,mmu05412,Arrhythmogenic right ventricular cardiomyopath...
7,mmu04270,Vascular smooth muscle contraction - Mus muscu...
8,mmu04072,Phospholipase D signaling pathway - Mus muscul...
9,mmu04921,Oxytocin signaling pathway - Mus musculus (hou...


In [21]:
# 3. 集合
GO_sets   = (set(GO_all["ID"]),  set(GO_mg["ID"]),  set(GO_ag["ID"]))
KEGG_sets = (set(KEGG_all["ID"]), set(KEGG_mg["ID"]), set(KEGG_ag["ID"]))

id2name_GO   = dict(zip(pd.concat([GO_all,  GO_mg,  GO_ag])["ID"],
                        pd.concat([GO_all,  GO_mg,  GO_ag])["Description"]))
id2name_KEGG = dict(zip(pd.concat([KEGG_all, KEGG_mg, KEGG_ag])["ID"],
                        pd.concat([KEGG_all, KEGG_mg, KEGG_ag])["Description"]))

def overlap(all_set, mg_set, ag_set):
    shared = all_set & mg_set & ag_set
    mg_u   = mg_set - all_set - ag_set
    ag_u   = ag_set - all_set - mg_set
    return shared, mg_u, ag_u

GO_shared,  GO_mg_u,  GO_ag_u  = overlap(*GO_sets)
KEGG_shared, KEGG_mg_u, KEGG_ag_u = overlap(*KEGG_sets)

def build_rows(shared, mg_u, ag_u, mapper, tag):
    """返回 dict: 列标题 -> 列值 list"""
    return {
        f"All-shared-{tag}-ID"   : sorted(shared),
        f"All-shared-{tag}-name" : [mapper[i] for i in sorted(shared)],
        f"MG-unique-{tag}-ID"    : sorted(mg_u),
        f"MG-unique-{tag}-name"  : [mapper[i] for i in sorted(mg_u)],
        f"AG-unique-{tag}-ID"    : sorted(ag_u),
        f"AG-unique-{tag}-name"  : [mapper[i] for i in sorted(ag_u)],
    }

rows = {**build_rows(GO_shared,   GO_mg_u,   GO_ag_u,   id2name_GO,   "GO"),
        **build_rows(KEGG_shared, KEGG_mg_u, KEGG_ag_u, id2name_KEGG, "KEGG")}

# 4. 让每个条目占一个单独的格子
max_len = max(len(v) for v in rows.values())
for k, v in rows.items():
    rows[k] = v + [""] * (max_len - len(v))   # 右侧填充空格

df_out = pd.DataFrame(rows)
out_path = base / "GO_KEGG_overlap_summary_split.csv"
df_out.to_csv(out_path, index=False, encoding="utf-8-sig")

print("✅ 已生成:", out_path)

✅ 已生成: D:\111\GO_KEGG_Results\intersections\GO_KEGG_overlap_summary_split.csv


In [23]:
import pandas as pd

base = r"D:\111\GO_KEGG_Results\intersections"

files = {
    'R_GO': 'R.MG...R.AG_GO_Result.csv',
    'R_KEGG': 'R.MG...R.AG_KEGG_Result.csv',
    'S_GO': 'S.MG...S.AG_GO_Result.csv',
    'S_KEGG': 'S.MG...S.AG_KEGG_Result.csv'
}

def top30(path):
    df = pd.read_csv(f'{base}/{path}')
    if 'p.adjust' in df.columns:
        df = df.sort_values('p.adjust')
    return df.head(30)[['ID','Description','p.adjust' if 'p.adjust' in df.columns else df.columns[2]]]

# Load top30 tables
R_GO_top = top30(files['R_GO'])
R_KEGG_top = top30(files['R_KEGG'])
S_GO_top = top30(files['S_GO'])
S_KEGG_top = top30(files['S_KEGG'])

display_dataframe_to_user('Rabbit top30 GO (MG∩AG)', R_GO_top)
display_dataframe_to_user('Rabbit top30 KEGG (MG∩AG)', R_KEGG_top)
display_dataframe_to_user('Sugar glider top30 GO (MG∩AG)', S_GO_top)
display_dataframe_to_user('Sugar glider top30 KEGG (MG∩AG)', S_KEGG_top)

# Compute intersections between MG and AG within species
def load_split(path):
    df = pd.read_csv(f'{base}/{path}')
    if 'Group' in df.columns:  # assume maybe Group column indicates MG or AG
        mg = set(df[df['Group'].str.contains('MG')]['ID'].head(30))
        ag = set(df[df['Group'].str.contains('AG')]['ID'].head(30))
        return mg, ag
    else:
        # if already combined top from intersection, use ID set directly
        return set(df['ID'].head(30)), set(df['ID'].head(30))

# For provided intersection files, sets are identical; we analyze overlap between species instead
R_GO_set = set(R_GO_top['ID'])
S_GO_set = set(S_GO_top['ID'])
R_KEGG_set = set(R_KEGG_top['ID'])
S_KEGG_set = set(S_KEGG_top['ID'])

# Overlap between species
GO_overlap = R_GO_set & S_GO_set
KEGG_overlap = R_KEGG_set & S_KEGG_set

display_dataframe_to_user('GO overlap Rabbit vs Sugar glider', pd.DataFrame({'ID': list(GO_overlap)}))
display_dataframe_to_user('KEGG overlap Rabbit vs Sugar glider', pd.DataFrame({'ID': list(KEGG_overlap)}))


NameError: name 'display_dataframe_to_user' is not defined

In [24]:
# species_overlap.py
# ----------------------------------------
# 统计 兔 (R) vs 蜜袋鼯 (S) 在 MG∩AG 交集的 GO / KEGG 独享与共有条目
# ----------------------------------------
import pandas as pd
from pathlib import Path

# === 1. 参数 ===
BASE_DIR = Path(r"D:\111\GO_KEGG_Results\intersections")            # 如 CSV 在其他路径，改这里
TOP_N    = 20                    # 取前 N 行

FILES = {
    "R_GO"   : "R.MG...R.AG_GO_Result.csv",
    "R_KEGG" : "R.MG...R.AG_KEGG_Result.csv",
    "S_GO"   : "S.MG...S.AG_GO_Result.csv",
    "S_KEGG" : "S.MG...S.AG_KEGG_Result.csv",
}

# === 2. 读取并取前 N 条 ===
def load_top(path):
    df = pd.read_csv(BASE_DIR / path)
    if "p.adjust" in df.columns:
        df = df.sort_values("p.adjust")
    return df.head(TOP_N)[["ID", "Description"]].dropna()

R_GO   = load_top(FILES["R_GO"])
R_KEGG = load_top(FILES["R_KEGG"])
S_GO   = load_top(FILES["S_GO"])
S_KEGG = load_top(FILES["S_KEGG"])

# === 3. 构建集合 & ID→name 映射 ===
def to_set(df):   return set(df["ID"])
def id2name(*dfs):
    concat = pd.concat(dfs).drop_duplicates("ID")
    return dict(zip(concat["ID"], concat["Description"]))

GO_sets   = (to_set(R_GO),   to_set(S_GO))
KEGG_sets = (to_set(R_KEGG), to_set(S_KEGG))

id2name_GO   = id2name(R_GO,   S_GO)
id2name_KEGG = id2name(R_KEGG, S_KEGG)

# === 4. 计算共有 / 独享 ===
def compare(r_set, s_set):
    shared = r_set & s_set
    r_only = r_set - s_set
    s_only = s_set - r_set
    return shared, r_only, s_only

GO_shared,  GO_r_only,  GO_s_only  = compare(*GO_sets)
KEGG_shared, KEGG_r_only, KEGG_s_only = compare(*KEGG_sets)

# === 5. 组装列（每个格子一个条目） ===
def build_cols(shared, r_only, s_only, mapper, tag):
    return {
        f"Shared-{tag}-ID" : sorted(shared),
        f"Shared-{tag}-name" : [mapper[i] for i in sorted(shared)],
        f"Rabbit-only-{tag}-ID" : sorted(r_only),
        f"Rabbit-only-{tag}-name" : [mapper[i] for i in sorted(r_only)],
        f"Sugarglider-only-{tag}-ID" : sorted(s_only),
        f"Sugarglider-only-{tag}-name" : [mapper[i] for i in sorted(s_only)],
    }

cols = {**build_cols(GO_shared, GO_r_only, GO_s_only, id2name_GO, "GO"),
        **build_cols(KEGG_shared, KEGG_r_only, KEGG_s_only, id2name_KEGG, "KEGG")}

# 对齐列长
max_len = max(len(v) for v in cols.values())
for k, v in cols.items():
    cols[k] = v + [""] * (max_len - len(v))

out_df = pd.DataFrame(cols)
out_path = BASE_DIR / "GO_KEGG_species_overlap.csv"
out_df.to_csv(out_path, index=False, encoding="utf-8-sig")

print("✅ 结果已保存:", out_path.resolve())


✅ 结果已保存: D:\111\GO_KEGG_Results\intersections\GO_KEGG_species_overlap.csv
