## Part-1 First we get the essential gene simulated by FBA with same threshold 90% growth reduction.

In [1]:
import os
import pandas as pd
from tqdm import tqdm

In [None]:
# ========== Paths ==========
essential_file = "/data1/xpgeng/cross_pathogen/one_two_knockout/iML1515_1KO_growth"
combo_file = "/data1/xpgeng/cross_pathogen/FBA/iML1515_all.csv"
out_file = "/data1/xpgeng/cross_pathogen/FBA/non-essential.csv"

# ========== Step 1: Read essential genes (label==1) ==========
essential_genes = []

with open(essential_file, "r") as f:
    next(f)  # skip header line
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = [p.strip() for p in line.split(",")]
        # expected: gene, val1, val2, label
        gene = parts[0]
        label = parts[-1]

        try:
            if int(float(label)) == 1:
                essential_genes.append(gene)
        except:
            continue

essential_genes = sorted(set(essential_genes))
essential_set = set(essential_genes)

print(f"Essential genes (label==1): n={len(essential_genes)}")
print(essential_genes)

# ========== Step 2: Read combination CSV ==========
df = pd.read_csv(combo_file, header=None)
print(f"\nLoaded combos: shape={df.shape}")

# ========== Step 3: Filter rows that contain NO essential genes ==========
keep_idx = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Filtering rows"):
    genes_in_row = set()

    for x in row.values:
        if pd.isna(x):
            continue
        s = str(x).strip()
        if not s or s.lower() == "nan":
            continue
        genes_in_row.add(s)

    # keep the row if it has zero overlap with essential genes
    if genes_in_row.isdisjoint(essential_set):
        keep_idx.append(idx)

non_essential_df = df.loc[keep_idx].reset_index(drop=True)
print(f"Rows with NO essential genes: {len(non_essential_df)}")

# ========== Step 4: Write output ==========
os.makedirs(os.path.dirname(out_file), exist_ok=True)
non_essential_df.to_csv(out_file, header=False, index=False)
print(f"Saved: {out_file}")

Essential genes (label==1): n=196
['b0003', 'b0004', 'b0025', 'b0029', 'b0031', 'b0052', 'b0054', 'b0071', 'b0072', 'b0074', 'b0084', 'b0085', 'b0086', 'b0087', 'b0088', 'b0089', 'b0090', 'b0091', 'b0096', 'b0103', 'b0109', 'b0131', 'b0133', 'b0134', 'b0142', 'b0154', 'b0159', 'b0166', 'b0173', 'b0174', 'b0175', 'b0179', 'b0180', 'b0181', 'b0182', 'b0185', 'b0242', 'b0243', 'b0369', 'b0386', 'b0414', 'b0415', 'b0417', 'b0420', 'b0421', 'b0423', 'b0522', 'b0523', 'b0524', 'b0635', 'b0639', 'b0641', 'b0720', 'b0750', 'b0774', 'b0775', 'b0776', 'b0777', 'b0778', 'b0908', 'b0914', 'b0915', 'b0918', 'b1062', 'b1069', 'b1091', 'b1092', 'b1093', 'b1094', 'b1098', 'b1131', 'b1136', 'b1208', 'b1210', 'b1215', 'b1260', 'b1261', 'b1262', 'b1263', 'b1264', 'b1277', 'b1281', 'b1288', 'b1662', 'b1693', 'b1740', 'b1812', 'b2019', 'b2020', 'b2021', 'b2022', 'b2023', 'b2024', 'b2025', 'b2026', 'b2103', 'b2153', 'b2312', 'b2315', 'b2316', 'b2323', 'b2329', 'b2400', 'b2472', 'b2476', 'b2478', 'b2499', 'b

Filtering rows:   3%|█▎                                      | 19716247/576107619 [20:08<9:03:25, 17064.14it/s]

In [2]:
# ========== Paths ==========
essential_file = "/data1/xpgeng/cross_pathogen/one_two_knockout/iML1515_1KO_growth"
combo_file = "/data1/xpgeng/cross_pathogen/FBA/iML1515_all.csv"
out_file = "/data1/xpgeng/cross_pathogen/FBA/iML1515_all_with-essential.csv"

# ========== Step 1: Read essential genes (label==1) ==========
essential_genes = []

with open(essential_file, "r") as f:
    next(f)  # skip header line
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = [p.strip() for p in line.split(",")]
        # expected: gene, val1, val2, label
        gene = parts[0]
        label = parts[-1]

        try:
            if int(float(label)) == 1:
                essential_genes.append(gene)
        except:
            continue

essential_genes = sorted(set(essential_genes))
essential_set = set(essential_genes)

print(f"Essential genes (label==1): n={len(essential_genes)}")
print(essential_genes)

# ========== Step 2: Read combination CSV ==========
df = pd.read_csv(combo_file, header=None)
print(f"\nLoaded combos: shape={df.shape}")

# ========== Step 3: Filter rows that contain NO essential genes ==========
keep_idx = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Filtering rows"):
    genes_in_row = set()

    for x in row.values:
        if pd.isna(x):
            continue
        s = str(x).strip()
        if not s or s.lower() == "nan":
            continue
        genes_in_row.add(s)

    # keep the row if it has at least one overlap with essential genes
    if not genes_in_row.isdisjoint(essential_set):
        keep_idx.append(idx)

essential_df = df.loc[keep_idx].reset_index(drop=True)
print(f"Rows with essential genes: {len(essential_df)}")

# ========== Step 4: Write output ==========
os.makedirs(os.path.dirname(out_file), exist_ok=True)
essential_df.to_csv(out_file, header=False, index=False)
print(f"Saved: {out_file}")

Essential genes (label==1): n=196
['b0003', 'b0004', 'b0025', 'b0029', 'b0031', 'b0052', 'b0054', 'b0071', 'b0072', 'b0074', 'b0084', 'b0085', 'b0086', 'b0087', 'b0088', 'b0089', 'b0090', 'b0091', 'b0096', 'b0103', 'b0109', 'b0131', 'b0133', 'b0134', 'b0142', 'b0154', 'b0159', 'b0166', 'b0173', 'b0174', 'b0175', 'b0179', 'b0180', 'b0181', 'b0182', 'b0185', 'b0242', 'b0243', 'b0369', 'b0386', 'b0414', 'b0415', 'b0417', 'b0420', 'b0421', 'b0423', 'b0522', 'b0523', 'b0524', 'b0635', 'b0639', 'b0641', 'b0720', 'b0750', 'b0774', 'b0775', 'b0776', 'b0777', 'b0778', 'b0908', 'b0914', 'b0915', 'b0918', 'b1062', 'b1069', 'b1091', 'b1092', 'b1093', 'b1094', 'b1098', 'b1131', 'b1136', 'b1208', 'b1210', 'b1215', 'b1260', 'b1261', 'b1262', 'b1263', 'b1264', 'b1277', 'b1281', 'b1288', 'b1662', 'b1693', 'b1740', 'b1812', 'b2019', 'b2020', 'b2021', 'b2022', 'b2023', 'b2024', 'b2025', 'b2026', 'b2103', 'b2153', 'b2312', 'b2315', 'b2316', 'b2323', 'b2329', 'b2400', 'b2472', 'b2476', 'b2478', 'b2499', 'b

Filtering rows: 100%|███████████████████████████████████████| 576107619/576107619 [8:45:05<00:00, 18285.95it/s]


Rows with essential genes: 195387920
Saved: /data1/xpgeng/cross_pathogen/FBA/iML1515_all_with-essential.csv


In [4]:
import pandas as pd

# 文件路径
file_path = '/data1/xpgeng/cross_pathogen/FBA/iML1515_all_non_essential.csv'

# 读取 CSV 文件
df = pd.read_csv(file_path, header=None)  # 如果没有表头，使用 header=None

# 获取最后一列的列名
last_column = df.columns[-1]

# 计算最后一列中0和1的数量
count_zeros = (df[last_column] == 0).sum()
count_ones = (df[last_column] == 1).sum()

# 计算总行数
total_rows = len(df)

print(f" total rows: {total_rows}")
print(f" number of 0: {count_zeros}")
print(f" number of 1: {count_ones}")

 total rows: 380719699
 number of 0: 380596069
 number of 1: 123630


In [7]:
print(df.head(10))

       0      1      2  3
0  b0033  b0036  b0037  0
1  b0033  b0036  b0038  0
2  b0033  b0036  b0040  0
3  b0033  b0036  b0046  0
4  b0033  b0036  b0047  0
5  b0033  b0036  b0048  0
6  b0033  b0036  b0049  0
7  b0033  b0036  b0061  0
8  b0033  b0036  b0062  0
9  b0033  b0036  b0063  0


In [5]:
import pandas as pd

# 文件路径
file_path = '/data1/xpgeng/cross_pathogen/FBA/iML1515_all_with-essential.csv'

# 读取 CSV 文件
df = pd.read_csv(file_path, header=None)  # 如果没有表头，使用 header=None

# 获取最后一列的列名
last_column = df.columns[-1]

# 计算最后一列中0和1的数量
count_zeros = (df[last_column] == 0).sum()
count_ones = (df[last_column] == 1).sum()

# 计算总行数
total_rows = len(df)

print(f" total rows: {total_rows}")
print(f" number of 0: {count_zeros}")
print(f" number of 1: {count_ones}")

 total rows: 195387920
 number of 0: 0
 number of 1: 195387920


## Part-2 we use all the positive samples and same the negative samples to build no-essential triples dataset

In [6]:
# 从刚才读取的文件里随机选择为0 的，所有label为1 的，作为不包含essential gene 的三基因组合subsampling
import os
import pandas as pd

# ====== Input ======
NONESSENTIAL_CSV = '/data1/xpgeng/cross_pathogen/FBA/iML1515_all_non_essential.csv'  # 4列，无表头: g1,g2,g3,y
SEED = 42

# ====== Output ======
SAMPLED_CSV = '/data1/xpgeng/cross_pathogen/sci_rep_revision_20260113/Baseline/triples_no_essential_123630x2-0_123630-1.csv'

# 读取：无表头，手动指定列名
df = pd.read_csv(NONESSENTIAL_CSV, header=None, names=['g1','g2','g3','y'])

# 保证 y 是 int
df['y'] = df['y'].astype(int)

# 抽样数量
N1 = 123630
N0 = 123630*2

df0 = df[df['y'] == 0]
df1 = df[df['y'] == 1]

n0 = min(N0, len(df0))
n1 = min(N1, len(df1))

if n0 < N0:
    print(f"[WARN] y=0 only has {len(df0)} rows, sampling {n0}")
if n1 < N1:
    print(f"[WARN] y=1 only has {len(df1)} rows, sampling {n1}")

sample0 = df0.sample(n=n0, random_state=SEED, replace=False)
sample1 = df1.sample(n=n1, random_state=SEED, replace=False)

df_sample = pd.concat([sample0, sample1], axis=0).sample(frac=1.0, random_state=SEED).reset_index(drop=True)

# 保存：不写 index，不写表头（保持和原始格式一致）
os.makedirs(os.path.dirname(SAMPLED_CSV), exist_ok=True)
df_sample.to_csv(SAMPLED_CSV, index=False, header=False)

print("✅ Saved sampled dataset:", SAMPLED_CSV)
print("Shape:", df_sample.shape)
print("Label counts:\n", df_sample['y'].value_counts())


✅ Saved sampled dataset: /data1/xpgeng/cross_pathogen/sci_rep_revision_20260113/Baseline/triples_no_essential_123630x2-0_123630-1.csv
Shape: (370890, 4)
Label counts:
 y
0    247260
1    123630
Name: count, dtype: int64


## Part-2 we extract 123630 positive samples to build with-essential triples dataset

In [7]:
# 从刚才读取的文件里随机选择为0 的，所有label为1 的，作为不包含essential gene 的三基因组合subsampling
import os
import pandas as pd

# ====== Input ======
NONESSENTIAL_CSV = '/data1/xpgeng/cross_pathogen/FBA/iML1515_all_with-essential.csv'  # 4列，无表头: g1,g2,g3,y
SEED = 42

# ====== Output ======
SAMPLED_CSV = '/data1/xpgeng/cross_pathogen/sci_rep_revision_20260113/Baseline/triples_with_essential_0-0_123630-1.csv'

# 读取：无表头，手动指定列名
df = pd.read_csv(NONESSENTIAL_CSV, header=None, names=['g1','g2','g3','y'])

# 保证 y 是 int
df['y'] = df['y'].astype(int)

# 抽样数量
N1 = 123630
N0 = 0

df0 = df[df['y'] == 0]
df1 = df[df['y'] == 1]

n0 = min(N0, len(df0))
n1 = min(N1, len(df1))

if n0 < N0:
    print(f"[WARN] y=0 only has {len(df0)} rows, sampling {n0}")
if n1 < N1:
    print(f"[WARN] y=1 only has {len(df1)} rows, sampling {n1}")

sample0 = df0.sample(n=n0, random_state=SEED, replace=False)
sample1 = df1.sample(n=n1, random_state=SEED, replace=False)

df_sample = pd.concat([sample0, sample1], axis=0).sample(frac=1.0, random_state=SEED).reset_index(drop=True)

# 保存：不写 index，不写表头（保持和原始格式一致）
os.makedirs(os.path.dirname(SAMPLED_CSV), exist_ok=True)
df_sample.to_csv(SAMPLED_CSV, index=False, header=False)

print("✅ Saved sampled dataset:", SAMPLED_CSV)
print("Shape:", df_sample.shape)
print("Label counts:\n", df_sample['y'].value_counts())


✅ Saved sampled dataset: /data1/xpgeng/cross_pathogen/sci_rep_revision_20260113/Baseline/triples_with_essential_0-0_123630-1.csv
Shape: (123630, 4)
Label counts:
 y
1    123630
Name: count, dtype: int64


## The following code is analyzing these new triples.


In [1]:
# 从刚才读取的文件里随机选择label为1 的，作为不包含essential gene 的三基因组合all
import os
import pandas as pd

# ====== Input ======
NONESSENTIAL_CSV = '/data1/xpgeng/cross_pathogen/FBA/iML1515_all_non-essential.csv'  # 4列，无表头: g1,g2,g3,y
SEED = 42

# ====== Output ======
SAMPLED_CSV = '/data1/xpgeng/cross_pathogen/FBA/E.coli_K-12_MG1655_lethal_triple_no_individual_lethal_genes_123630_count_by_FBA_20260213_v.01.csv'

# 读取：无表头，手动指定列名
df = pd.read_csv(NONESSENTIAL_CSV, header=None, names=['g1','g2','g3','y'])

# 保证 y 是 int
df['y'] = df['y'].astype(int)

# 抽样数量
N0 = 0
N1 = 123630

df0 = df[df['y'] == 0]
df1 = df[df['y'] == 1]

n0 = min(N0, len(df0))
n1 = min(N1, len(df1))

if n0 < N0:
    print(f"[WARN] y=0 only has {len(df0)} rows, sampling {n0}")
if n1 < N1:
    print(f"[WARN] y=1 only has {len(df1)} rows, sampling {n1}")

sample0 = df0.sample(n=n0, random_state=SEED, replace=False)
sample1 = df1.sample(n=n1, random_state=SEED, replace=False)

df_sample = pd.concat([sample0, sample1], axis=0).sample(frac=1.0, random_state=SEED).reset_index(drop=True)

# 保存：不写 index，不写表头（保持和原始格式一致）
os.makedirs(os.path.dirname(SAMPLED_CSV), exist_ok=True)
df_sample.to_csv(SAMPLED_CSV, index=False, header=False)

print("✅ Saved sampled dataset:", SAMPLED_CSV)
print("Shape:", df_sample.shape)
print("Label counts:\n", df_sample['y'].value_counts())

✅ Saved sampled dataset: /data1/xpgeng/cross_pathogen/FBA/E.coli_K-12_MG1655_lethal_triple_no_individual_lethal_genes_123630_count_by_FBA_20260213_v.01.csv
Shape: (123630, 4)
Label counts:
 y
1    123630
Name: count, dtype: int64


In [2]:
import pandas as pd

# 你的文件路径
SAMPLED_CSV = '/data1/xpgeng/cross_pathogen/FBA/E.coli_K-12_MG1655_lethal_triple_no_individual_lethal_genes_123630_count_by_FBA_20260213_v.01.csv'

# 读取数据
# 假设前三列是基因，这里我们直接读取前三列
df = pd.read_csv(SAMPLED_CSV)

# 将前三列的所有值提取出来，并转换为一维序列
# 假设你的列名分别是 'gene1', 'gene2', 'gene3' (或者根据索引提取)
# 如果没有表头，可以加上 header=None
all_genes = df.iloc[:, 0:3].values.flatten()

# 转为集合去重
unique_genes = set(all_genes)

print(f"总计行数: {len(df)}")
print(f"涉及到的基因总数（去重后）: {len(unique_genes)}")

# 如果你想查看具体的基因列表，可以取消下面这一行的注释
# print(list(unique_genes))

总计行数: 123629
涉及到的基因总数（去重后）: 1318


In [5]:
import pandas as pd
from itertools import combinations
from collections import Counter

# 1. 读取数据
SAMPLED_CSV = '/data1/xpgeng/cross_pathogen/FBA/E.coli_K-12_MG1655_lethal_triple_no_individual_lethal_genes_123630_count_by_FBA_20260213_v.01.csv'
df = pd.read_csv(SAMPLED_CSV)

# 2. 提取每行的基因对（无序组合）
pair_counts = Counter()

for _, row in df.iloc[:, 0:3].iterrows():
    # 对每一行的 3 个基因进行排序并取两两组合，确保 (A, B) 和 (B, A) 被视为同一个对
    genes = sorted(row.values)
    pairs = list(combinations(genes, 2))
    pair_counts.update(pairs)

# 3. 转换为 DataFrame 方便分析
pair_df = pd.DataFrame(pair_counts.items(), columns=['Gene_Pair', 'Frequency'])
pair_df = pair_df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)

# 4. 输出结果
print("出现频率最高的前 10 个基因对：")
print(pair_df.head(200))

# 可选：筛选出出现次数大于 1 的“频繁对”
frequent_pairs = pair_df[pair_df['Frequency'] > 1]
print(f"\n共有 {len(frequent_pairs)} 组对在不同行中重复出现。")

出现频率最高的前 10 个基因对：
          Gene_Pair  Frequency
0    (b0078, b3671)       1316
1    (b0048, b1606)       1316
2    (b0763, b2424)       1316
3    (b0077, b3671)       1316
4    (b0002, b3940)       1316
..              ...        ...
195  (b2422, b2529)          9
196  (b1592, b1779)          9
197  (b2926, b2935)          9
198  (b0763, b2529)          9
199  (b2913, b2926)          9

[200 rows x 2 columns]

共有 54767 组对在不同行中重复出现。


In [6]:
import pandas as pd
from itertools import combinations
from collections import Counter

# 1. 路径设置
SAMPLED_CSV = '/data1/xpgeng/cross_pathogen/FBA/E.coli_K-12_MG1655_lethal_triple_no_individual_lethal_genes_123630_count_by_FBA_20260213_v.01.csv'
OUTPUT_CSV = '/data1/xpgeng/cross_pathogen/FBA/E.coli_K-12-MG1655_lethal_new_triple_frequent_gene_pairs_analysis.csv'

# 2. 读取并处理
print("正在读取数据...")
df = pd.read_csv(SAMPLED_CSV)

# 统计基因对频率
pair_counts = Counter()

print("正在提取基因对并进行统计...")
# 仅取前三列，并逐行处理
for row in df.iloc[:, 0:3].values:
    # 排序以保证无序性 (A,B) == (B,A)
    genes = sorted(row)
    # 生成 3 条 pairs: (G1,G2), (G1,G3), (G2,G3)
    pairs = combinations(genes, 2)
    pair_counts.update(pairs)

# 3. 转换为 DataFrame
# 将元组 (GeneA, GeneB) 拆分为两列，方便后续分析
pair_data = []
for (g1, g2), freq in pair_counts.items():
    if freq > 1:  # 核心逻辑：过滤掉只出现 1 次的
        pair_data.append([g1, g2, freq])

result_df = pd.DataFrame(pair_data, columns=['Gene_A', 'Gene_B', 'Frequency'])

# 4. 排序并保存
result_df = result_df.sort_values(by='Frequency', ascending=False)
result_df.to_csv(OUTPUT_CSV, index=False)

print(f"处理完成！")
print(f"总计发现出现次数 > 1 的基因对数量: {len(result_df)}")
print(f"结果已保存至: {OUTPUT_CSV}")

正在读取数据...
正在提取基因对并进行统计...
处理完成！
总计发现出现次数 > 1 的基因对数量: 54767
结果已保存至: /data1/xpgeng/cross_pathogen/FBA/E.coli_K-12-MG1655_lethal_new_triple_frequent_gene_pairs_analysis.csv
