# Seed gene selection

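Prepares the seed gene sets for the pipeline runs. Requires the curated gene-disease association (GDA) summary files downloaded from [DISGENET](https://disgenet.com/), one per entry in `SEED_SETS`, saved as `<DATA_DIR>/disgenet/DISEASES_Summary_GDA_CURATED_<disgenet id>.tsv`. The resulting seed files are written to `<DATA_DIR>/seeds/<short name>.tsv`.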

In [3]:
import os
import pandas as pd
from config import SEED_SETS, DATA_DIR

In [6]:
# Make sure the output directory for the seed files exists.
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
seeds_dir = f"{DATA_DIR}/seeds"
os.makedirs(seeds_dir, exist_ok=True)

stats = []

# Read each downloaded DISGENET summary table (tab-separated, with a header row)
# and write its "Gene" column as a seed file (one column, no header).
for seed_set in SEED_SETS:
    df = pd.read_csv(
        f"{DATA_DIR}/disgenet/DISEASES_Summary_GDA_CURATED_{seed_set['disgenet']}.tsv",
        sep="\t",
    )
    # Drop missing and repeated gene entries so the seed file contains each
    # gene exactly once and no blank lines (first-occurrence order is kept).
    genes = df["Gene"].dropna().drop_duplicates()
    genes.to_csv(f"{seeds_dir}/{seed_set['short']}.tsv", sep="\t", index=False, header=False)
    # Report the number of genes actually written, not the raw row count.
    stats.append(seed_set | {"num_genes": len(genes)})

# print the stats
stats_df = pd.DataFrame(stats)
print(stats_df.to_string(index=False))

                         name short   mondo disgenet  num_genes
Amyotrophic lateral sclerosis   ALS 0004976 C0002736        127
          Lung adenocarcinoma  LUAD 0005061 C0152013        280
           Ulcerative colitis    UC 0005101 C0009324         76
                Crohn disease    CD 0005011 C0156147         78
           Huntington disease    HD 0007739 C0020179         40
