In [2]:
import pandas as pd
import os

In [20]:
# Load the representative-sequence table and keep only the columns used downstream.
repseq_df = pd.read_csv("repseq.microcosms.tmp.tsv", sep="\t")
accessions_df = repseq_df.loc[:, ["Query", "EC", "domain"]].assign(
    # True where the domain label contains "euk".
    ISEUK=lambda frame: frame["domain"].str.contains("euk")
)

# Read the prok_ and euk_ HMM annotation tables and stack them into one frame.
prok_annot, euk_annot = (
    pd.read_csv(annot_path, sep="\t")
    for annot_path in (
        "prok_annot_hmm.permissive.repseq.tsv",
        "euk_annot_hmm.permissive.repseq.tsv",
    )
)
kog_labels = pd.concat([prok_annot, euk_annot])

In [50]:
# Build ec_2kog_dict: complete EC number -> "|"-joined ids of every KOG listed for it.
ec_koginfo = pd.read_csv("/data/luojaa/kegg_stats/KOG_merged.tsv", sep = "\t")
# The EC column can hold several "|"-separated numbers; explode so each gets its own row.
ec_koginfo["EC_single"] = ec_koginfo["EC"].str.split("|")
ec_koginfo = ec_koginfo.explode("EC_single")
# Rows whose EC is the literal "none" are excluded from the lookup.
ec_2kog = ec_koginfo[ec_koginfo["EC_single"] != "none"].loc[:, ["kogid", "name", "EC_single"]]
# De-duplicate the ids per EC and sort them so the joined string is the same on
# every run (plain set iteration order is not stable for strings).
ec_2kogs = (
    ec_2kog.groupby("EC_single")["kogid"]
    .apply(lambda ids: "|".join(sorted(set(ids))))
    .reset_index()
    .rename(columns={"EC_single": "EC", "kogid": "KOGS"})
)
# drop incomplete ec numbers (any EC string containing "-")
ec_2kogs = ec_2kogs[~ec_2kogs["EC"].str.contains("-")]
ec_2kog_dict = ec_2kogs.set_index("EC")["KOGS"].to_dict()
# NOTE(review): 3.6.4.12 is given the same KOGs as 5.6.2.3 — presumably the two EC
# numbers describe the same activity under old/new numbering; confirm.
# This line raises KeyError if 5.6.2.3 is absent from the table.
ec_2kog_dict["3.6.4.12"] = ec_2kog_dict["5.6.2.3"]
def skip_ec(ec):
    """Return the KOG string stored for ``ec`` in ``ec_2kog_dict``.

    Args:
        ec: EC number to look up. Any hashable value is accepted; a value with
            no entry (including NaN) is treated as unmapped.

    Returns:
        The value stored under ``ec``, or the literal string "None" when
        ``ec`` is not a key of ``ec_2kog_dict``.
    """
    # dict.get covers only the "no such key" case. Unlike a bare ``except`` it
    # does not hide unrelated failures, e.g. ec_2kog_dict not being defined yet.
    return ec_2kog_dict.get(ec, "None")

In [51]:
# Translate every row's EC into its KOG string via skip_ec ("None" when unmapped).
accessions_df["KOGS"] = accessions_df["EC"].map(skip_ec)


In [66]:
# Move the "Query" column into the index of both tables.
# The guards make this cell safe to re-run: once "Query" is the index, a second
# set_index("Query") would raise KeyError because the column no longer exists.
if accessions_df.index.name != "Query":
    accessions_df = accessions_df.set_index("Query")
if kog_labels.index.name != "Query":
    kog_labels = kog_labels.set_index("Query")

In [68]:
# Left-join kog_labels onto accessions_df by index, then replace every missing
# value in the merged frame with the string "None".
accessions_df_eckog = accessions_df.merge(
    kog_labels,
    how="left",
    left_index=True,
    right_index=True,
).fillna("None")

In [87]:
def top_kog(s1, s2):
    """Pick, row by row, the first KOG listed in ``s2`` that also occurs in ``s1``.

    Args:
        s1: Iterable of strings, each either "None" or "|"-separated KOG ids.
        s2: Iterable of strings in the same format, aligned with ``s1`` by
            position. Its left-to-right order decides which shared KOG wins.

    Returns:
        A list with one entry per element of ``s1``: the first id of the
        ``s2`` entry that is also present in the ``s1`` entry, or the string
        "None" when either entry is "None" or the two share no id.
    """
    allowed_col, candidate_col = list(s1), list(s2)
    picks = []
    for row, allowed in enumerate(allowed_col):
        # to do - in future, if DL/UC 1:1 is not priority, allow l1 = None for 'discovery' of new microcosms
        if allowed == "None" or candidate_col[row] == "None":
            picks.append("None")
            continue
        # Split the s1 entry once per row; a set gives constant-time membership tests.
        allowed_ids = set(allowed.split("|"))
        first_shared = next(
            (kog for kog in candidate_col[row].split("|") if kog in allowed_ids),
            "None",
        )
        picks.append(first_shared)
    return picks
                

In [117]:
# Choose one KOG per row, then label each row "<EC>_<chosen KOG>" and turn the
# index back into an ordinary column.
top_kogs = top_kog(accessions_df_eckog["KOGS"], accessions_df_eckog["KOG"])
accessions_df_eckog = accessions_df_eckog.assign(
    KOG_microcosm=top_kogs,
    EC_KOG=lambda frame: frame["EC"] + "_" + frame["KOG_microcosm"],
).reset_index()

In [118]:
# Count the Query entries in every (EC_KOG, ISEUK) group.
acc_counts = (
    accessions_df_eckog
    .groupby(["EC_KOG", "ISEUK"])["Query"]
    .count()
    .reset_index()
)
# filter out small clades: ISEUK == True groups need more than 5 entries,
# ISEUK == False groups need more than 3
acc_counts_filtered = acc_counts.loc[
    lambda counts: (counts["ISEUK"].eq(True) & counts["Query"].gt(5))
    | (counts["ISEUK"].eq(False) & counts["Query"].gt(3))
]
# require both euk and prok clades of proper size: an EC_KOG qualifies only when
# both of its ISEUK groups survived the size filter
ec_bothdom = (
    acc_counts_filtered
    .groupby("EC_KOG")["ISEUK"]
    .nunique(dropna=False)
    .eq(2)
)
ec_bothdom_list = ec_bothdom[ec_bothdom].index.tolist()
accessions_df_rel = accessions_df_eckog.loc[
    accessions_df_eckog["EC_KOG"].isin(ec_bothdom_list)
]

In [120]:
# One row per (EC_KOG, ISEUK) group, with the group's Query values joined by newlines.
accessions_outdf = (
    accessions_df_rel
    .groupby(["EC_KOG", "ISEUK"])["Query"]
    .agg("\n".join)
    .reset_index()
)

In [124]:
# Pull the three output columns into parallel Python lists (one entry per row).
ecs, iseuks, uids = (
    accessions_outdf[column].tolist() for column in ("EC_KOG", "ISEUK", "Query")
)

In [126]:
# Write one accession file per row of the parallel lists ecs / iseuks / uids:
#   <outpathroot>/<ec>/<ec>.target.acc  when the row's ISEUK flag is falsy
#   <outpathroot>/<ec>/<ec>.query.acc   when it is truthy
outpathroot = "/data/luojaa/eukgen/processing/microcosm_dl_minHGT/"
for ec, iseuk, uid in zip(ecs, iseuks, uids):
    outdir = os.path.join(outpathroot, ec)
    # Create the directory for whichever row of this ec is written first.
    # makedirs avoids building a shell command from data, tolerates an existing
    # directory, and creates missing parents.
    os.makedirs(outdir, exist_ok=True)
    suffix = "query" if iseuk else "target"
    outpath = os.path.join(outdir, f"{ec}.{suffix}.acc")
    with open(outpath, "w") as outfile:
        # print appends a trailing newline after the uid text
        print(uid, file=outfile)