In [2]:
import os
from Bio import SeqIO
import pandas as pd
import numpy as np
import ete3

# Directory prefix (note the trailing slash) that the input/output file names
# in the cells below are appended to by plain string concatenation.
base_stats = "/data/luojaa/uid_stats/"

In [5]:
# # forgot to combine these files earlier.
# base = "/data/luojaa/kegg/kog_uid/"
# kog2uidpaths = [base + tsv for tsv in os.listdir(base)]
# with open("/data/luojaa/uid_stats/uid2descriptors.tsv","w") as outfile:
#     for path in kog2uidpaths:
#         with open(path, "r") as f:
#             for line in f:
#                 kog, uid, descriptor = line.strip().split("\t")
#                 print("\t".join([uid, descriptor]), file=outfile)

In [98]:
# Locations of the four per-UID input tables (base_stats comes from the first cell).
genes2uid = f"{base_stats}kegg_genes.mappings.csv"
uid2kogs = f"{base_stats}uid2kogs.csv"
uid2taxid = f"{base_stats}uid2taxids.combined.tsv"
uid2descriptor = f"{base_stats}uid2descriptors.tsv"

# KEGG gene table: the UNIPROT_ID column becomes the shared "UID" index.
genes2uid_df = (
    pd.read_csv(genes2uid)
    .rename(columns={"UNIPROT_ID": "UID"})
    .set_index("UID")
)
# Descriptors are missing for genes without a UID, and some UIDs lack descriptors.
# The first column of uid2kogs.csv is discarded; columns "0"/"1" hold UID and KOG.
uid2kogs_df = (
    pd.read_csv(uid2kogs)
    .iloc[:, 1:]
    .rename(columns={"0": "UID", "1": "KOG"})
    .set_index("UID")
)
# Header-less TSVs: column 0 is the UID, column 1 the value.
uid2taxid_df = (
    pd.read_csv(uid2taxid, sep="\t", header=None)
    .rename(columns={0: "UID", 1: "TAXID"})
    .set_index("UID")
)
uid2desc_df = (
    pd.read_csv(uid2descriptor, sep="\t", header=None)
    .rename(columns={0: "UID", 1: "NAME"})
    .set_index("UID")
)

In [99]:
# Outer-join all per-UID tables on the UID index so no gene or UID is dropped.
kogs_df = (
    uid2kogs_df
    .merge(genes2uid_df, on="UID", how="outer")
    .merge(uid2taxid_df, on="UID", how="outer")
    .merge(uid2desc_df, on="UID", how="outer")
)

# ENTRY comes from the KEGG gene scrape, and many of those genes have no UID.
# About half of the genes lack a "KOG" for that reason, so gaps are filled from ENTRY.
# Also, ~600k "KOGs" map to UIDs that were "discovered", i.e. not originally
# mapped by the KEGG gene scrape.
kogs_df["KOGID"] = kogs_df["KOG"].fillna(kogs_df["ENTRY"])

# Rows whose UID index is null are genes without a UID; count them per KOGID.
kogs_df["isnull"] = kogs_df.index.isnull()
null_count = (
    kogs_df[["isnull", "KOGID"]]
    .groupby("KOGID")
    .sum()
    .astype(int)
    .reset_index()
    .rename(columns={"isnull": "UIDS_MISSING"})
)

# Turn UID back into a column and attach the per-cluster missing-UID count.
kogs_df = kogs_df.reset_index().merge(null_count, on="KOGID", how="outer")

kogs_df


Unnamed: 0,UID,KOG,KEGG_ID,ENTRY,ALIAS,KEGG_CDS,KEGG_ORG,TAXID,NAME,KOGID,isnull,UIDS_MISSING
0,A0A022Q707,K00001,egt:105975019,K00001,,105975019,EGT,4155.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
1,A0A022R7F5,K00001,egt:105960042,K00001,,105960042,EGT,4155.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
2,A0A059A4S5,K00001,egr:104425885,K00001,,104425885,EGR,71139.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
3,A0A059A5P0,K00001,egr:104425887,K00001,,104425887,EGR,71139.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
4,A0A059ZDV9,K00001,abw:BL01_05170,K00001,,BL01_05170,ABW,470.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
...,...,...,...,...,...,...,...,...,...,...,...,...
27895405,,,tpul:TPB0596_18180,K27109,ctaF,TPB0596_18180,TPUL,,,K27109,True,482
27895406,,,tsd:MTP03_30470,K27109,,MTP03_30470,TSD,,,K27109,True,482
27895407,,,whr:OG579_04520,K27109,,OG579_04520,WHR,,,K27109,True,482
27895408,,,yia:LO772_24195,K27109,,LO772_24195,YIA,,,K27109,True,482


In [9]:
# Persist the merged table as a tab-separated file, without the row index.
kogs_df.to_csv(f"{base_stats}kogs_df.tsv", sep="\t", index=False)

### Map KOGs to the 3M eukaryotic UIDs that we include before realignment

In [100]:
# UID -> KOG table for the eukaryotic selection. Built from base_stats like the
# other input paths, so relocating the stats directory only needs one change.
euk_uid2kogs = base_stats + "euk_uid2kogs.csv"

In [101]:
# kog_fastas_path = "/data/luojaa/kog_fastas_backup/"
# kog_fastas = os.listdir(kog_fastas_path)
# with open("/data/luojaa/uid_stats/uid2kogs_fasta.csv", "w") as outfile:
#     for file in kog_fastas:
#         kog = file.strip(".fasta")
#         with open(kog_fastas_path + file, "r") as handle:
#             for record in SeqIO.parse(handle, 'fasta'):
#                 uid = record.id
#                 print(",".join([uid, kog]), file=outfile)

In [102]:
# Reduce the header-less (UID, KOGID) table to a UID-indexed Series named
# "ISEUK" that is True for every listed UID.
euk_uid2kogs_df = (
    pd.read_csv(euk_uid2kogs, header=None)
    .rename(columns={0: "UID", 1: "KOGID"})
    .set_index("UID")
    .assign(ISEUK=True)
    .loc[:, "ISEUK"]
)

In [103]:
# Reload the merged table written earlier, indexed by UID.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): this targets the warning this read
# emits, which looks like a mixed-dtype warning (e.g. KEGG_CDS holds both
# numeric and text IDs) -- confirm against the full warning text.
kogs_df_summary_in = pd.read_csv(
    base_stats + "kogs_df.tsv", sep="\t", low_memory=False
).set_index("UID")

  kogs_df_summary_in = pd.read_csv(base_stats + "kogs_df.tsv", sep = "\t").set_index("UID")


In [104]:
# Attach the eukaryotic flag to every row by UID.
kogs_df_iseuk = pd.merge(kogs_df_summary_in, euk_uid2kogs_df, on="UID", how="outer")
# UIDs absent from the eukaryotic list come out of the outer join as NaN.
# Comparing against True gives a real boolean column directly; the previous
# in-place fillna on this object column emitted a pandas warning.
kogs_df_iseuk["ISEUK"] = kogs_df_iseuk["ISEUK"].eq(True)

# Number of flagged rows per cluster.
euk_count = (
    kogs_df_iseuk.reset_index()
    .loc[:, ["ISEUK", "KOGID"]]
    .groupby("KOGID")
    .sum()
    .astype(int)
    .reset_index()
    .rename(columns={"ISEUK": "EUKCOUNT"})
)

# Turn UID back into a column and attach the per-cluster count to every row.
kogs_df_iseuk = kogs_df_iseuk.reset_index()
kogs_df_summary = pd.merge(kogs_df_iseuk, euk_count, on="KOGID", how="outer")

  kogs_df_iseuk.fillna({"ISEUK":False}, inplace = True)


In [105]:
# ete3 NCBI taxonomy handle; used in the next cell to translate TAXIDs into names.
# NOTE(review): presumably backed by a local taxonomy database -- confirm it is
# available on the machine before running.
ncbi = ete3.NCBITaxa()

In [106]:
# Translate each distinct taxid once. The TAXID column carries one float per
# row (tens of millions, heavily duplicated), so drop missing values, cast to
# int and de-duplicate before querying the taxonomy database.
taxids = kogs_df_summary["TAXID"].dropna().astype(int).unique().tolist()
taxid2name = ncbi.get_taxid_translator(taxids)

In [107]:
def translate_taxid(name, dic):
    """Look up ``name`` in ``dic``, returning NaN when there is no translation.

    Parameters
    ----------
    name : hashable
        Key to translate (a taxid in this notebook; NaN for rows without one).
    dic : mapping
        Translation table, e.g. taxid -> scientific name.

    Returns
    -------
    object
        ``dic[name]`` when the key is present, otherwise ``np.nan``.
    """
    try:
        return dic[name]
    # Only a missing key (KeyError) or an unhashable key (TypeError) means
    # "no translation". Anything else, such as KeyboardInterrupt or a failing
    # mapping backend, must propagate rather than be turned into NaN.
    except (KeyError, TypeError):
        return np.nan

In [108]:
# Species name per row; rows whose TAXID has no translation become NaN.
kogs_df_taxa = kogs_df_summary["TAXID"].apply(translate_taxid, args=(taxid2name,))

In [109]:
# Keep only the columns needed for lookups. The descriptor table is loaded
# under the column name "NAME", while this cell and the search helpers expect
# "FUNCTION". Rename it when "FUNCTION" is not already present, so the cell
# works both on a fresh run and on an older kogs_df.tsv that already has
# "FUNCTION".
kogs_df_crucial = (
    kogs_df_summary
    .rename(columns={} if "FUNCTION" in kogs_df_summary.columns else {"NAME": "FUNCTION"})
    .loc[:, ["UID", "KOGID", "FUNCTION", "ALIAS", "KEGG_ORG", "ISEUK"]]
    .assign(
        # A missing taxid is encoded as 0 so the column can be integer-typed.
        TAXID=kogs_df_summary["TAXID"].fillna(0).astype(int),
        SPECIES=kogs_df_taxa,
    )
)
# Same table with every remaining missing value replaced by the string "none",
# so that string searches never hit NaN.
kogs_df_searchable = kogs_df_crucial.fillna("none")

In [110]:
# Write both lookup tables as tab-separated files without the row index.
kogs_df_crucial.to_csv(f"{base_stats}kogs_df_crucial.tsv", sep="\t", index=False)
kogs_df_searchable.to_csv(f"{base_stats}kogs_df_searchable.tsv", sep="\t", index=False)

In [111]:
# Display the searchable table (missing values appear as the string "none").
kogs_df_searchable

Unnamed: 0,UID,KOGID,FUNCTION,ALIAS,KEGG_ORG,ISEUK,TAXID,SPECIES
0,A0A022Q707,K00001,alcohol dehydrogenase [EC:1.1.1.1],none,EGT,True,4155,Erythranthe guttata
1,A0A022R7F5,K00001,alcohol dehydrogenase [EC:1.1.1.1],none,EGT,True,4155,Erythranthe guttata
2,A0A059A4S5,K00001,alcohol dehydrogenase [EC:1.1.1.1],none,EGR,True,71139,Eucalyptus grandis
3,A0A059A5P0,K00001,alcohol dehydrogenase [EC:1.1.1.1],none,EGR,True,71139,Eucalyptus grandis
4,A0A059ZDV9,K00001,alcohol dehydrogenase [EC:1.1.1.1],none,ABW,False,470,Acinetobacter baumannii
...,...,...,...,...,...,...,...,...
27895405,none,K27109,none,ctaF,TPUL,False,0,none
27895406,none,K27109,none,none,TSD,False,0,none
27895407,none,K27109,none,none,WHR,False,0,none
27895408,none,K27109,none,none,YIA,False,0,none


### format cluster_stats

In [None]:
# Per-cluster count of rows that have a UID, carried alongside the two
# per-cluster constants (UIDS_MISSING, EUKCOUNT); indexed by KOGID.
fasta_stats_df = (
    kogs_df_summary[["KOGID", "UID", "UIDS_MISSING", "EUKCOUNT"]]
    .groupby(["KOGID", "UIDS_MISSING", "EUKCOUNT"])
    .count()
    .rename(columns={"UID": "UID_COUNT"})
    .reset_index(["UIDS_MISSING", "EUKCOUNT"])
)
# Total number of rows per cluster, with or without a UID.
cluster_size = kogs_df_summary.groupby("KOGID").size().to_frame("KOGSIZE")
fasta_stats_out = cluster_size.merge(fasta_stats_df, on="KOGID", how="outer").reset_index()
# Percentage of a cluster's UIDs that carry the eukaryotic flag.
fasta_stats_out["EUK_FRACTION"] = (
    fasta_stats_out["EUKCOUNT"].mul(100).div(fasta_stats_out["UID_COUNT"])
)

In [None]:
# Clusters with between 1 and 10 (inclusive) flagged eukaryotic UIDs.
fasta_stats_out.loc[
    fasta_stats_out["EUKCOUNT"].ge(1) & fasta_stats_out["EUKCOUNT"].le(10)
]

Unnamed: 0,KOGID,KOGSIZE,UIDS_MISSING,EUKCOUNT,UID_COUNT,%chosen
4,K00005,1619,807,4,812,0.492611
22,K00024,6526,2619,10,3907,0.255951
40,K00042,2632,1248,10,1384,0.722543
55,K00060,2408,1034,1,1374,0.072780
62,K00067,7109,2835,1,4274,0.023397
...,...,...,...,...,...,...
25713,K26382,1,0,1,1,100.000000
25714,K26383,7,2,5,5,100.000000
25890,K26560,13,4,9,9,100.000000
26140,K26810,23,20,3,3,100.000000


In [187]:
# Write the per-cluster statistics next to the other uid_stats tables. The
# path is built from base_stats, like the other outputs, so the directory is
# configured in one place.
fasta_stats_out.to_csv(base_stats + "cluster_stats.csv", sep=",", index=False)

### Format/explore KEGG "CATEGORIES"

TODO: stack on pathway/module/reaction info

In [70]:
# Load the three KEGG annotation tables and give them a common schema:
# KOGID / CATEGORY_ID / CATEGORY_NAME.
kegg_pathways = (
    pd.read_csv("/data/luojaa/kegg/kegg_pathways.58345.tsv", sep="\t")
    [["ENTRY", "PATHWAY_ID", "PATHWAY_NAME"]]
    .rename(columns={
        "PATHWAY_ID": "CATEGORY_ID",
        "PATHWAY_NAME": "CATEGORY_NAME",
        "ENTRY": "KOGID",
    })
)

kegg_rxns = (
    pd.read_csv("/data/luojaa/kegg/kegg_reactions.tsv", sep="\t")
    .rename(columns={
        "REACTION_ID": "CATEGORY_ID",
        "REACTION_NAME": "CATEGORY_NAME",
        "ENTRY": "KOGID",
    })
)
kegg_modules = (
    pd.read_csv("/data/luojaa/kegg/kegg_modules.tsv", sep="\t")
    .rename(columns={
        "MODULE_ID": "CATEGORY_ID",
        "MODULE_NAME": "CATEGORY_NAME",
        "ENTRY": "KOGID",
    })
)

# category: pathway/module/reaction; CID: map#####, CNAME: ___

In [74]:
# Stack the three tables, drop rows with any missing field, and renumber rows.
kegg_categories_df = (
    pd.concat([kegg_pathways, kegg_rxns, kegg_modules], axis=0)
    .dropna()
    .reset_index(drop=True)
)

In [75]:
# Save the combined category table as TSV without the row index.
kegg_categories_df.to_csv("/data/luojaa/kegg/kegg_categories.tsv", sep="\t", index=False)

### make kogs for each category easily retrievable

In [80]:
# Reload the combined category table written above.
kegg_categories = pd.read_table("/data/luojaa/kegg/kegg_categories.tsv")

In [81]:
# For every row, the comma-joined list of all KOG IDs that share its category
# (name + ID).
kegg_categories["KOGIDS"] = (
    kegg_categories
    .groupby(["CATEGORY_NAME", "CATEGORY_ID"])["KOGID"]
    .transform(",".join)
)

In [82]:
# Number of KOG rows per category. This groups by the same keys as KOGIDS
# (name + ID), so NUM_KOGS always equals the number of IDs in the KOGIDS
# string. The built-in "size" on the KOGID column replaces a Python lambda
# that ran over every column of the frame.
kegg_categories["NUM_KOGS"] = (
    kegg_categories
    .groupby(["CATEGORY_NAME", "CATEGORY_ID"])["KOGID"]
    .transform("size")
)

In [83]:
# One row per category: its name, ID, joined KOG list and KOG count.
kegg_categories_groups = (
    kegg_categories[["CATEGORY_NAME", "CATEGORY_ID", "KOGIDS", "NUM_KOGS"]]
    .drop_duplicates()
    .dropna()
)

In [84]:
# Save the per-category lookup table as TSV without the row index.
kegg_categories_groups.to_csv(
    "/data/luojaa/kegg/kegg_categories_searchable.tsv", sep="\t", index=False
)

In [94]:
# NOTE: this call depends on kernel state. It needs kogs_df_searchable (built
# in the earlier cells) and getkogs_wdescr (defined in the cell below) to
# already exist; it raised NameError the last time it was run.
getkogs_wdescr(kogs_df_searchable, "malate")

NameError: name 'kogs_df_searchable' is not defined

In [93]:
# explore dataframes
def filter_iseuk_kogid(df, kogid):
    """Return the rows of ``df`` that belong to ``kogid`` and have ISEUK set.

    ``df`` must provide a "KOGID" column and a boolean "ISEUK" column.
    """
    in_cluster = df["KOGID"].eq(kogid)
    flagged = df["ISEUK"]
    return df.loc[in_cluster & flagged]
#filter_iseuk_kogid(kogs_df_searchable, "K00029")
def getkogs_wdescr(df, descr):
    """Count, per KOGID, the rows whose FUNCTION text matches ``descr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with "FUNCTION" and "KOGID" columns.
    descr : str
        Pattern to search for; ``str.contains`` treats it as a regular
        expression by default.

    Returns
    -------
    pandas.Series
        Number of matching rows, indexed by KOGID.
    """
    # na=False: rows without a FUNCTION value count as non-matching. Without
    # it they yield NaN in the mask and the boolean indexing raises.
    matches = df["FUNCTION"].str.contains(descr, na=False)
    return df[matches]["KOGID"].value_counts()
#getkogs_wdescr(kogs_df_searchable, "malate dehydrogenase")
def getkogs_wcategory(df, category):
    """Map each category whose name matches ``category`` to its KOG IDs.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with "CATEGORY_NAME" and a comma-joined "KOGIDS" column.
    category : str
        Pattern to search for in CATEGORY_NAME; ``str.contains`` treats it as
        a regular expression by default.

    Returns
    -------
    dict
        ``{category name: [KOG ID, ...]}``. If several matching rows share
        the same name, the last one wins.
    """
    # na=False: rows without a category name count as non-matching instead of
    # producing a NaN mask that boolean indexing rejects.
    df_filtered = df[df["CATEGORY_NAME"].str.contains(category, na=False)]
    kogs_s = df_filtered["KOGIDS"].str.split(",")
    names_s = df_filtered["CATEGORY_NAME"]
    return dict(zip(names_s, kogs_s))
# Example lookup: every category whose name contains "malate", mapped to its KOG IDs.
getkogs_wcategory(kegg_categories_groups, "malate")

{'(S)-malate:NAD+ oxidoreductase': ['K00024', 'K00025', 'K00026'],
 '(S)-malate:NAD+ oxidoreductase (decarboxylating)': ['K00027', 'K00028'],
 '(S)-malate:NADP+ oxidoreductase(oxaloacetate-decarboxylating)': ['K00029'],
 '(S)-malate:NADP+ oxidoreductase': ['K00051'],
 '(2R,3S)-3-methylmalate:NAD+ oxidoreductase': ['K00052'],
 '(2R,3S)-3-isopropylmalate:NAD+ oxidoreductase': ['K00052', 'K21360'],
 '(S)-malate:oxygen oxidoreductase': ['K00116'],
 '(S)-malate:quinone oxidoreductase': ['K00116'],
 '(S)-malate:FAD oxidoreductase': ['K00116'],
 '(S)-malate hydro-lyase (fumarate-forming)': ['K01675',
  'K01676',
  'K01677',
  'K01678',
  'K01679',
  'K01774'],
 '2-isopropylmalate hydro-lyase': ['K01702', 'K01703', 'K01704', 'K21359'],
 '3-isopropylmalate hydro-lyase': ['K01702', 'K01703', 'K01704', 'K21359'],
 '(2R,3S)-3-isopropylmalate hydro-lyase (2-isopropylmaleate-forming)': ['K01702',
  'K01703',
  'K01704',
  'K21359'],
 '(R)-2-methylmalate hydro-lyase (2-methylmaleate-forming)': ['K017