In [1]:
import os
from Bio import SeqIO
import pandas as pd
import numpy as np

In [5]:
# # forgot to combine these files earlier.
# base = "/data/luojaa/kegg/kog_uid/"
# kog2uidpaths = [base + tsv for tsv in os.listdir(base)]
# with open("/data/luojaa/uid_stats/uid2descriptors.tsv","w") as outfile:
#     for path in kog2uidpaths:
#         with open(path, "r") as f:
#             for line in f:
#                 kog, uid, descriptor = line.strip().split("\t")
#                 print("\t".join([uid, descriptor]), file=outfile)

In [6]:
base_stats = "/data/luojaa/uid_stats/"
genes2uid = base_stats + "kegg_genes.mappings.csv"
uid2kogs = base_stats + "uid2kogs.csv"
uid2taxid = base_stats + "uid2taxids.combined.tsv"
uid2descriptor = base_stats + "uid2descriptors.tsv"

genes2uid_df = pd.read_csv(genes2uid).rename(columns={"UNIPROT_ID":"UID"}).set_index("UID")
# missing descriptors for genes w/o UID, and some UIDs lacking descriptors
uid2kogs_df = pd.read_csv(uid2kogs).iloc[:, 1:].rename(columns = {"0":"UID","1":"KOG"}).set_index("UID")
uid2taxid_df = pd.read_csv(uid2taxid, sep = "\t", header = None).rename(columns={0:"UID",1:"TAXID"}).set_index("UID")
uid2desc_df = pd.read_csv(uid2descriptor, sep = "\t", header = None).rename(columns={0:"UID",1:"FUNCTION"}).set_index("UID")

In [46]:
# 400k uids from scraping don't appear in genes2uid because uniprot mapper isn't updated
len(set(uid2kogs_df.index)), len(set(genes2uid_df.index))

(12462195, 12015899)

In [72]:
sum(kogs_df["KOG"].isnull()), sum(kogs_df.index.isnull())

(15055848, 14897628)

In [104]:
sum(kogs_df["KOG"] != kogs_df["ENTRY"])

15703492

In [7]:
kogs_df = pd.merge(uid2kogs_df, genes2uid_df, on="UID", how = "outer")
kogs_df = pd.merge(kogs_df, uid2taxid_df, on = "UID", how = "outer")
kogs_df = pd.merge(kogs_df, uid2desc_df, on="UID", how = "outer")
# ENTRY is from KEGG gene scraping, and many of these genes don't have UIDs
# 50% genes don't have "KOGs" for this reason, so fill in gaps with "ENTRY"
# Also, ~600k "KOGs" map to UIDs that were "discovered", or not originally mapped from the KEGG gene scraping 
kogs_df["KOGID"] = kogs_df["KOG"].fillna(kogs_df["ENTRY"]) 

kogs_df["isnull"] = kogs_df.index.isnull()
null_count = kogs_df.reset_index().loc[:,["isnull", "KOGID"]].groupby("KOGID").sum().astype(int).reset_index().rename(columns={"isnull":"UIDS_MISSING"})
kogs_df = kogs_df.reset_index()
kogs_df = pd.merge(kogs_df, null_count, on = "KOGID", how = "outer")

kogs_df


Unnamed: 0,UID,KOG,KEGG_ID,ENTRY,ALIAS,KEGG_CDS,KEGG_ORG,TAXID,FUNCTION,KOGID,isnull,UIDS_MISSING
0,A0A022Q707,K00001,egt:105975019,K00001,,105975019,EGT,4155.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
1,A0A022R7F5,K00001,egt:105960042,K00001,,105960042,EGT,4155.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
2,A0A059A4S5,K00001,egr:104425885,K00001,,104425885,EGR,71139.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
3,A0A059A5P0,K00001,egr:104425887,K00001,,104425887,EGR,71139.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
4,A0A059ZDV9,K00001,abw:BL01_05170,K00001,,BL01_05170,ABW,470.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604
...,...,...,...,...,...,...,...,...,...,...,...,...
27895405,,,tpul:TPB0596_18180,K27109,ctaF,TPB0596_18180,TPUL,,,K27109,True,482
27895406,,,tsd:MTP03_30470,K27109,,MTP03_30470,TSD,,,K27109,True,482
27895407,,,whr:OG579_04520,K27109,,OG579_04520,WHR,,,K27109,True,482
27895408,,,yia:LO772_24195,K27109,,LO772_24195,YIA,,,K27109,True,482


In [9]:
kogs_df.to_csv(base_stats + "kogs_df.tsv", sep = "\t", index=None)

### map kogs to 3M uids that we include before realignment (eukaryotic)

In [2]:
euk_uid2kogs = "/data/luojaa/uid_stats/uid2kogs_fasta.csv"

In [3]:
# kog_fastas_path = "/data/luojaa/kog_fastas_backup/"
# kog_fastas = os.listdir(kog_fastas_path)
# with open("/data/luojaa/uid_stats/uid2kogs_fasta.csv", "w") as outfile:
#     for file in kog_fastas:
#         kog = file.strip(".fasta")
#         with open(kog_fastas_path + file, "r") as handle:
#             for record in SeqIO.parse(handle, 'fasta'):
#                 uid = record.id
#                 print(",".join([uid, kog]), file=outfile)

In [4]:
euk_uid2kogs_df = pd.read_csv(euk_uid2kogs, header = None).rename(columns = {0:"UID", 1:"KOGID"}).set_index("UID")
euk_uid2kogs_df["ISEUK"] = [True] * len(euk_uid2kogs_df)
euk_uid2kogs_df = euk_uid2kogs_df.loc[:,"ISEUK"]

In [141]:
kogs_df_summary_in = pd.read_csv(base_stats + "kogs_df.tsv", sep = "\t").set_index("UID")

  kogs_df_summary_in = pd.read_csv(base_stats + "kogs_df.tsv", sep = "\t").set_index("UID")


In [149]:
kogs_df_iseuk = pd.merge(kogs_df_summary_in, euk_uid2kogs_df, on = "UID", how="outer")
kogs_df_iseuk.fillna({"ISEUK":False}, inplace = True)
euk_count = kogs_df_iseuk.reset_index().loc[:,["ISEUK", "KOGID"]].groupby("KOGID").sum().astype(int).reset_index().rename(columns={"ISEUK":"EUKCOUNT"})

kogs_df_iseuk = kogs_df_iseuk.reset_index()
kogs_df_summary = pd.merge(kogs_df_iseuk, euk_count, on = "KOGID", how = "outer")

In [150]:
kogs_df_summary

Unnamed: 0,UID,KOG,KEGG_ID,ENTRY,ALIAS,KEGG_CDS,KEGG_ORG,TAXID,FUNCTION,KOGID,isnull,UIDS_MISSING,ISEUK,EUKCOUNT
0,A0A022Q707,K00001,egt:105975019,K00001,,105975019,EGT,4155.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604,True,143
1,A0A022R7F5,K00001,egt:105960042,K00001,,105960042,EGT,4155.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604,True,143
2,A0A059A4S5,K00001,egr:104425885,K00001,,104425885,EGR,71139.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604,True,143
3,A0A059A5P0,K00001,egr:104425887,K00001,,104425887,EGR,71139.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604,True,143
4,A0A059ZDV9,K00001,abw:BL01_05170,K00001,,BL01_05170,ABW,470.0,alcohol dehydrogenase [EC:1.1.1.1],K00001,False,1604,False,143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27895405,,,tpul:TPB0596_18180,K27109,ctaF,TPB0596_18180,TPUL,,,K27109,True,482,False,0
27895406,,,tsd:MTP03_30470,K27109,,MTP03_30470,TSD,,,K27109,True,482,False,0
27895407,,,whr:OG579_04520,K27109,,OG579_04520,WHR,,,K27109,True,482,False,0
27895408,,,yia:LO772_24195,K27109,,LO772_24195,YIA,,,K27109,True,482,False,0


In [23]:
nonunique_uids = list(kogs_df_summary[~kogs_df_summary.index.isnull()].index)

In [80]:
unique_uids = set(kogs_df_summary.index)

In [97]:
nullind_set = set(kogs_df_summary[kogs_df_summary.index.isna()].index)
len(nullind_set)

393217

In [83]:
# sets fail to filter out ~400k nan values ...
len(unique_uids), len(nonunique_uids), len(set(nonunique_uids))

(12998877, 12997782, 12605660)

In [85]:
sum(pd.Series(list(unique_uids)).isnull())

393217

In [25]:
# duplicate_uids = []
# for i in set_nonunique_uids:
#     if nonunique_uids.count(i) > 1:
#          duplicate_uids.append(i)
duplicate_uids # not a problem. some kegg genes are 100% redundant, but map to distinct NCBI entries? Usually from same/super similar species

['A1ALQ0', 'A0A6H3AE30', 'P36938']

In [29]:
kogs_df_summary[kogs_df_summary.index =="P36938"]

Unnamed: 0_level_0,KOG,KEGG_ID,ENTRY,ALIAS,KEGG_CDS,KEGG_ORG,TAXID,FUNCTION,KOGID,isnull,UIDS_MISSING,ISEUK
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
P36938,K01835,ecj:JW0675,K01835,pgm,JW0675,ECJ,83333.0,phosphoglucomutase [EC:5.4.2.2],K01835,False,3610,
P36938,K01835,eco:b0688,K01835,pgm,b0688,ECO,83333.0,phosphoglucomutase [EC:5.4.2.2],K01835,False,3610,


In [175]:
fasta_stats_df = kogs_df_summary.loc[:,["KOGID", "UID", "UIDS_MISSING", "EUKCOUNT"]].groupby(["KOGID","UIDS_MISSING", "EUKCOUNT"]).count().reset_index().rename(columns={"UID":"UID_COUNT"}).set_index("KOGID")
cluster_size = pd.DataFrame(kogs_df_summary.loc[:,["KOGID"]].groupby("KOGID").size()).rename(columns={0:"KOGSIZE"})
fasta_stats_out = pd.merge(cluster_size, fasta_stats_df, on = "KOGID", how = "outer").reset_index()

In [178]:
fasta_stats_out["%chosen"] = 100 * fasta_stats_out["EUKCOUNT"]/fasta_stats_out["UID_COUNT"]

In [186]:
K00042 = kogs_df_summary[kogs_df_summary["KOGID"] == "K00042"]
K00042[K00042["ISEUK"]]

Unnamed: 0,UID,KOG,KEGG_ID,ENTRY,ALIAS,KEGG_CDS,KEGG_ORG,TAXID,FUNCTION,KOGID,isnull,UIDS_MISSING,ISEUK,EUKCOUNT
150713,A0A088RW69,K00042,lpan:LPMP_300180,K00042,,LPMP_300180,LPAN,5679.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10
151150,A0A3P3ZC87,K00042,lbz:LBRM_30_0180,K00042,,LBRM_30_0180,LBZ,420245.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10
151156,A0A3Q8IFE7,K00042,ldo:LDBPK_300170,K00042,,LDBPK_300170,LDO,5661.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10
151620,A4HHW2,K00042,lbz:LBRM_30_0180,K00042,,LBRM_30_0180,LBZ,5660.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10
151621,A4I512,K00042,lif:LINJ_30_0170,K00042,,LINJ_30_0170,LIF,5671.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10
151775,C9ZQG6,K00042,tbg:TbgDal_VI1240,K00042,,TbgDal_VI1240,TBG,679716.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10
151843,E9B0E2,K00042,lmi:LMXM_29_0180,K00042,,LMXM_29_0180,LMI,929439.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10
152005,Q4DFE2,K00042,tcr:507017.40,K00042,,507017.40,TCR,353153.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10
152006,Q4DQH3,K00042,tcr:505807.180,K00042,,505807.180,TCR,353153.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10
152009,Q4Q7T9,K00042,lma:LMJF_30_0180,K00042,,LMJF_30_0180,LMA,5664.0,2-hydroxy-3-oxopropionate reductase [EC:1.1.1.60],K00042,False,1248,True,10


In [184]:
fasta_stats_out[fasta_stats_out["EUKCOUNT"].between(1,10)]

Unnamed: 0,KOGID,KOGSIZE,UIDS_MISSING,EUKCOUNT,UID_COUNT,%chosen
4,K00005,1619,807,4,812,0.492611
22,K00024,6526,2619,10,3907,0.255951
40,K00042,2632,1248,10,1384,0.722543
55,K00060,2408,1034,1,1374,0.072780
62,K00067,7109,2835,1,4274,0.023397
...,...,...,...,...,...,...
25713,K26382,1,0,1,1,100.000000
25714,K26383,7,2,5,5,100.000000
25890,K26560,13,4,9,9,100.000000
26140,K26810,23,20,3,3,100.000000


In [187]:
fasta_stats_out.to_csv("/data/luojaa/uid_stats/cluster_stats.csv", sep = ",", index = None)

## 5. verify contents of trembl and sprot

In [13]:
testpath = f"/data/luojaa/kegg/kog_uid/K00001.tsv"
base = "/data/luojaa/kegg/kog_uid/"
kog2uidpaths = [base + tsv for tsv in os.listdir(base)]
uid2kog = {} 
for path in kog2uidpaths:
    with open(path, "r") as f:
        for line in f:
            kog, uid, descriptor = line.strip().split("\t")
            uid2kog[uid] = kog

In [8]:
def extract_uid(instr):
    uid = instr.split("|")[1]
    return uid
def check_uid_coverage(infasta):
    uids =  set()
    with open(infasta, "r") as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            identifier = record.id
            uid = extract_uid(identifier)
            uids.add(uid)
    return uids
sprot_set = check_uid_coverage("/data/luojaa/uniprot_sprot.fasta")

In [15]:
# uids_in_sprot = 0
# for uid in uid2kog.keys():
#     if uid in sprot_set:
#         uids_in_sprot += 1
# print(uids_in_sprot)

418394


In [16]:
len(sprot_set)

571282

In [14]:
len(uid2kog)

12462195