In [48]:
import ete3
import pandas as pd
import numpy as np

In [19]:
# ete3 NCBI taxonomy handle; the cells below use it for lineage, rank, and name/taxid lookups.
ncbi = ete3.NCBITaxa()

In [3]:
# Tab-separated table, read below without a header row; columns 0 and 1 become "UID" and "taxid".
uid_taxid = "/data/luojaa/uid_stats/uid2taxids.combined.tsv"

In [4]:

# Load the UID -> taxid table, then ask ete3 for the lineage of every taxid in one call.
taxid_df = pd.read_csv(uid_taxid, sep="\t", header=None)
taxid_df = taxid_df.rename(columns={0: "UID", 1: "taxid"})
taxids = taxid_df["taxid"]
lineages = ncbi.get_lineage_translator(taxids)


In [15]:
# # copy pasted taxid_tmp from google sheet
# taxid_edges = [int(i) for i in taxid_tmp]
# names_tmp = ncbi.get_taxid_translator(taxid_edges)
# for taxid in taxid_edges:
#     print(names_tmp[taxid])

In [81]:
def get_superkingdom(uid):
    """Return the superkingdom name for an NCBI taxid.

    The lineage is taken from the pre-computed ``lineages`` mapping when the
    taxid is present there, otherwise it is looked up individually through
    ``ncbi.get_lineage``.

    Parameters
    ----------
    uid : int
        NCBI taxid (named ``uid`` for historical reasons).

    Returns
    -------
    str or float
        "Bacteria", "Eukaryota", "Archaea" or "Viruses" depending on which
        root taxid (2, 2759, 2157, 10239) appears in the lineage; ``np.nan``
        when the taxid cannot be resolved or its lineage contains none of
        those roots. A message is printed in both failure cases.
    """
    try:
        lineage = lineages[uid]
    except KeyError:
        # Not part of the bulk translation: fall back to a single lookup.
        try:
            lineage = ncbi.get_lineage(uid)
        except ValueError:
            print(f"uid {uid} was not found nor translated")
            return np.nan
    if lineage is None:
        # NOTE(review): ete3's get_lineage appears to return None (instead of
        # raising) for an empty/zero taxid - confirm against the ete3 docs.
        print(f"uid {uid} was not found nor translated")
        return np.nan

    # Checked in this fixed order; the first root taxid found in the lineage wins.
    superkingdom_roots = (
        (2, "Bacteria"),
        (2759, "Eukaryota"),
        (2157, "Archaea"),
        (10239, "Viruses"),
    )
    for root_taxid, name in superkingdom_roots:
        if root_taxid in lineage:
            return name

    print(f"uid {uid} was not mapped")
    # Same missing-value marker as the "not found" path, so callers see one sentinel.
    return np.nan

In [None]:
# Label every row with its superkingdom. Unresolved taxids come back as NaN/None and are
# stored as the string "None"; the later `superkingdom == "None"` filter depends on that.
# (Previously the fillna result was only displayed and never stored.)
superkingdoms = taxid_df["taxid"].apply(get_superkingdom).fillna("None")
taxid_df["superkingdom"] = superkingdoms
taxid_df

In [108]:
def rank_to_lineage(lineage2rank):
    """Invert a ``taxid -> rank`` mapping into ``rank -> taxid``.

    When several taxids share the same rank, the one that comes last in
    ``lineage2rank`` is the one kept.
    """
    return {rank_name: node_id for node_id, rank_name in lineage2rank.items()}

In [None]:
# Build taxid -> {rank: taxid-at-that-rank} for every distinct taxid whose lineage resolves.
# Taxids that cannot be resolved are reported and left out of the mapping.
uid2_rank2lineage = {}
for uid in set(taxids):
    try:
        lineage = lineages[uid]
    except KeyError:
        # Not part of the bulk translation: fall back to a single lookup.
        try:
            lineage = ncbi.get_lineage(uid)
        except ValueError:
            print(f"uid {uid} was not found nor translated")
            continue
    if lineage is None:
        # NOTE(review): ete3's get_lineage appears to return None (instead of
        # raising) for an empty/zero taxid - confirm against the ete3 docs.
        print(f"uid {uid} was not found nor translated")
        continue
    lineage2rank = ncbi.get_rank(lineage)
    rank2lineage = rank_to_lineage(lineage2rank)
    uid2_rank2lineage[uid] = rank2lineage

In [241]:
# Distinct taxids labelled with the placeholder "None" superkingdom, and those labelled "Viruses".
uids_nokingdom = list(set(taxid_df.loc[taxid_df["superkingdom"] == "None", "taxid"]))
uids_viruses = list(set(taxid_df.loc[taxid_df["superkingdom"] == "Viruses", "taxid"]))

In [312]:
# Table of taxids with a "Proposed class label"; used below as a fallback when a
# taxid's lineage carries no "class" rank.
no_classes = pd.read_csv("/data/luojaa/taxids/class_edge_handling.tsv", sep = "\t")
no_classes

Unnamed: 0,taxid,org name,Proposed class label
0,1238993,Mycoplasmoides pneumoniae M129-B7,Mycoplasmatota
1,1637999,Verrucomicrobia bacterium IMCC26134,Verrucomicrobiae
2,1779382,Rhodothermaceae bacterium RA,Rhodothermota
3,1540872,Candidatus Gracilibacteria bacterium HOT-871,incertae sedis
4,1911684,Tenericutes bacterium MO-XQ,Mycoplasmatota
...,...,...,...
263,1178016,"1,2759,4751,6029,6032,6033,33154,36734,112252,...",Microsporidia
264,1538547,"1,2157,131567,1538547,1655434,1655637,1935183,...",Asgard
265,1899017,"1,2,131567,1783272,1798710,1798711,1897007,189...",incertae sedis
266,2259672,"1,2157,131567,651137,651142,1783275,1825023,22...",Nitrososphaeria


In [313]:
# taxid -> proposed class label, paired row by row from no_classes
# (if a taxid is listed more than once, the last row wins).
taxids_edge = list(no_classes["taxid"])
class_edge = list(no_classes["Proposed class label"])
taxidsedge2label = dict(zip(taxids_edge, class_edge))


In [336]:
# Resolve every distinct taxid to the taxid of its class, trying in order:
#   1. the "class" rank found in the taxid's own lineage;
#   2. the proposed label from taxidsedge2label, translated to a taxid by name;
#   3. otherwise write the taxid to the log file, unless it has no superkingdom
#      or is a virus.
# Taxids whose lineage was never resolved (absent from uid2_rank2lineage) are skipped.
skip_logging = set(uids_nokingdom) | set(uids_viruses)  # set: O(1) membership inside the loop
uid2classid = {}
with open("/data/luojaa/log/translate_uids.log", "w") as logfile:
    for uid in set(taxids):
        rank2lineage = uid2_rank2lineage.get(uid)
        if rank2lineage is None:
            continue

        # 1) The lineage itself has a class rank.
        if "class" in rank2lineage:
            uid2classid[uid] = rank2lineage["class"]
            continue

        # 2) Fallback label. Only real strings are sent to the name lookup, so a
        #    missing (NaN) label is treated as "no label" rather than raising.
        classname = taxidsedge2label.get(uid)
        name_hits = None
        if isinstance(classname, str):
            name_hits = ncbi.get_name_translator([classname]).get(classname)
        if name_hits:
            uid2classid[uid] = name_hits[0]
            continue

        # 3) Unresolved: log "<uid>\t<lineage taxids>,<lineage names>,<ranks present>".
        if uid in skip_logging:
            continue
        lineage_dict = ncbi.get_taxid_translator(ncbi.get_lineage(uid))
        fields = (
            [str(taxid) for taxid in lineage_dict]
            + list(lineage_dict.values())
            + list(rank2lineage)
        )
        print("\t".join([str(uid), ",".join(fields)]), file=logfile)
            




In [315]:
# Translate each distinct class taxid collected in uid2classid to its name.
classids2class = ncbi.get_taxid_translator(list(set(uid2classid.values())))


In [318]:
# Sizes of the lookup tables.
# NOTE(review): uid2class is only assigned in the cell that follows this one, so on a
# fresh kernel this cell raises NameError when the notebook is run top to bottom.
len(uid2_rank2lineage), len(classids2class), len(uid2classid), len(uid2class)

(10762, 236, 10651, 10559)

In [351]:
# taxid -> class name, for every distinct taxid that has a class taxid in uid2classid
# and whose class taxid has a name in classids2class. Everything else is left out.
uid2class = {}
for uid in set(taxids):
    if uid not in uid2classid:
        continue
    classid = uid2classid[uid]
    if classid in classids2class:
        uid2class[uid] = classids2class[classid]

In [348]:
def get_class(uid):
    """Return the class label for a taxid.

    Lookup order: ``uid2class`` first, then the proposed labels in
    ``taxidsedge2label``. When neither has the taxid, the placeholder
    string "None" is returned.
    """
    if uid in uid2class:
        return uid2class[uid]
    return taxidsedge2label.get(uid, "None")



In [349]:
# Label every row with its class; taxids with no class get the placeholder string "None".
classes = taxid_df["taxid"].apply(get_class)
taxid_df["class"] = classes

In [322]:
# Sanity check: rows whose class label contains "Melaina" (the recorded output has none).
taxid_df[taxid_df["class"].str.contains("Melaina")]

Unnamed: 0,UID,taxid,superkingdom,class


In [323]:
# Show the labelled table: UID, taxid, superkingdom, class.
taxid_df

Unnamed: 0,UID,taxid,superkingdom,class
0,A0AA96VA13,3028294,Archaea,Methanomicrobia
1,Q46FD0,269797,Archaea,Methanomicrobia
2,Q469K8,269797,Archaea,Methanomicrobia
3,Q46FN1,269797,Archaea,Methanomicrobia
4,A6VJ99,426368,Archaea,Methanococci
...,...,...,...,...
12462199,Q27YE6,3052325,Viruses,Ellioviricetes
12462200,B0BLK7,3052322,Viruses,Ellioviricetes
12462201,B2MW50,3052323,Viruses,Ellioviricetes
12462202,Q6RSS3,3052324,Viruses,Ellioviricetes


In [362]:
# Write the labelled table as tab-separated text, with no header row and no index column.
taxid_df.to_csv(
    "/data/luojaa/eukgen/mmseqs/taxids_classlabels.tsv",
    sep="\t",
    header=False,
    index=False,
)

### New class labels (cluster or delete old classes)

In [4]:
# Per the original note, this file has the same content as taxid_df / taxids_classlabels.tsv.
# It is read without a header row, so the columns are numbered 0-3; column 3 holds the
# class label that gets remapped below.
oldtaxids = pd.read_csv("/data/luojaa/taxids/kegg_old_classes.mcrcsm.tsv", sep = "\t", header = None)
oldtaxids

Unnamed: 0,0,1,2,3
0,A0AA96VA13,3028294,Archaea,Methanomicrobia
1,Q46FD0,269797,Archaea,Methanomicrobia
2,Q469K8,269797,Archaea,Methanomicrobia
3,Q46FN1,269797,Archaea,Methanomicrobia
4,A6VJ99,426368,Archaea,Methanococci
...,...,...,...,...
12462199,Q27YE6,3052325,Viruses,Ellioviricetes
12462200,B0BLK7,3052322,Viruses,Ellioviricetes
12462201,B2MW50,3052323,Viruses,Ellioviricetes
12462202,Q6RSS3,3052324,Viruses,Ellioviricetes


In [6]:
# Lookup table built from the mapping file: ncbi_class value -> new_class value, one entry
# per row (a repeated ncbi_class keeps its last row).
mapping_file = pd.read_csv("/data/luojaa/taxids/updated_taxid_mapping.tsv", sep="\t")
classmapping = {
    ncbi_class: new_class
    for ncbi_class, new_class in zip(mapping_file["ncbi_class"], mapping_file["new_class"])
}

In [33]:
def remap(oldclass):
    """Translate an old class label through ``classmapping``.

    Labels that have no entry in ``classmapping`` are returned unchanged.
    """
    return classmapping.get(oldclass, oldclass)

In [34]:
# Remap every old class label (column 3); labels without a mapping stay as they are.
newclasses = oldtaxids[3].apply(remap)

In [35]:
"Myxococcia" in list(oldtaxids[3])

True

In [46]:
"Myxococcota" in list(newtaxids["new_class"])

True

In [50]:
# Spot check of a single mapping entry.
classmapping["Acidimicrobiia"]

'Actinomycetota'

In [57]:
# Distinct class labels that the remapping left unchanged (old label == new label).
# NOTE(review): oldtaxids["new_class"] is only assigned in a later cell, so on a fresh
# kernel this cell raises KeyError when the notebook is run top to bottom.
oldtaxids[oldtaxids[3] == oldtaxids["new_class"]][3].unique()

array(['Thermoplasmata', 'Asgard', 'Methanonatronarchaeia', 'Nanobdellia',
       'Microsporidia', 'Leotiomycetes', 'Pucciniomycetes',
       'Heterolobosea', 'Malacostraca', 'Enoplea', 'Enteropneusta',
       'Merostomata', 'Asteroidea', 'Choanoflagellata', 'Metamonada',
       'Diplopoda', 'Caudoviricetes', 'Leviviricetes', 'Faserviricetes',
       'Tectiliviricetes', 'Megaviricetes', 'Vidaverviricetes',
       'Oomycota', 'Cryptophyceae', 'Gammaproteobacteria',
       'Alphaproteobacteria', 'Chlamydiia', 'Betaproteobacteria',
       'Mycoplasmatota', 'Thermotogae', 'Thermodesulfobiia',
       'Deltaproteobacteria', 'Candidatus Bipolaricaulia', 'Nitrospinia',
       'Thermosulfidibacteria', 'Armatimonadia', 'Candidatus Sumerlaeia',
       'Candidatus Peribacteria', 'Cyanobacteriota', 'Chthonomonadetes',
       'Candidatus Uabimicrobiia', 'Papovaviricetes', 'Repensiviricetes',
       'Naldaviricetes', 'Pokkesviricetes', 'Herviviricetes',
       'Revtraviricetes', 'Ellioviricetes', 'Al

In [52]:
# Attach the remapped labels to oldtaxids, then keep columns 0-2 plus the new label.
oldtaxids["new_class"] = newclasses
newtaxids = oldtaxids[[0, 1, 2, "new_class"]]

In [47]:
# Write the relabelled table as tab-separated text, with no header row and no index column.
newtaxids.to_csv(
    "/data/luojaa/taxids/kegg_new_classes.mcrcsm.tsv",
    sep="\t",
    header=False,
    index=False,
)

In [42]:
# Show the relabelled table (columns 0, 1, 2 and new_class).
newtaxids

Unnamed: 0,0,1,2,new_class
0,A0AA96VA13,3028294,Archaea,Euryarchaeota
1,Q46FD0,269797,Archaea,Euryarchaeota
2,Q469K8,269797,Archaea,Euryarchaeota
3,Q46FN1,269797,Archaea,Euryarchaeota
4,A6VJ99,426368,Archaea,Euryarchaeota
...,...,...,...,...
12462199,Q27YE6,3052325,Viruses,Ellioviricetes
12462200,B0BLK7,3052322,Viruses,Ellioviricetes
12462201,B2MW50,3052323,Viruses,Ellioviricetes
12462202,Q6RSS3,3052324,Viruses,Ellioviricetes
