### Import basic requirements

In [1]:
### Import basic requirements

# Public modules
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm as tb

# Self-made utils
from Utils import ss_utils as ss 
from Utils import ss_database as db


### 1. Search and Download NCBI Taxonomy IDs of viruses

In [2]:
### 1. Search and Download NCBI Taxonomy IDs of viruses

# Load virus names
# `INFO` is whatever the project helper `ss.load_RTG_Data('INFO')` returns;
# its 'id' entry is converted with `.to_dict()` so PROCESS can look items up by key.
# NOTE(review): presumably INFO is a DataFrame and 'id' holds the virus names
# used for the NCBI search — confirm against ss_utils.
INFO = ss.load_RTG_Data('INFO')
id_dict = INFO['id'].to_dict()

# Define function for multi-threading
def PROCESS(ind):
    """Search NCBI for the taxonomy ID(s) of one entry of ``id_dict``.

    Args:
        ind: Key into the module-level ``id_dict``.

    Returns:
        dict with keys 'ind' (the key passed in), 'id' (the value looked up
        in ``id_dict``), 'Taxonomy_id' (everything ``ss.taxonomy_search``
        returned) and 'clean_id' (the single hit when exactly one was
        returned, otherwise the empty string).
    """
    ID = id_dict[ind]

    # Search taxonomy IDs from NCBI according to virus name
    res = ss.taxonomy_search(ID)

    # Only an unambiguous (exactly one) hit is accepted; zero or several hits
    # leave 'clean_id' blank so the row can be singled out afterwards.
    clean = res[0] if len(res) == 1 else ''

    return {'ind': ind, 'id': ID, 'Taxonomy_id': res, 'clean_id': clean}

# Work items for the thread pool: every key of id_dict, in dict order
RUN_LIST = list(id_dict)

# Define multi-threading function
import concurrent.futures
from tqdm import tqdm
def run_in_threads(process, data, num_threads):
    """Apply ``process`` to every item of ``data`` on a thread pool.

    Args:
        process: Callable taking a single item of ``data``.
        data: Sized iterable of work items; ``len(data)`` is used as the
            progress-bar total.
        num_threads: Maximum number of worker threads.

    Returns:
        list of ``process`` results, in the same order as ``data``.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # executor.map yields results in input order; list() drains it while
        # tqdm reports progress.
        results = list(tqdm(executor.map(process, data), total=len(data)))
    return results

# Run multi-threading function to download NCBI Taxonomy IDs of viruses
# run_in_threads returns results in RUN_LIST order; key each one by its
# position (0, 1, 2, ...) so the dict can be turned into a row-per-item table.
FINAL_RESULT = dict(enumerate(run_in_threads(PROCESS, RUN_LIST, num_threads=3)))

# Persist every lookup result, one row per FINAL_RESULT entry
df_res = pd.DataFrame.from_dict(FINAL_RESULT, orient='index')
ss.save_file(df_res, 'Data_Taxonomy//df_FINAL_RESULT')

# Persist separately the rows whose 'clean_id' is blank (no single taxonomy hit)
_unresolved_mask = df_res['clean_id'] == ''
df_wrong = df_res.loc[_unresolved_mask].reset_index(drop=True)
ss.save_file(df_wrong, 'Data_Taxonomy//df_wrong_FINAL_RESULT')

### 2. Manually search NCBI Taxonomy browser to correct the wrong ids

In [None]:
### 2. Manually search NCBI Taxonomy browser to correct the wrong ids

# NCBI Taxonomy browser : https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Tree&id=10239&lvl=3&keep=1&srchmode=1&unlock

# Save as 'df_wrong_FINAL_RESULT - Manually-search.csv'


### Summarising the manually searched IDs
# Manually curated IDs for the rows that had no single taxonomy hit
df_man = ss.input_file('Data_Taxonomy//df_wrong_FINAL_RESULT - Manually-search')
man_ind_list = df_man['ind'].tolist()

# Full result table saved in section 1; the manual IDs are merged into it
df_added = ss.input_file('Data_Taxonomy//df_FINAL_RESULT')

# One row per 'ind' (first occurrence wins, as with the former `.tolist()[0]`
# lookup), indexed by 'ind' so each correction is a direct label lookup
# instead of re-filtering df_man for every row.
df_man_by_ind = df_man.drop_duplicates(subset='ind', keep='first').set_index('ind')

# Walk the reloaded table itself: sizing the loop from `df_res` only worked
# when section 1 had been run earlier in the same session.
# NOTE(review): as before, the row label `i` is matched against the manual
# 'ind' values — this assumes df_added is indexed 0..n-1 and that 'ind'
# uses the same numbering; the 'id' comparison below guards against a mismatch.
for i in range(df_added.shape[0]):
    if i not in df_man_by_ind.index:
        continue

    current_id = df_added.loc[i, 'clean_id']
    # A blank 'clean_id' is '' in memory; if input_file round-trips through
    # CSV it presumably comes back as NaN, so both are treated as blank.
    is_blank = pd.isna(current_id) or current_id == ''
    same_virus = df_added.loc[i, 'id'] == df_man_by_ind.loc[i, 'id']

    if same_virus and is_blank:
        df_added.loc[i, 'clean_id'] = df_man_by_ind.loc[i, 'manually_search_id']
    else:
        # Either the manual file refers to a different virus at this position,
        # or the row already had an ID and should not have been corrected.
        raise ValueError(
            f"Wrong: manual entry ind={i} does not match "
            f"id={df_added.loc[i, 'id']!r} or clean_id is already set ({current_id!r})"
        )

# Every row must now carry a numeric ID; astype(int) raises if any is still blank
df_added['clean_id'] = df_added['clean_id'].astype(int)

# Save the final id list
ss.save_file(df_added, 'Data_Taxonomy//df_Taxonomy_id')


### 3. Download Taxonomy Data from NCBI according to NCBI Taxonomy IDs

In [None]:
### 3. Download Taxonomy Data from NCBI according to NCBI Taxonomy IDs

import time  # used for the retry back-off below; not imported in the cells above

# ID list
accession_id_list = df_added['clean_id'].tolist()
id_list = df_added['id'].tolist()

# Download Taxonomy data, one record per row, keyed by row position
res = {}
# tqdm wraps the list itself (not the enumerate object) so it can read len()
# and display a total.
for loc, ID in enumerate(tb(accession_id_list)):
    name = id_list[loc]

    # Retry until the download succeeds, pausing 3 s between attempts.
    # `except Exception` (rather than a bare except) keeps the loop
    # interruptible with Ctrl-C / kernel interrupt.
    retry_n = 1
    while True:
        try:
            tax_dict = ss.taxonomy_lineage_download(ID)
            break
        except Exception as err:
            print(f"Retry: {retry_n} - {ID} - {name} ({err!r})")
            retry_n += 1
            time.sleep(3)

    # 'id' and 'accession' come first, followed by whatever keys the download returned
    res_t = {'id': name, 'accession': ID}
    res_t.update(tax_dict)

    res[loc] = res_t
    
# Clean mistake of the downloaded data:
# a record that has a 'clade' entry but no 'kingdom' entry gets its 'clade'
# key renamed to 'clade-' (the records in `res` are modified in place).
for record in res.values():
    if 'kingdom' not in record and 'clade' in record:
        record["clade-"] = record.pop("clade")

# Same keys, same order, same record objects as `res`
res_clean = dict(res)

# Convert the data into dataframe (one row per record) and save it
df_res = pd.DataFrame.from_dict(res_clean, orient='index')
ss.save_file(df_res, 'Data_Taxonomy//df_taxonomy_raw')



### Organise the downloaded data and save
df_taxonomy_raw = ss.input_file('Data_Taxonomy//df_taxonomy_raw')

# Keep the id plus the taxonomic ranks of interest and mark every missing
# value as 'Unclassified'. The fill is done on the selected frame as a whole:
# `df_tax[col].fillna(..., inplace=True)` on a column of a slice is chained
# in-place assignment, which warns and has no effect under pandas Copy-on-Write.
df_tax = df_taxonomy_raw[
    ['id', 'superkingdom', 'clade', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus']
].fillna('Unclassified')

ss.save_file(df_tax, 'Data_Taxonomy//df_taxonomy')
ss.save_file(df_tax, 'RTG_Data//ALL_VIRUS//TAXONOMY_RAW')

# Confirm the ID are identical (same values, same order) to those in INFO
INFO = ss.load_RTG_Data('INFO')
print (INFO['id'].tolist() == df_tax['id'].tolist())
# True

### 4. Encode Taxonomy data into Train-Ready data

In [None]:
### 4. Encode Taxonomy data into Train-Ready data

df_taxonomy = ss.input_file('Data_Taxonomy//df_taxonomy')

# Sanity check: the 'id' column must list the same values, in the same order, as INFO
INFO = ss.load_RTG_Data('INFO')
ids_match = INFO['id'].tolist() == df_taxonomy['id'].tolist()
print (ids_match)
# True

# Encode the Taxonomy data: only these rank columns are passed to the encoder,
# together with the 'Unclassified' placeholder used for missing ranks
rank_columns = ['clade', 'kingdom', 'phylum', 'class', 'order', 'family']
df_tax = df_taxonomy[rank_columns]
df_out = ss.taxonomy_encoder(df_tax, 'Unclassified')

# Save data
ss.save_file(df_out, 'RTG_Data//ALL_VIRUS//TAXONOMY')