### Import basic requirements

In [2]:
### Import basic requirements

# Public modules
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm as tb

# Self-made utils
from Utils import ss_utils as ss 
from Utils import ss_database as db


### 1. ALL_VIRUS_DATABASE.pickle to ALL_VIRUS_DATABASE.csv and ALL_VIRUS_info.csv

In [6]:
### 1. ALL_VIRUS_DATABASE.pickle to ALL_VIRUS_DATABASE.csv and ALL_VIRUS_info.csv
#
# Flattens the per-virus dictionary loaded from the pickle into two tables:
#   * df_res_raw   - every field of each virus entry, unchanged, plus an 'id' column
#   * df_res_extra - a condensed record per virus with numeric fields parsed


def _int_or_raw(value):
    """Return ``int(value)``; if it cannot be converted, return ``value`` unchanged."""
    try:
        return int(value)
    except (ValueError, TypeError):
        # Keep the original (non-numeric) entry rather than dropping the record.
        return value


def _summarise_virus(virus, info_dict):
    """Build the condensed record for one virus entry.

    Args:
        virus: key of the entry in ALL_VIRUS_DATABASE, stored as 'id'.
        info_dict: the entry's field dictionary; must contain 'Accession',
            'RefSeq type', 'Number of segments', 'Genome length' and
            'Number of proteins'.

    Returns:
        dict with keys 'id', 'Accession', 'RefSeq type', 'Virus type',
        'Segment', 'Number of segments', 'Genome length', 'Number of proteins'.
    """
    summary = {'id': virus,
               'Accession': info_dict['Accession'],
               'RefSeq type': info_dict['RefSeq type']}

    if info_dict['Number of segments'] == '-':
        # Monopartite virus: '-' in 'Number of segments' marks a non-segmented genome
        summary['Virus type'] = 'monopartite'
        summary['Segment'] = 'non-segmented'
        summary['Number of segments'] = 1
        genome_length = info_dict['Genome length']
        n_proteins = info_dict['Number of proteins']
    else:
        # Multipartite virus: the length/protein fields are indexable and only
        # their first element is used.
        # NOTE(review): confirm element [0] is the genome-wide value and not
        # just the first segment's value.
        summary['Virus type'] = 'multipartite'
        summary['Segment'] = 'segmented'
        summary['Number of segments'] = int(info_dict['Number of segments'])
        genome_length = info_dict['Genome length'][0]
        n_proteins = info_dict['Number of proteins'][0]

    # The length text is parsed as the integer preceding ' nt'
    summary['Genome length'] = int(genome_length.split(' nt')[0])
    summary['Number of proteins'] = _int_or_raw(n_proteins)
    return summary


# Input ALL_VIRUS_DATABASE.pickle
ALL_VIRUS_DATABASE = ss.database_load('Data//ALL_VIRUS_DATABASE')

# Organise information into row-indexed dictionaries (row number -> record)
dict_res_raw = {}
dict_res_extra = {}
for row, (virus, info_dict) in enumerate(ALL_VIRUS_DATABASE.items()):
    # Raw data: all original fields, with the virus key added as 'id'
    dict_res_raw[row] = {'id': virus, **info_dict}
    # Useful data: condensed, type-converted record
    dict_res_extra[row] = _summarise_virus(virus, info_dict)

# Save both data
df_res_raw = pd.DataFrame.from_dict(dict_res_raw, 'index')
ss.save_file(df_res_raw, 'Data//ALL_VIRUS_DATABASE')

df_res_extra = pd.DataFrame.from_dict(dict_res_extra, 'index')
ss.save_file(df_res_extra, 'Data//ALL_VIRUS_info')


### 2. ALL_VIRUS_DATABASE.pickle to ALL_VIRUS_hosts.csv

In [3]:
### 2. ALL_VIRUS_DATABASE.pickle to ALL_VIRUS_hosts.csv
#
# Builds a virus x host table: one row per virus, one column per distinct host.
# A cell holds the host name when the virus lists that host, else "not_<host>".

# Load organised information from ALL_VIRUS_DATABASE.pickle
ALL_VIRUS_DATABASE = ss.database_load('Data//ALL_VIRUS_DATABASE')

# Main: map every virus to its 'Host' entry
dict_out = {virus: info['Host'] for virus, info in ALL_VIRUS_DATABASE.items()}

# Distinct hosts in first-seen order. dict keys de-duplicate in O(1) per host
# instead of rescanning a growing list for every host.
host_total_list = list(dict.fromkeys(
    host for host_list in dict_out.values() for host in host_list
))

# Checking empty host list: print the id of any virus without hosts
for virus, host_list in dict_out.items():
    if len(host_list) == 0:
        print(virus)
# Original author's note: no empty host list was found

# Summarise the information into a row-indexed dictionary (row number -> record)
dict_res = {}
for row, (virus, host_list) in enumerate(dict_out.items()):
    present = set(host_list)  # O(1) membership for the column fill below
    res_t = {'id': virus}
    for host in host_total_list:
        res_t[host] = host if host in present else f"not_{host}"
    dict_res[row] = res_t

# Save csv file
# NOTE(review): this path carries a '.csv' suffix while the other ss.save_file
# calls in this file pass extension-less names — confirm save_file does not
# append a second extension.
df_res = pd.DataFrame.from_dict(dict_res, 'index')
ss.save_file(df_res, 'Data//ALL_VIRUS_hosts.csv')