### Import basic requirements

In [1]:
### Import basic requirements

# Public modules
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm as tb

# Self-made utils
from Utils import ss_utils as ss 
from Utils import ss_database as db


### 1. ALL_VIRUS_DATABASE.pickle to ALL_VIRUS_ALL_CDS_DETAIL.pickle

In [None]:
### 1. ALL_VIRUS_DATABASE.pickle to ALL_VIRUS_ALL_CDS_DETAIL.pickle
#
# For every virus in ALL_VIRUS_DATABASE, look up the CDS detail of each of its
# accessions with ss.get_all_cds_detail and store the result as
#     {virus: {accession: cds_detail}}
# in Data//ALL_VIRUS_ALL_CDS_DETAIL.

# Load the source database
ALL_VIRUS_DATABASE = ss.database_load('Data//ALL_VIRUS_DATABASE')

# Working state. finish_list records the viruses already handled; it is passed
# to ss.list_remove_list to derive the list of viruses to run.
dict_t = {}
dict_split_list = {}
dict_final = {}
finish_list = []

virus_list = list(ALL_VIRUS_DATABASE.keys())
virus_list_run = ss.list_remove_list(virus_list, finish_list)

for virus_id in virus_list_run:
    accession = ALL_VIRUS_DATABASE[virus_id]['Accession']

    if isinstance(accession, str):
        # Monopartite virus: a single accession string
        dict_t[virus_id] = {accession: ss.get_all_cds_detail(accession)}
    elif isinstance(accession, list):
        # Multipartite virus: one accession per segment
        per_segment = {}
        for segment_accession in accession:
            per_segment = ss.merge_dict(
                per_segment,
                {segment_accession: ss.get_all_cds_detail(segment_accession)},
            )
        dict_t[virus_id] = per_segment
    # Any other accession type leaves dict_t empty for this virus.

    # Fold this virus into the accumulated result and reset the scratch dict
    dict_final = ss.merge_dict(dict_final, dict_t)
    dict_t = {}
    finish_list.append(virus_id)

# Save
ss.database_save(dict_final, 'Data//ALL_VIRUS_ALL_CDS_DETAIL')

In [None]:
### 1. check empty
# Collect the viruses that have at least one accession with no CDS detail.
dict_final = ss.database_load('Data//ALL_VIRUS_ALL_CDS_DETAIL')

# One entry is added per empty accession, so a virus id appears once for each
# of its empty accessions.
empty_list = [
    virus_id
    for virus_id, accession_map in dict_final.items()
    for cds_detail in accession_map.values()
    if len(cds_detail) == 0
]

### 2. ALL_VIRUS_ALL_CDS_DETAIL.pickle to ALL_VIRUS_ALL_CDS.pickle

In [18]:
### 2. ALL_VIRUS_ALL_CDS_DETAIL.pickle to ALL_VIRUS_ALL_CDS.pickle
#
# Reduce every CDS record to its 'mRNA_sequence' field, keeping the
# {virus: {accession: {cds: ...}}} nesting.

# Load ALL_VIRUS_ALL_CDS_DETAIL.pickle
ACD = ss.database_load('Data//ALL_VIRUS_ALL_CDS_DETAIL')
# NOTE(review): ss.dict_head(ACD, 3) presumably keeps only the first 3 entries,
# which looks like a debugging leftover -- confirm before treating the saved
# file as the complete database.
ACD = ss.dict_head(ACD, 3)

# Main
dict_out = {}
for virus_id, accession_map in ACD.items():
    dict_out[virus_id] = {
        accession: {
            cds_id: cds_detail['mRNA_sequence']
            for cds_id, cds_detail in cds_map.items()
        }
        for accession, cds_map in accession_map.items()
    }

# Save
ss.database_save(dict_out, 'Data//ALL_VIRUS_ALL_CDS')


### 3. ALL_VIRUS_ALL_CDS.pickle to ALL_VIRUS_ALL_CDS_SIMPLE.pickle

In [21]:
### 3. ALL_VIRUS_ALL_CDS.pickle to ALL_VIRUS_ALL_CDS_SIMPLE.pickle
#
# Drop the accession and CDS keys: each virus maps to one flat list holding
# all of its CDS values, in accession order then CDS order.

# Load ALL_VIRUS_ALL_CDS.pickle
ALL_VIRUS_ALL_CDS = ss.database_load('Data//ALL_VIRUS_ALL_CDS')
# NOTE(review): ss.dict_head(..., 3) presumably keeps only the first 3 entries,
# which looks like a debugging leftover -- confirm before treating the saved
# file as the complete database.
ALL_VIRUS_ALL_CDS = ss.dict_head(ALL_VIRUS_ALL_CDS, 3)

# Main
OUTPUT = {
    virus: [
        cds
        for cds_dict in accession_dict.values()
        for cds in cds_dict.values()
    ]
    for virus, accession_dict in ALL_VIRUS_ALL_CDS.items()
}

# Save
ss.database_save(OUTPUT, 'Data//ALL_VIRUS_ALL_CDS_SIMPLE')

### 4. ALL_VIRUS_ALL_CDS_SIMPLE.pickle to ALL_VIRUS_CDS_Length.csv and ALL_VIRUS_CDS_error.csv

In [41]:
### 4. ALL_VIRUS_ALL_CDS_SIMPLE.pickle to ALL_VIRUS_CDS_Length.csv and ALL_VIRUS_CDS_error.csv
#
# Per virus: descriptive statistics of its CDS lengths, plus a yes/no flag
# telling whether any CDS has a length that is not a multiple of 3.
# NOTE(review): the heading names ALL_VIRUS_CDS_Length, but the statistics
# table is saved below as 'Data//ALL_VIRUS_genome_length' -- confirm which
# name is intended.

# Load ALL_VIRUS_ALL_CDS_SIMPLE.pickle
ALL_VIRUS_ALL_CDS_SIMPLE = ss.database_load('Data//ALL_VIRUS_ALL_CDS_SIMPLE')

# Main: both dicts are keyed by a running row number
dict_res = {}
dict_cds_error = {}
for row, (virus, cds_seq_list) in enumerate(ALL_VIRUS_ALL_CDS_SIMPLE.items()):
    cds_len_list = [len(cds_seq) for cds_seq in cds_seq_list]

    # Error sequences are those whose length cannot be divided by 3
    has_bad_length = any(cds_len % 3 != 0 for cds_len in cds_len_list)

    dis_data = ss.discriptive_data(cds_len_list)
    dict_res[row] = {
        'id': virus,
        'contenCDS_length': dis_data['sum'],
        'cds_count': dis_data['count'],
        'cds_length_mean': dis_data['mean'],
        'cds_length_std': dis_data['std'],
        'cds_length_median': dis_data['median'],
        'cds_length_variance': dis_data['variance'],
    }
    dict_cds_error[row] = {
        'id': virus,
        'CDS_error': 'yes' if has_bad_length else 'no',
    }

# Turn into dataframes (one row per virus)
df_res = pd.DataFrame.from_dict(dict_res, orient='index')
df_cds_error = pd.DataFrame.from_dict(dict_cds_error, orient='index')

# Add genome length of virus
# NOTE(review): this assignment aligns on the row index, not on the virus id,
# so it is only correct if ALL_VIRUS_info.csv lists the viruses in the same
# order as ALL_VIRUS_ALL_CDS_SIMPLE -- TODO confirm.
ALL_VIRUS_info = pd.read_csv('Data//ALL_VIRUS_info.csv')
df_res['genome_length'] = ALL_VIRUS_info['Genome length']

# Save
ss.save_file(df_res, 'Data//ALL_VIRUS_genome_length')
ss.save_file(df_cds_error, 'Data//ALL_VIRUS_CDS_error')

### 5. ALL_VIRUS_ALL_CDS_SIMPLE.pickle to ALL_VIRUS_start_stop.csv

In [23]:
### 5. ALL_VIRUS_ALL_CDS_SIMPLE.pickle to ALL_VIRUS_start_stop.csv
#
# Per virus: count how many CDS begin with ATG and how many end with
# TAA / TAG / TGA (the "human" start and stop codons tracked here), list every
# other start/stop codon seen, and express each count as a fraction of the
# number of CDS.

# Input ALL_VIRUS_ALL_CDS_SIMPLE.pickle
ALL_VIRUS_ALL_CDS_SIMPLE = ss.database_load('Data//ALL_VIRUS_ALL_CDS_SIMPLE')

# Start and stop codons are checked against separate sets. Testing both
# against the keys of one shared counter dict would count a CDS that starts
# with a stop codon as a human start, and a CDS that ends with ATG as a human
# stop (also inflating 'ATG#').
START_CODONS = ('ATG',)
STOP_CODONS = ('TAA', 'TAG', 'TGA')
COUNT_KEYS = ['ATG#', 'TAA#', 'TAG#', 'TGA#',
              'human_start#', 'not_human_start#',
              'human_stop#', 'not_human_stop#']

# Main
dict_res = {}
c = 0
for virus, cds_seq_list in ALL_VIRUS_ALL_CDS_SIMPLE.items():

    n_total_cds = len(cds_seq_list)
    res_t = {'id': virus, 'cds_number': n_total_cds}

    # Initiate variables
    dict_cal = dict.fromkeys(COUNT_KEYS, 0)
    not_human_start_list, not_human_stop_list = [], []

    for cds_seq in cds_seq_list:

        # First and last three characters of the CDS
        start = cds_seq[:3]
        stop = cds_seq[-3:]

        # Start codon
        if start in START_CODONS:
            dict_cal[f"{start}#"] += 1
            dict_cal['human_start#'] += 1
        else:
            dict_cal['not_human_start#'] += 1
            not_human_start_list.append(start)

        # Stop codon
        if stop in STOP_CODONS:
            dict_cal[f"{stop}#"] += 1
            dict_cal['human_stop#'] += 1
        else:
            dict_cal['not_human_stop#'] += 1
            not_human_stop_list.append(stop)

    # Stored after the loop so that a virus without any CDS still gets both
    # columns (as empty lists) instead of missing values.
    dict_cal['not_human_start_list'] = not_human_start_list
    dict_cal['not_human_stop_list'] = not_human_stop_list

    # Calculate percentages (fraction of CDS; 0 when the virus has no CDS)
    for key in COUNT_KEYS:
        new_key = key.replace('#', '%')
        dict_cal[new_key] = dict_cal[key] / n_total_cds if n_total_cds > 0 else 0

    # Organise into final dictionary
    res_t.update(dict_cal)
    dict_res[c] = res_t
    c += 1

# Save as csv file
df_res = pd.DataFrame.from_dict(dict_res, 'index')
ss.save_file(df_res, 'Data//ALL_VIRUS_start_stop')
