### Import basic requirements

In [1]:
### Import basic requirements

# Public modules
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm as tb

# Self-made utils
from Utils import ss_utils as ss 
from Utils import ss_database as db


In [2]:
# Load the per-virus CSV tables.
# Genome-level metadata and quality flags (used to build the genome filter)
ALL_VIRUS_info = pd.read_csv('Data//ALL_VIRUS_info.csv')
ALL_VIRUS_CODON_USAGE_sum = pd.read_csv('Data//ALL_VIRUS_CODON_USAGE_sum.csv')
ALL_VIRUS_CDS_error = pd.read_csv('Data//ALL_VIRUS_CDS_error.csv')

# Codon usage, amino acid usage and RSCU tables
ALL_VIRUS_CODON_USAGE = pd.read_csv('Data//ALL_VIRUS_CODON_USAGE.csv')
ALL_VIRUS_AA_USAGE = pd.read_csv('Data//ALL_VIRUS_AA_USAGE.csv')
ALL_VIRUS_RSCU = pd.read_csv('Data//ALL_VIRUS_RSCU.csv')

# Hosts, lengths, nucleotide composition, start/stop codons, correlation to human
ALL_VIRUS_hosts = pd.read_csv('Data//ALL_VIRUS_hosts.csv')
# NOTE: the CDS length table is read from the *genome_length* CSV file
ALL_VIRUS_cds_length = pd.read_csv('Data//ALL_VIRUS_genome_length.csv')
ALL_VIRUS_ATGC = pd.read_csv('Data//ALL_VIRUS_ATGC.csv')
ALL_VIRUS_start_stop = pd.read_csv('Data//ALL_VIRUS_start_stop.csv')
ALL_VIRUS_Human_Corr = pd.read_csv('Data//ALL_VIRUS_Human_Corr.csv')
ALL_VIRUS_Human_Corr_AA = pd.read_csv('Data//ALL_VIRUS_Human_Corr_AA.csv')

# Verify the virus ID order across all loaded tables before they are filtered together
virus_tables = (
    ALL_VIRUS_info,
    ALL_VIRUS_CODON_USAGE_sum,
    ALL_VIRUS_CDS_error,
    ALL_VIRUS_CODON_USAGE,
    ALL_VIRUS_AA_USAGE,
    ALL_VIRUS_RSCU,
    ALL_VIRUS_hosts,
    ALL_VIRUS_cds_length,
    ALL_VIRUS_ATGC,
    ALL_VIRUS_start_stop,
    ALL_VIRUS_Human_Corr,
    ALL_VIRUS_Human_Corr_AA,
)
ss.verify_id_col(*virus_tables)


Virus ID order is correct.


In [3]:
### Extract virus genomes of interest

# A genome is kept only when all three criteria hold:
#   - RefSeq type is 'complete', for consistent comparison of whole virus genomes
#   - codon usage sum is at least 0.9, discarding sequences with too many non-ATGC nucleotides
#   - no CDS error is flagged, i.e. no CDS whose length cannot be divided by 3
is_complete = ALL_VIRUS_info['RefSeq type'] == 'complete'
has_valid_cu_sum = ALL_VIRUS_CODON_USAGE_sum['CU_sum'] >= 0.9
has_no_cds_error = ALL_VIRUS_CDS_error['CDS_error'] == 'no'

# Boolean row mask applied to every table in the following cells
ind_final = is_complete & has_valid_cu_sum & has_no_cds_error


In [4]:
### Extract selected columns for later training models

# Codon usage (CU): selected rows, columns listed in db.CU_valid_codon_list
df_cu = ss.rename_dataset_columns(
    ALL_VIRUS_CODON_USAGE.loc[ind_final, db.CU_valid_codon_list], 'CU')

# Amino acid usage (AAU): selected rows, columns listed in db.AAU_valid_codon_list
df_aau = ss.rename_dataset_columns(
    ALL_VIRUS_AA_USAGE.loc[ind_final, db.AAU_valid_codon_list], 'AAU')

# RSCU: selected rows, columns listed in db.RSCU_valid_codon_list
df_rscu = ss.rename_dataset_columns(
    ALL_VIRUS_RSCU.loc[ind_final, db.RSCU_valid_codon_list], 'RSCU')

# Virus id / accession information, re-indexed from 0
df_info = ALL_VIRUS_info.loc[ind_final, ['id', 'Accession']].reset_index(drop=True)

# Virus host ranges: every column of the hosts table except 'id', re-indexed from 0
host_list = list(ALL_VIRUS_hosts.columns)
host_list.remove('id')
df_host = ALL_VIRUS_hosts.loc[ind_final, host_list].reset_index(drop=True)

# Dataframe of monopartite or multipartite
# Take the virus type and segment count of the selected genomes under shorter column names.
df_partite_temp = ALL_VIRUS_info.loc[ind_final, ['Virus type', 'Number of segments']].rename(
    columns={'Virus type': 'partite', 'Number of segments': 'n_segments'})

# Encode the 'partite' column with the project helper
# ('Unclassified' is passed through unchanged as its second argument).
df_partite = ss.taxonomy_encoder(df_partite_temp[['partite']], 'Unclassified')

# Attach the segment counts by position, not by index label. df_partite_temp still
# carries the filtered (non-contiguous) index of ALL_VIRUS_info, so assigning the Series
# directly would align on labels and silently produce NaN / shifted values if the encoder
# returns a frame with a fresh index. Assumes the encoder keeps row count and row order
# (a length mismatch now raises instead of passing silently) - TODO confirm.
df_partite['n_segments'] = df_partite_temp['n_segments'].to_numpy()

# CDS / genome length statistics of the selected genomes
cds_len_cols = ['genome_length', 'contenCDS_length', 'cds_count', 'cds_length_mean', 'cds_length_std']
df_cds_len = ALL_VIRUS_cds_length.loc[ind_final, cds_len_cols].reset_index(drop=True)

# Nucleotide composition (ATGC)
atgc_cols = ['A%', 'T%', 'G%', 'C%', 'AT%', 'GC%']
df_atgc = ALL_VIRUS_ATGC.loc[ind_final, atgc_cols].reset_index(drop=True)

# Start / stop codon columns
stst_cols = ['ATG%', 'TAA%', 'TAG%', 'TGA%',
             'human_start%', 'not_human_start%', 'human_stop%', 'not_human_stop%']
df_stst = ALL_VIRUS_start_stop.loc[ind_final, stst_cols].reset_index(drop=True)

# Correlation to human (CU, AAU and RSCU)
hs_corr_cols = ['homo_sapiens_CU_corr', 'homo_sapiens_AAU_corr', 'homo_sapiens_RSCU_corr']
df_hs_corr = ALL_VIRUS_Human_Corr.loc[ind_final, hs_corr_cols].reset_index(drop=True)

# Correlation to human amino acid: every column except 'id'; missing values become 0
corr_aa_list = list(ALL_VIRUS_Human_Corr_AA.columns)
corr_aa_list.remove('id')
df_hs_corr_aa = (
    ALL_VIRUS_Human_Corr_AA.loc[ind_final, corr_aa_list]
    .reset_index(drop=True)
    .fillna(0)
)


In [5]:
### Save all the RTG data

# (dataframe, destination) pairs, written in this order through the project's save helper
rtg_outputs = [
    (df_cu, 'RTG_Data//ALL_VIRUS//CU'),
    (df_aau, 'RTG_Data//ALL_VIRUS//AAU'),
    (df_rscu, 'RTG_Data//ALL_VIRUS//RSCU'),
    (df_info, 'RTG_Data//ALL_VIRUS//INFO'),
    (df_host, 'RTG_Data//ALL_VIRUS//HOST'),
    (df_partite, 'RTG_Data//ALL_VIRUS//PARTITE'),
    (df_cds_len, 'RTG_Data//ALL_VIRUS//CDS_LENGTH'),
    (df_atgc, 'RTG_Data//ALL_VIRUS//ATGC'),
    (df_stst, 'RTG_Data//ALL_VIRUS//START_STOP'),
    (df_hs_corr, 'RTG_Data//ALL_VIRUS//HS_CORR'),
    (df_hs_corr_aa, 'RTG_Data//ALL_VIRUS//HS_CORR_AA'),
]

for rtg_frame, rtg_path in rtg_outputs:
    ss.save_file(rtg_frame, rtg_path)