### Import basic requirements

In [1]:
### Import basic requirements

# Public modules
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm as tb

# Self-made utils
from Utils import ss_utils as ss 
from Utils import ss_database as db


### 1. Downloaded csv file from NCBI viral genome browser to ALL_VIRUS_DATABASE.pickle

In [None]:
### 1. Organise information into a general dictionary from the downloaded csv file
#
# Every row of the table is classified as a complete genome, the header row of
# a segmented genome, one segment of an already-registered segmented genome, a
# family heading, or an unclassifiable row (kept in error_list for inspection).
# The merged result is handed to ss.database_save at the end.

# Input csv file downloaded from NCBI genome browser
# (header=1: the second line of the file holds the column names)
VGB = pd.read_csv('Data//Viral_genome_browser.csv', header=1)

# Initiate empty variables
dict_final_result = {}
error_list = []
family_list = []
Family_name = ''
dict_out_single_virus = {}
dict_virus = {}

for c in tb(range(VGB.shape[0])):

    # Convert df row to dict
    dict_run_line = VGB.iloc[c].to_dict()
    dict_out_single_virus = {}

    # Indicator values used to tell the different kinds of rows apart.
    # NOTE(review): list.count only matches cells that are the np.nan object
    # itself (nan != nan), so this relies on pandas handing back that exact
    # object for every missing cell - confirm before changing the thresholds.
    dv_nan_num = list(dict_run_line.values()).count(np.nan)
    dv_genome_neighbors = dict_run_line['Genome'].count('neighbors:')
    dv_genome_proteins = dict_run_line['Genome'].count('proteins:')
    dv_accession = dict_run_line['Accession']

    # Rows that describe a whole virus have few empty cells and no
    # 'neighbors:' annotation inside the Genome cell.
    is_virus_row = dv_nan_num < 3 and dv_genome_neighbors == 0

    # determine as complete virus genome
    if is_virus_row and dv_accession != '-':
        dict_run_line['Genome'] = ss.del_head_space(dict_run_line['Genome'])
        virus_name = dict_run_line['Genome']
        print (virus_name + ' @ complete')
        dict_run_line['Host'] = str(dict_run_line['Host']).split(', ')
        dict_virus = dict_run_line.copy()
        dict_out_single_virus[virus_name] = dict_run_line

    # determine as segmented virus genome (header row, accession is '-')
    elif is_virus_row and dv_accession == '-':
        dict_run_line['Genome'] = ss.del_head_space(dict_run_line['Genome'])
        # The name must be taken from the current row *before* it is printed;
        # otherwise the log shows the previous virus (or fails with NameError
        # when the very first genome row is a segmented one).
        virus_name = dict_run_line['Genome']
        print (virus_name + ' @ segmented')
        # Per-segment values are collected in lists; the segment rows that
        # follow append to them.
        dict_run_line['Accession'] = []
        dict_run_line['Genome length'] = [dict_run_line['Genome length']]
        dict_run_line['Number of proteins'] = [dict_run_line['Number of proteins']]
        dict_run_line['Genome Neighbors'] = [dict_run_line['Genome Neighbors']]
        dict_run_line['Host'] = str(dict_run_line['Host']).split(', ')
        dict_virus = dict_run_line.copy()
        dict_out_single_virus[virus_name] = dict_virus

    # determine as segments of segmented virus genome
    elif (dv_nan_num > 8
          and (dv_genome_proteins > 0 or dv_genome_neighbors > 0)
          and dv_accession != '-'):
        # The Genome cell is split on five spaces into:
        # [parent virus name, segment length, accession, optional annotations...]
        info_list = ss.del_head_space(dict_run_line['Genome']).split('     ')
        # Shallow copy: the appends below extend the lists of the entry that is
        # already stored in dict_final_result for the parent virus.
        dict_virus = dict_final_result[info_list[0]].copy()
        accession = str(info_list[2])
        dict_virus['Accession'].append(accession)
        dict_virus['Genome length'].append([accession,info_list[1]])
        if len(info_list) == 4:
            # Exactly one annotation: either the protein or the neighbor count
            if info_list[3].count('proteins') > 0:
                dict_virus['Number of proteins'].append([accession,info_list[3].replace('proteins: ','')])
            else:
                dict_virus['Genome Neighbors'].append([accession,info_list[3].replace('neighbors: ','')])
        elif len(info_list) > 4:
            # Both annotations present: proteins first, then neighbors
            dict_virus['Number of proteins'].append([accession,info_list[3].replace('proteins: ','')])
            dict_virus['Genome Neighbors'].append([accession,info_list[4].replace('neighbors: ','')])
        dict_out_single_virus[info_list[0]] = dict_virus

    # Gathering family info (mostly empty row without any annotation)
    elif dv_nan_num > 8 and dv_genome_neighbors == 0 and dv_genome_proteins == 0:
        family_list.append(ss.del_head_space(dict_run_line['Genome']))

    # Anything else could not be classified
    else:
        error_list.append(dict_run_line)

    # Merge non-empty sub-dictionary into the final result
    if len(dict_out_single_virus) != 0:
        dict_final_result = ss.merge_dict(dict_final_result, dict_out_single_virus)

# Save the data into 'ALL_VIRUS_DATABASE.pickle'
ss.database_save(dict_final_result, 'Data//ALL_VIRUS_DATABASE')


### 2. ALL_VIRUS_DATABASE.pickle to ALL_VIRUS_COMBINED_CDS.pickle

In [2]:
### 2. ALL_VIRUS_DATABASE.pickle to ALL_VIRUS_COMBINED_CDS.pickle
#
# For every virus in the database, fetch its combined CDS through the
# ss helpers and collect the results keyed by virus name.

# Organised virus records written by step 1
VD = ss.database_load('Data//ALL_VIRUS_DATABASE')

# Working containers
dict_t = {}
dict_split_list = {}
dict_final = {}
finish_list = []

# Viruses to process: all database keys passed through ss.list_remove_list
# together with finish_list (presumably dropping already finished entries)
virus_list = list(VD.keys())
virus_list_run = ss.list_remove_list(virus_list,finish_list)

for i in tb(virus_list_run):
    accession = VD[i]['Accession']
    accession_type = type(accession)

    if accession_type is str:
        # Complete virus: a single accession string
        dict_t[i] = ss.get_combined_cds(accession)[accession]
    elif accession_type is list:
        # Segmented virus: a list of segment accessions
        dict_t[i] = ss.segmented_get_combined_cds({i:accession})[i]

    # Fold this virus into the overall result and reset the scratch dict
    dict_final = ss.merge_dict(dict_final,dict_t)
    dict_t = {}
    finish_list.append(i)

    # Optional checkpoint, disabled: temporarily save every 100 viruses
    #if len(finish_list)/100 - round(len(finish_list)/100) == 0:
    #    ss.database_save(dict_final,'@@@ALL_VIRUS_COMBINED_CDS')
    #    ss.database_save(finish_list,'finish_list@@@ALL_VIRUS_COMBINED_CDS')

ss.database_save(dict_final,'Data//ALL_VIRUS_COMBINED_CDS')

### 3. ALL_VIRUS_COMBINED_CDS.pickle to ALL_VIRUS_CODON_USAGE.pickle

In [None]:
### 3. ALL_VIRUS_COMBINED_CDS.pickle to ALL_VIRUS_CODON_USAGE.pickle
#
# Compute the codon usage of every virus from its combined CDS; viruses with
# an empty CDS receive a copy of db.empty_codon_usage instead.

# Combined CDS per virus, as written by step 2
VCC = ss.database_load('Data//ALL_VIRUS_COMBINED_CDS','System')

# Working containers
dict_final = {}
finish_list = []

virus_list = list(VCC.keys())
virus_list_run = ss.list_remove_list(virus_list, finish_list)

dict_t = {}
for i in tb(virus_list_run):
    cds_seq = VCC[i]

    # Nothing to count for an empty sequence -> use the empty template
    dict_t[i] = (ss.codon_usage(cds_seq)
                 if len(cds_seq) != 0
                 else db.empty_codon_usage.copy())

    # Fold this virus into the overall result and reset the scratch dict
    dict_final = ss.merge_dict(dict_final,dict_t)
    dict_t = {}
    finish_list.append(i)

    # Optional checkpoint, disabled: temporarily save every 100 viruses
    #if len(finish_list)/100 - round(len(finish_list)/100) == 0:
    #    ss.database_save(dict_final,'@@@ALL_VIRUS_COMBINED_CDS','Output')
    #    ss.database_save(finish_list,'finish_list@@@ALL_VIRUS_COMBINED_CDS','Output')

ss.database_save(dict_final, 'Data//ALL_VIRUS_CODON_USAGE')



In [None]:
### 3. Organise into dataframe
#
# One row per virus: an 'id' column followed by that virus' codon usage values.

ALL_VIRUS_CODON_USAGE = ss.database_load('Data//ALL_VIRUS_CODON_USAGE')

# Build one record per virus, 'id' first so it becomes the leading column
cu_records = []
for virus, cu in tb(ALL_VIRUS_CODON_USAGE.items()):
    record = {'id':virus}
    record.update(cu)
    cu_records.append(record)

# Key the records by their running row number (0, 1, 2, ...)
dict_cu = dict(enumerate(cu_records))

df_cu = pd.DataFrame.from_dict(dict_cu, 'index')
ss.save_file(df_cu, 'Data//ALL_VIRUS_CODON_USAGE')


In [None]:
### 3. Calculate sum of codon usage - to later find out genomes with non-ATGC nt
#
# Row-wise total over the codon columns listed in db.codon_table_list.

ALL_VIRUS_CODON_USAGE = pd.read_csv('Data//ALL_VIRUS_CODON_USAGE.csv')

# Sum across the codon columns of each virus (axis=1 -> one total per row)
codon_totals = ALL_VIRUS_CODON_USAGE[db.codon_table_list].sum(axis=1)

df_res = pd.DataFrame({'id': ALL_VIRUS_CODON_USAGE['id'],
                       'CU_sum': codon_totals})
ss.save_file(df_res, 'Data//ALL_VIRUS_CODON_USAGE_sum')

### 4. ALL_VIRUS_COMBINED_CDS.pickle to ALL_VIRUS_AA_USAGE.pickle

In [None]:
### 4. ALL_VIRUS_COMBINED_CDS.pickle to ALL_VIRUS_AA_USAGE.pickle
#
# Apply ss.aa_usage to the combined CDS of every virus.

# Combined CDS per virus, as written by step 2
ALL_VIRUS_COMBINED_CDS = ss.database_load('Data//ALL_VIRUS_COMBINED_CDS')

# Main: virus name -> result of ss.aa_usage on its combined CDS
dict_final = {
    virus: ss.aa_usage(ALL_VIRUS_COMBINED_CDS[virus])
    for virus in ALL_VIRUS_COMBINED_CDS
}

ss.database_save(dict_final, 'Data//ALL_VIRUS_AA_USAGE')

In [None]:
### 4. Organise into dataframe
#
# One row per virus: an 'id' column followed by that virus' amino acid usage.
ALL_VIRUS_AA_USAGE = ss.database_load('Data//ALL_VIRUS_AA_USAGE')

dict_aau = {}
for row_number, (virus, aau) in enumerate(tb(ALL_VIRUS_AA_USAGE.items())):
    # 'id' goes in first so it ends up as the leading column
    row = {'id':virus}
    row.update(aau)
    dict_aau[row_number] = row

df_aau = pd.DataFrame.from_dict(dict_aau, 'index')
ss.save_file(df_aau, 'Data//ALL_VIRUS_AA_USAGE')

### 5. ALL_VIRUS_CODON_USAGE.pickle to ALL_VIRUS_RSCU.pickle

In [None]:
### 5. ALL_VIRUS_CODON_USAGE.pickle to ALL_VIRUS_RSCU.pickle
#
# Convert the codon usage of every virus with ss.codon_usage_to_RSCU_usage.

# Codon usage (CU) per virus, as written by step 3
ALL_VIRUS_CODON_USAGE = ss.database_load('Data//ALL_VIRUS_CODON_USAGE')

# Main: virus name -> RSCU derived from its codon usage
virus_list = list(ALL_VIRUS_CODON_USAGE.keys())
dict_final = {
    virus_name: ss.codon_usage_to_RSCU_usage(ALL_VIRUS_CODON_USAGE[virus_name])
    for virus_name in tb(virus_list)
}

ss.database_save(dict_final, 'Data//ALL_VIRUS_RSCU')

In [None]:
### 5. Organise into dataframe
#
# One row per virus: an 'id' column followed by that virus' RSCU values.

ALL_VIRUS_RSCU = ss.database_load('Data//ALL_VIRUS_RSCU')

# Build one record per virus, 'id' first so it becomes the leading column
rscu_records = []
for virus, rscu in tb(ALL_VIRUS_RSCU.items()):
    record = {'id':virus}
    record.update(rscu)
    rscu_records.append(record)

# Key the records by their running row number (0, 1, 2, ...)
dict_rscu = dict(enumerate(rscu_records))

df_rscu = pd.DataFrame.from_dict(dict_rscu, 'index')
ss.save_file(df_rscu, 'Data//ALL_VIRUS_RSCU')

### 6. ALL_VIRUS_COMBINED_CDS.pickle to ALL_VIRUS_ATGC.csv

In [None]:
### 6. ALL_VIRUS_COMBINED_CDS.pickle to ALL_VIRUS_ATGC.csv
#
# One row per virus: an 'id' column followed by the output of ss.ATGC_cal
# for that virus' combined CDS.

# Combined CDS per virus, as written by step 2
ALL_VIRUS_COMBINED_CDS = ss.database_load('Data//ALL_VIRUS_COMBINED_CDS')

# Main
dict_res = {}
for row_number, virus in enumerate(ALL_VIRUS_COMBINED_CDS):
    # 'id' goes in first so it ends up as the leading column
    row = {'id':virus}
    row.update(ss.ATGC_cal(ALL_VIRUS_COMBINED_CDS[virus]))
    dict_res[row_number] = row

# Turn into dataframe and save as csv file
df_res = pd.DataFrame.from_dict(dict_res, 'index')
ss.save_file(df_res, 'Data//ALL_VIRUS_ATGC')


### 7. ALL_VIRUS_CODON_USAGE.csv / ALL_VIRUS_RSCU.csv / ALL_VIRUS_AA_USAGE.csv to ALL_VIRUS_Human_Corr.csv

In [None]:
### 7. ALL_VIRUS_CODON_USAGE.csv / ALL_VIRUS_AA_USAGE.csv / ALL_VIRUS_RSCU.csv to ALL_VIRUS_Human_Corr.csv
#
# For every virus, correlate its CU, AAU and RSCU profiles with the
# corresponding homo sapiens profiles from db.


def _profile_values(usage, keys):
    """Return the values of ss.dict_extract_list(usage, keys) as a list."""
    return list(ss.dict_extract_list(usage, keys).values())


# Input CU, AAU, RSCU of virus
ALL_VIRUS_CODON_USAGE = ss.database_load('Data//ALL_VIRUS_CODON_USAGE')
ALL_VIRUS_AA_USAGE = ss.database_load('Data//ALL_VIRUS_AA_USAGE')
ALL_VIRUS_RSCU = ss.database_load('Data//ALL_VIRUS_RSCU')

# Input ALL_VIRUS_CODON_USAGE.csv for final virus order check
ALL_VIRUS_CODON_USAGE_csv = pd.read_csv('Data//ALL_VIRUS_CODON_USAGE.csv')

# Input CU, AAU, RSCU of homo sapiens
homo_sapiens_codon_usage = db.homo_sapiens_codon_usage
homo_sapiens_aa_usage = db.homo_sapiens_aa_usage
homo_sapiens_RSCU = db.homo_sapiens_RSCU

# Reference profiles, restricted to the valid entries of each measure
hs_cu = _profile_values(homo_sapiens_codon_usage, db.CU_valid_codon_list)
hs_aau = _profile_values(homo_sapiens_aa_usage, db.AAU_valid_codon_list)
hs_rscu = _profile_values(homo_sapiens_RSCU, db.RSCU_valid_codon_list)

# Main
corr_rows = []
for virus in tb(ALL_VIRUS_CODON_USAGE):

    # Virus profiles, extracted with the same key lists as the reference
    virus_cu = _profile_values(ALL_VIRUS_CODON_USAGE[virus], db.CU_valid_codon_list)
    virus_aau = _profile_values(ALL_VIRUS_AA_USAGE[virus], db.AAU_valid_codon_list)
    virus_rscu = _profile_values(ALL_VIRUS_RSCU[virus], db.RSCU_valid_codon_list)

    # Off-diagonal element [1,0] of the 2x2 matrix is the virus/human correlation
    corr_rows.append({
        'id': virus,
        'homo_sapiens_CU_corr': np.corrcoef(virus_cu, hs_cu)[1,0],
        'homo_sapiens_AAU_corr': np.corrcoef(virus_aau, hs_aau)[1,0],
        'homo_sapiens_RSCU_corr': np.corrcoef(virus_rscu, hs_rscu)[1,0],
    })

# Key the rows by their running row number (0, 1, 2, ...)
dict_res = dict(enumerate(corr_rows))

# Convert into dataframe
df_res = pd.DataFrame.from_dict(dict_res, 'index')

# Check output virus order against the csv written in step 3
if df_res['id'].tolist() != ALL_VIRUS_CODON_USAGE_csv['id'].tolist():
    raise ValueError('Virus order wrong.')
print ('Virus order correct.')

# Save as csv file
ss.save_file(df_res, 'Data//ALL_VIRUS_Human_Corr')

### 8. ALL_VIRUS_CODON_USAGE.csv / ALL_VIRUS_RSCU.csv / ALL_VIRUS_AA_USAGE.csv to ALL_VIRUS_Human_Corr_AA.csv

In [24]:
### 8. ALL_VIRUS_CODON_USAGE.csv / ALL_VIRUS_RSCU.csv / ALL_VIRUS_AA_USAGE.csv to ALL_VIRUS_Human_Corr_AA.csv
#
# For every virus and every amino acid in db.aa_list_with_2plus_codons,
# correlate the virus' CU and RSCU values over that amino acid's codons
# with the homo sapiens values.


def _codon_values(usage, codons):
    """Return the values of ss.dict_extract_list(usage, codons) as a list."""
    return list(ss.dict_extract_list(usage, codons).values())


# Input CU, RSCU of virus
ALL_VIRUS_CODON_USAGE = ss.database_load('Data//ALL_VIRUS_CODON_USAGE')
ALL_VIRUS_RSCU = ss.database_load('Data//ALL_VIRUS_RSCU')

# Input ALL_VIRUS_CODON_USAGE.csv for final virus order check
ALL_VIRUS_CODON_USAGE_csv = pd.read_csv('Data//ALL_VIRUS_CODON_USAGE.csv')

# Input CU, RSCU of homo sapiens
homo_sapiens_codon_usage = db.homo_sapiens_codon_usage
homo_sapiens_RSCU = db.homo_sapiens_RSCU


# Main
corr_rows = []
for virus in tb(ALL_VIRUS_CODON_USAGE):

    # Get virus cu dict and rscu dict
    virus_cu = ALL_VIRUS_CODON_USAGE[virus]
    virus_rscu = ALL_VIRUS_RSCU[virus]

    # CU and RSCU results are gathered separately so that all CU columns
    # come before all RSCU columns in the output
    res_cu_t = {}
    res_rscu_t = {}
    for aa in db.aa_list_with_2plus_codons:

        # Codons that translate to this amino acid
        codon_list = db.translation_aa_codon_dict[aa]

        # Pearson correlation coefficient: off-diagonal element [1,0]
        res_cu_t[f"homo_sapiens_CU_{aa}"] = np.corrcoef(
            _codon_values(virus_cu, codon_list),
            _codon_values(homo_sapiens_codon_usage, codon_list))[1,0]
        res_rscu_t[f"homo_sapiens_RSCU_{aa}"] = np.corrcoef(
            _codon_values(virus_rscu, codon_list),
            _codon_values(homo_sapiens_RSCU, codon_list))[1,0]

    # Organise into final row: id, then CU columns, then RSCU columns
    corr_rows.append({'id':virus, **res_cu_t, **res_rscu_t})

# Key the rows by their running row number (0, 1, 2, ...)
dict_res = dict(enumerate(corr_rows))

# Convert into dataframe
df_res = pd.DataFrame.from_dict(dict_res, 'index')

# Check output virus order against the csv written in step 3
if df_res['id'].tolist() != ALL_VIRUS_CODON_USAGE_csv['id'].tolist():
    raise ValueError('Virus order wrong.')
print ('Virus order correct.')

# Save as csv file
ss.save_file(df_res, 'Data//ALL_VIRUS_Human_Corr_AA')
