In [None]:
### Import packages
import pandas as pd
import numpy as np
import multiprocessing
import tqdm
import time
from Utils import ss_utils as ss 
from Utils import ss_database as db

### 1. Download COVID-19 US data by NCBI accession and calculate codon usage biases

In [None]:
# Resume switch: the accumulator dicts are only (re)created further down when this is False
Previous_Data = False

# Number of worker processes handed to multiprocessing.Pool
CPU_USED = 20

# Accession table downloaded from NCBI
df_download = pd.read_csv('1. Downloaded raw data//NCBI_Virus_Screenshots//sequences.csv')

# (row position, accession) pairs; each worker looks up its accession by position
Index_Accession_List = [(position, accession)
                        for position, accession in enumerate(df_download['Accession'])]

# One task index per accession
Run_List = [task_index for task_index, _ in Index_Accession_List]

# Worker: download the CDS records of one accession and derive codon-usage statistics
def PROCESS(RunList_Index):
    """Compute codon usage, RSCU and CDS-length statistics for one accession.

    Parameters
    ----------
    RunList_Index : int
        Position of the accession inside the module-level ``Index_Accession_List``.

    Returns
    -------
    tuple
        ``(RunList_Index, cu, rscu, cds_len_dict, accession_id, cu_sum)`` where
        ``cu`` is the result of ``ss.codon_usage`` on the concatenated CDS,
        ``rscu`` the result of ``ss.codon_usage_to_RSCU_usage(cu)``,
        ``cds_len_dict`` the renamed output of ``ss.discriptive_data`` and
        ``cu_sum`` the sum of all values in ``cu``.

    Notes
    -----
    As soon as one CDS has a length that is not a multiple of 3, everything
    collected so far is discarded and the statistics are computed on an empty
    sequence / empty length list.
    """
    _position, accession_id = Index_Accession_List[RunList_Index]

    # Download CDS details for this accession
    all_cds_detail_dict = ss.get_all_cds_detail(accession_id)

    conten_cds = ''
    cds_len_list = []
    for cds_detail_dict in all_cds_detail_dict.values():
        cds_seq = cds_detail_dict['mRNA_sequence']
        seq_length = len(cds_seq)

        if seq_length % 3 != 0:
            # Incomplete reading frame: drop the whole accession's CDS data
            conten_cds = ''
            cds_len_list = []
            break

        cds_len_list.append(seq_length)
        conten_cds += cds_seq   # Build the concatenated CDS

    # Codon usage of the concatenated CDS and its total count
    cu = ss.codon_usage(conten_cds)
    cu_sum = sum(cu.values())

    # Relative synonymous codon usage derived from the codon usage
    rscu = ss.codon_usage_to_RSCU_usage(cu)

    # CDS length statistics, renamed to the output column names
    dis_data = ss.discriptive_data(cds_len_list)
    output_to_stat = {'contenCDS_length'   : 'sum',
                      'cds_count'          : 'count',
                      'cds_length_mean'    : 'mean',
                      'cds_length_std'     : 'std',
                      'cds_length_median'  : 'median',
                      'cds_length_variance': 'variance'}
    cds_len_dict = {output_key: dis_data[stat_key]
                    for output_key, stat_key in output_to_stat.items()}

    return RunList_Index, cu, rscu, cds_len_dict, accession_id, cu_sum

# Create output folder
foldername = '1. Downloaded raw data//COVID19'
ss.create_folder(foldername)

# Start from empty accumulators unless results of a previous run are being reused.
# NOTE(review): nothing in this cell defines these dicts when Previous_Data is True;
# they must then already exist in the kernel, otherwise a NameError follows.
if Previous_Data is False:
    Final_CU_Dict, Final_RSCU_Dict, Final_CDS_Len_Dict, Accession_ID_Dict, Final_CU_Sum_Dict = {}, {}, {}, {}, {}

# (accumulator dict, ss.database_save name) pairs, listed in saving order
_SAVE_TARGETS = [
    (Final_CU_Dict,      "Final_CU_Dict"),
    (Final_RSCU_Dict,    "Final_RSCU_Dict"),
    (Final_CDS_Len_Dict, "Final_CDS_Len_Dict"),
    (Accession_ID_Dict,  "Accession_ID_Dict"),
    (Final_CU_Sum_Dict,  "CU_Sum_Dict"),
]


def _save_accumulators(targets, folder):
    """Write every (dict, name) pair in ``targets`` via ``ss.database_save`` under ``folder``."""
    for accumulated, save_name in targets:
        ss.database_save(accumulated, f"{folder}//{save_name}", 'Output')


# Multiprocessing program
# The pool is a context manager so its worker processes are always released,
# even when a worker raises; previously the pool was never closed.
with multiprocessing.Pool(processes = CPU_USED) as POOL:

    for _RESULT_ in tqdm.tqdm(POOL.imap_unordered(PROCESS, Run_List),
                              total = len(Run_List), miniters=50):

        RunList_Index, covid_cu_dict, covid_rscu_dict, covid_cds_len_dict, accession_id, cu_sum = _RESULT_

        Final_CU_Dict[RunList_Index] = covid_cu_dict
        Final_RSCU_Dict[RunList_Index] = covid_rscu_dict
        Final_CDS_Len_Dict[RunList_Index] = covid_cds_len_dict
        Accession_ID_Dict[RunList_Index] = accession_id
        Final_CU_Sum_Dict[RunList_Index] = cu_sum

        # Temporarily save data (checkpoint) whenever a result with an index
        # that is a multiple of 200 arrives
        if RunList_Index % 200 == 0:
            _save_accumulators(_SAVE_TARGETS, foldername)

# Final save of the accumulator dicts
_save_accumulators(_SAVE_TARGETS, foldername)

# Convert to dataframes (one row per RunList_Index)
DF_Final_CU = pd.DataFrame.from_dict(Final_CU_Dict, orient='index')
DF_Final_RSCU = pd.DataFrame.from_dict(Final_RSCU_Dict, orient='index')
DF_Final_CDS_Len = pd.DataFrame.from_dict(Final_CDS_Len_Dict, orient='index')
DF_Final_Accession_ID = pd.DataFrame.from_dict(Accession_ID_Dict, orient='index')
DF_Final_CU_Sum = pd.DataFrame.from_dict(Final_CU_Sum_Dict, orient='index')

# Save the dataframes
ss.save_file(DF_Final_CU, f"{foldername}//DF_Final_CU")
ss.save_file(DF_Final_RSCU, f"{foldername}//DF_Final_RSCU")
ss.save_file(DF_Final_CDS_Len, f"{foldername}//DF_Final_CDS_Len")
ss.save_file(DF_Final_Accession_ID, f"{foldername}//DF_Final_Accession_ID")
ss.save_file(DF_Final_CU_Sum, f"{foldername}//DF_Final_CU_Sum")

print ('Done and save.')


### 2. Download Genome Length (required for CDS_Length dataset)

In [None]:

# Resume switch and the (fresh) result container for this stage
Previous_Data = False
Final_Genome_Len = {}

# Accession ids saved by the previous stage
COVID_Accession_ID = pd.read_csv("1. Downloaded raw data//COVID19//Final_Data//DF_Final_Accession_ID.csv")

# (row position, accession) pairs and one task index per accession
Index_Accession_List = [(position, accession)
                        for position, accession in enumerate(COVID_Accession_ID['id'])]
Run_List = [task_index for task_index, _ in Index_Accession_List]

# When resuming, skip the tasks before Index_Out.
# NOTE(review): Index_Out is not assigned in the visible notebook; it has to be
# defined before enabling Previous_Data - TODO confirm where it comes from.
if Previous_Data is True:
    Run_List = Run_List[Index_Out:]
    print (f"Run_List rearranged at Index-{Index_Out}.")

# Output location and number of worker processes for this stage
foldername = "1. Downloaded raw data//COVID19"
CPU_USED = 20


# Worker: download the whole sequence of one accession and report its length
def PROCESS(RunList_Index, max_trials=3, retry_delay=1):
    """Download one genome and return its length.

    Parameters
    ----------
    RunList_Index : int
        Position of the accession inside the module-level ``Index_Accession_List``.
    max_trials : int, optional
        Maximum number of download attempts (default 3).
    retry_delay : float, optional
        Seconds to wait between two attempts (default 1).

    Returns
    -------
    tuple
        ``(RunList_Index, {'genome_length': length})``. ``length`` is 0 when
        every download attempt failed.
    """
    _position, accession_id = Index_Accession_List[RunList_Index]

    seq = ''   # fallback when all attempts fail -> genome_length 0
    for attempt in range(max_trials):
        try:
            # Download whole sequences; the result is indexed by accession id
            seq = ss.get_whole_seq(accession_id)[accession_id]
            break
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt / SystemExit
            # must propagate so the run can actually be stopped.
            if attempt + 1 < max_trials:
                time.sleep(retry_delay)

    return RunList_Index, {'genome_length': len(seq)}

# Multiprocessing program
# The pool is a context manager so its worker processes are always released,
# even when a worker raises; previously the pool was never closed.
with multiprocessing.Pool(processes = CPU_USED) as POOL:
    for _RESULT_ in tqdm.tqdm(POOL.imap_unordered(PROCESS, Run_List),
                              total = len(Run_List), miniters=50):

        RunList_Index, output = _RESULT_
        Final_Genome_Len[RunList_Index] = output

        # Temporarily save data (checkpoint) whenever a result with an index
        # that is a multiple of 200 arrives
        if RunList_Index % 200 == 0:
            ss.database_save(Final_Genome_Len, f"{foldername}//Final_Genome_Len", 'Output')

# Final save of the raw dict, then convert to a dataframe
# (one row per RunList_Index) and save that as well
ss.database_save(Final_Genome_Len, f"{foldername}//Final_Genome_Len", 'Output')
DF_Final_Genome_Len = pd.DataFrame.from_dict(Final_Genome_Len, orient='index')
ss.save_file(DF_Final_Genome_Len, f"{foldername}//DF_Final_Genome_Len")
print ('Done and save.')

### 3. Organise into 'RTG_Data'

In [None]:

# Load the reference RTG tables
RSCU, TAXONOMY, CDS_LENGTH, INFO = (
    ss.load_RTG_Data(_table_name)
    for _table_name in ("RSCU", "TAXONOMY", "CDS_LENGTH", "INFO")
)

# Load the COVID-19 tables produced by the download stages.
# NOTE(review): these are read from "Raw_Data" while the genome-length stage reads
# DF_Final_Accession_ID.csv from "Final_Data" - confirm which folder is intended.
COVID_RSCU = pd.read_csv("1. Downloaded raw data//COVID19//Raw_Data//DF_Final_RSCU.csv")
COVID_CU_Sum = pd.read_csv("1. Downloaded raw data//COVID19//Raw_Data//DF_Final_CU_Sum.csv")
COVID_CU = pd.read_csv("1. Downloaded raw data//COVID19//Raw_Data//DF_Final_CU.csv")
COVID_CDS_Len = pd.read_csv("1. Downloaded raw data//COVID19//Raw_Data//DF_Final_CDS_Len.csv")
COVID_Accession_ID = pd.read_csv("1. Downloaded raw data//COVID19//Raw_Data//DF_Final_Accession_ID.csv")
COVID_Genome_Len = pd.read_csv("1. Downloaded raw data//COVID19/Raw_Data//DF_Final_Genome_Len.csv")


# Organise RSCU data to X-ready:
# keep the codon columns listed in db.multibox_codon_table_list (in that order)
# and prefix each column name with "[RSCU]_"
ref_rscu_col = {codon: f"[RSCU]_{codon}" for codon in db.multibox_codon_table_list}
COVID_RSCU = COVID_RSCU[db.multibox_codon_table_list].rename(columns=ref_rscu_col)
print (f"RSCU columns check = {COVID_RSCU.columns.tolist() == RSCU.columns.tolist()}")

# Organise Taxonomy data to X-ready:
# take the TAXONOMY rows whose INFO 'id' equals the SARS-CoV-2 name and repeat
# each of them once per COVID accession
COVID_Name = 'Severe acute respiratory syndrome coronavirus 2'
COVID_Index = (INFO['id'] == COVID_Name)
COVID_TAXONOMY = TAXONOMY[COVID_Index].reset_index(drop=True)
_repeated_rows = COVID_TAXONOMY.index.repeat(len(COVID_Accession_ID))
COVID_TAXONOMY = COVID_TAXONOMY.loc[_repeated_rows].reset_index(drop=True)
print (f"TAXONOMY columns check = {COVID_TAXONOMY.columns.tolist() == TAXONOMY.columns.tolist()}")

# Organise CDS_Len data to X-ready:
# genome length table first, followed by every reference column except the first one
# NOTE(review): CDS_LENGTH was already loaded above; this reload looks redundant.
CDS_LENGTH = ss.load_RTG_Data("CDS_LENGTH")
CDS_Len_col_list = CDS_LENGTH.columns.tolist()
_cds_stat_columns = CDS_Len_col_list[1:]
COVID_CDS_Len = pd.concat([COVID_Genome_Len, COVID_CDS_Len[_cds_stat_columns]], axis=1)
print (f"CDS_Len columns check = {COVID_CDS_Len.columns.tolist() == CDS_LENGTH.columns.tolist()}")


# Save into dataframes
foldername = '1. Downloaded raw data//COVID19//RTG_Data'
ss.create_folder(foldername)
for _frame, _save_name in ((COVID_RSCU, "RSCU"),
                           (COVID_TAXONOMY, "TAXONOMY"),
                           (COVID_CDS_Len, "CDS_LENGTH")):
    ss.save_file(_frame, f"{foldername}//{_save_name}")

