In [None]:
### Import packages
import pandas as pd
import numpy as np
from Utils import ss_utils as ss 
from Utils import ss_database as db

### 1. Pred_Proba for the human Host

In [None]:
# Read the three Ready-to-go (RTG) COVID-19 tables -- RSCU, TAXONOMY and
# CDS_LENGTH -- in that order, one CSV per table.
COVID_RSCU, COVID_TAXONOMY, COVID_CDS_LENGTH = (
    pd.read_csv(f"RTG_Data//COVID19_US//{Table}.csv")
    for Table in ('RSCU', 'TAXONOMY', 'CDS_LENGTH')
)

# Score the COVID-19 feature table with the two human-host random-forest models
# (one recall-optimised, one BA-optimised) and save one prediction table per model.

# Prepare X once: both models receive the same RSCU + TAXONOMY + CDS_LENGTH
# columns placed side by side.
X = pd.concat([COVID_RSCU, COVID_TAXONOMY, COVID_CDS_LENGTH], axis=1)

# Every row is labelled 1 (presumably 1 == 'human', matching the column rename
# below -- TODO confirm against the training label encoding).
y_true = [1] * X.shape[0]

# 'Recall' is processed first and 'BA' last, so the variables left in the kernel
# after this cell (Model, Pred_Proba, y_test, DF_Pred_Proba) hold the BA results.
for Optimised_Score in ['Recall', 'BA']:

    # Load the trained model optimised for the current score
    Model = ss.database_load(
        f"Models//RF_model_RSCUpTAXONOMYpCDS_LENGTH_human_for{Optimised_Score}",
        'Output')

    # Calculate Pred_Proba (class probabilities) and the hard class predictions.
    # NOTE: 'y_test' holds the model's predictions, not held-out labels; the name
    # is kept because it is also the column name written to the saved table.
    Pred_Proba = Model.predict_proba(X)
    y_test = Model.predict(X)

    # Organise into dataframe and save.
    # Columns 0/1 of predict_proba are named 'not_human'/'human'; this assumes the
    # model's class order is [not_human, human] -- verify via Model.classes_.
    DF_Pred_Proba = pd.DataFrame(Pred_Proba).rename(columns={0: 'not_human', 1: 'human'})
    DF_Pred_Proba['y_true'] = y_true
    DF_Pred_Proba['y_test'] = y_test

    ss.save_file(DF_Pred_Proba, f'COVID19//DF_Pred_Proba_RLT_{Optimised_Score}')

### 2. Pred_Proba for all Hosts

In [None]:
# Load Ready-to-go (RTG) data of RSCU, TAXONOMY and CDS_LENGTH.
# All three paths are relative to the current working directory. The RSCU path
# previously started with "/", which made it an absolute path from the
# filesystem root instead of the RTG_Data folder used by the other two tables.
COVID_RSCU = pd.read_csv("RTG_Data//COVID19_US//RSCU.csv")
COVID_TAXONOMY = pd.read_csv("RTG_Data//COVID19_US//TAXONOMY.csv")
COVID_CDS_LENGTH = pd.read_csv("RTG_Data//COVID19_US//CDS_LENGTH.csv")


# Feature sets, optimisation targets and hosts to iterate over. One model is
# loaded per (Dataset, Optimised_Score, Host) combination.
Dataset_List = ['RSCU', 'RSCUpTAXONOMYpCDS_LENGTH']
Optimised_Score_List = ['BA', 'Recall']
Host_List = ['human', 'vertebrates', 'invertebrates', 'land plants', 'bacteria']

for Dataset in Dataset_List:

    # The 'RSCU' models are fed the RSCU table alone; any other dataset name
    # gets RSCU, TAXONOMY and CDS_LENGTH placed side by side.
    Feature_Frames = (
        [COVID_RSCU]
        if Dataset == 'RSCU'
        else [COVID_RSCU, COVID_TAXONOMY, COVID_CDS_LENGTH]
    )
    X = pd.concat(Feature_Frames, axis=1)

    for Optimised_Score in Optimised_Score_List:

        # One output table per (Dataset, Optimised_Score): one column per host
        df_Pred_Proba = pd.DataFrame()

        for Host in Host_List:

            Model_Path = f"Models//{Dataset}_for{Optimised_Score}//RF_model_{Dataset}_{Host}"
            Model = ss.database_load(Model_Path, 'Output')

            # Keep only column 1 of predict_proba (presumably the probability
            # that the sample belongs to this host -- confirm via Model.classes_)
            Pred_Proba = Model.predict_proba(X)
            df_Pred_Proba_t = pd.DataFrame(Pred_Proba)
            df_Pred_Proba[Host] = df_Pred_Proba_t[1]

            # Drop the reference to the model before loading the next one
            Model = None

        Output_Name = f"{Dataset}_COVID19_US_Pred_Proba_Hosts_for{Optimised_Score}"
        ss.save_file(df_Pred_Proba, Output_Name)