# Predicting COVID-19 Inhibitors

In [1]:
!python -m pip install scikit-learn==0.24.0



You should consider upgrading via the 'C:\Users\shkev\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip' command.


In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
from joblib import load, dump
from tqdm.notebook import tqdm

from numpy.random import seed
seed(0)
from tensorflow.random import set_seed
set_seed(0)

## Loading Data

In [2]:
# reading in FASTA files containing the Amino Acid sequences
def read_fasta(file_names):
    """
    Reads FASTA files and returns a pd DataFrame with the protein names and Amino Acid sequences

    Each record starts with a header line of the form ``>ID NAME ...``; every
    following line up to the next header belongs to that record's sequence.
    Lines that appear before the first header of a file are ignored.

    # Params
    file_names: list of fasta file names to read

    # Returns
    df: the DataFrame containing the names and sequences from all inputted files,
        with one row per record and the columns
        'NCBI ID'  - first whitespace-separated token of the header,
        'nsp'      - second token of the header ("" if the header has only one token),
        'sequence' - the record's sequence lines joined without line breaks
    """
    ids = []
    nsp = []
    sequences = []
    for file_name in file_names:
        # lines of the record currently being read; None until the first header
        # so that stray leading lines are not mistaken for a record
        chunks = None
        with open(file_name, "r") as f:
            for line in f:
                # if its reading a comment line extract the name of the target
                if line.startswith('>'):
                    # a new header closes the previous record
                    if chunks is not None:
                        sequences.append("".join(chunks))
                    chunks = []
                    # split() without an argument also drops the trailing newline
                    info = line[1:].split()
                    ids.append(info[0] if info else "")
                    nsp.append(info[1] if len(info) > 1 else "")
                # if its not on a comment line extract the sequence
                elif chunks is not None:
                    chunks.append(line.strip())
        # adding the last sequence of the file (line breaks removed like all others)
        if chunks is not None:
            sequences.append("".join(chunks))
    return pd.DataFrame({'NCBI ID': ids, 'nsp': nsp, 'sequence': sequences})

In [3]:
# Parse the COVID-19 nsp FASTA file into a DataFrame (one row per sequence)
# and preview the first rows.
covid_seqs = read_fasta(['./drug_target_data/covid19_data/covid19_nsps.fasta'])
covid_seqs.head()

Unnamed: 0,NCBI ID,nsp,sequence
0,YP_009725297.1,nsp1,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...
1,YP_009725298.1,nsp2,AYTRYVDNNFCGPDGYPLECIKDLLARAGKASCTLSEQLDFIDTKR...
2,YP_009725299.1,nsp3,APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...
3,YP_009725300.1,nsp4,KIVNNWLKQLIKVTLVFLFVAAIFYLITPVHVMSKHTDFSSEIIGY...
4,YP_009725301.1,nsp5,SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTS...


## Protein Features

### Vectorizing Sequences

In [4]:
# Load the previously saved `acc` vectorizer and apply it to the COVID sequences.
# NOTE(review): joblib.load unpickles the file, so only load model files that
# come from a trusted source.
acc_vec = load('./models/acc_vectorizer.joblib')
acc = acc_vec.transform(covid_seqs['sequence'])

In [5]:
# show the summary (shape, dtype, stored elements) of the `acc` feature matrix
acc

<16x22 sparse matrix of type '<class 'numpy.int64'>'
	with 305 stored elements in Compressed Sparse Row format>

In [6]:
# Load the saved `dc` vectorizer, apply it to the COVID sequences and show the
# summary of the resulting matrix.
dc_vec = load('./models/dc_vectorizer.joblib')
dc = dc_vec.transform(covid_seqs['sequence'])
dc

<16x441 sparse matrix of type '<class 'numpy.int64'>'
	with 3038 stored elements in Compressed Sparse Row format>

In [7]:
# Load the saved `tc` vectorizer, apply it to the COVID sequences and show the
# summary of the resulting matrix.
tc_vec = load('./models/tc_vectorizer.joblib')
tc = tc_vec.transform(covid_seqs['sequence'])
tc

<16x8089 sparse matrix of type '<class 'numpy.int64'>'
	with 6406 stored elements in Compressed Sparse Row format>

In [8]:
# Concatenate the three vectorized feature sets column-wise, in the order acc | dc | tc.
# NOTE(review): this order presumably has to match the layout the saved
# transformers below were fitted on -- TODO confirm.
covid_prot_features = sparse.hstack([acc, dc, tc])

In [9]:
# show the summary of the stacked protein feature matrix
covid_prot_features

<16x8552 sparse matrix of type '<class 'numpy.int64'>'
	with 9749 stored elements in COOrdinate format>

In [10]:
# Apply the saved `prot_var` transformer to the stacked features
# (presumably a fitted variance-based feature filter, judging by the name -- TODO confirm).
# NOTE: `covid_prot_features` is rebound to the transformed matrix, so re-running
# this cell on its own would feed already-transformed features back in.
prot_var = load('./models/prot_var.joblib')
covid_prot_features = prot_var.transform(covid_prot_features)
covid_prot_features

<16x8043 sparse matrix of type '<class 'numpy.int64'>'
	with 9700 stored elements in Compressed Sparse Row format>

In [11]:
# number of COVID proteins = number of rows (sequences) in covid_seqs
num_covid_prot = covid_seqs.shape[0]

### Domain Data

In [12]:
# load the domain hits of the COVID proteins and give the two columns used
# later their working names ('Accession' -> 'domains', 'Query' -> 'NCBI ID')
covid_domain_df = (
    pd.read_csv('./drug_target_data/covid19_data/covid19_nsps_domains.tsv', sep='\t')
    .rename(columns={'Accession': 'domains', 'Query': 'NCBI ID'})
)

# the raw query text holds more than the id: keep only its third
# whitespace-separated token, minus that token's first character
covid_domain_df['NCBI ID'] = [query.split()[2][1:] for query in covid_domain_df['NCBI ID']]

covid_domain_df.head()

Unnamed: 0,NCBI ID,Hit type,PSSM-ID,From,To,E-Value,Bitscore,domains,Short name,Incomplete,Superfamily
0,YP_009725297.1,non-specific,288369,13,127,2.59137e-71,211.067,pfam11501,Nsp1,-,cl13018
1,YP_009725297.1,superfamily,288369,13,127,2.59137e-71,211.067,cl13018,Nsp1 superfamily,-,-
2,YP_009725298.1,specific,394867,2,638,0.0,1142.58,cd21516,cv_beta_Nsp2_SARS-like,-,-
3,YP_009725298.1,non-specific,394866,3,603,2.19831e-174,508.335,cd21515,cv_beta_Nsp2_SARS_MHV-like,-,-
4,YP_009725298.1,non-specific,394864,3,470,1.31198e-97,304.519,cd21511,cv-alpha_beta_Nsp2-like,-,-


In [13]:
# reading domain info for original proteins
# The hits are spread over several files; read them all and concatenate once.
# (DataFrame.append was removed in pandas 2.0 and re-copies the accumulated
# frame on every call; a single pd.concat gives the same result.)
domain_files = [
    './drug_target_data/protein_domains/drugbank_target_domains1.tsv',
    './drug_target_data/protein_domains/drugbank_target_domains2.tsv',
    './drug_target_data/protein_domains/drugbank_transporter_domains.tsv',
    './drug_target_data/protein_domains/drugbank_enzyme_domains.tsv',
    './drug_target_data/protein_domains/drugbank_carrier_domains.tsv',
]
domain_df = pd.concat(
    [pd.read_csv(path, sep='\t') for path in domain_files],
    ignore_index=True,
)

# working column names: 'Accession' -> 'domains', 'Query' -> 'UniProt ID'
domain_df = domain_df.rename(columns={'Accession': 'domains', 'Query': 'UniProt ID'})

# removing all info except uniprot id from first column
# (the id is the fourth whitespace-separated token of the query text)
domain_df['UniProt ID'] = domain_df['UniProt ID'].apply(lambda x: x.split()[3])

domain_df.head()

Unnamed: 0,UniProt ID,Hit type,PSSM-ID,From,To,E-Value,Bitscore,domains,Short name,Incomplete,Superfamily
0,P45059,non-specific,185060,25,584,0.0,688.094,PRK15105,PRK15105,-,cl33083
1,P45059,superfamily,185060,25,584,0.0,688.094,cl33083,PRK15105 superfamily,-,-
2,P45059,specific,223839,28,585,9.30512e-168,490.767,COG0768,FtsI,-,cl34037
3,P45059,superfamily,223839,28,585,9.30512e-168,490.767,cl34037,FtsI superfamily,-,-
4,P45059,non-specific,131269,44,588,4.29878e-110,343.718,TIGR02214,spoVD_pbp,-,cl31183


In [14]:
# Load the protein adjacency matrix of the original proteins; its columns after
# the first one are reused below as the column layout of the COVID adjacency matrix.
adj = pd.read_csv('./drug_target_data/protein_adjacency_matrix.csv')

Creating a dictionary whose keys are the NCBI IDs of the COVID proteins and whose values are lists of the unique domains found for each protein.

In [15]:
# dict containing the unique domains for each protein
covid_protein_domains = dict()

# iterating through data and adding all domains of a single protein to the dict.
# Domains are collected per NCBI ID in a set, so the result does not depend on
# a protein's rows being contiguous, and an empty table simply gives an empty dict.
for ncbi_id, domain in tqdm(zip(covid_domain_df['NCBI ID'], covid_domain_df['domains']),
                            total=len(covid_domain_df)):
    covid_protein_domains.setdefault(ncbi_id, set()).add(domain)

# only keeping unique domains for each protein, stored as lists
covid_protein_domains = {ncbi_id: list(domains) for ncbi_id, domains in covid_protein_domains.items()}

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [16]:
# dict containing the unique domains for each original (UniProt) protein
protein_domains = dict()

# iterating through data and adding all domains of a single protein to the dict.
# domain_df is concatenated from several files, so the rows of one protein are
# not guaranteed to be contiguous; collecting into a set per UniProt ID keeps
# every domain no matter where its row is, and also copes with an empty table.
for uniprot_id, domain in tqdm(zip(domain_df['UniProt ID'], domain_df['domains']),
                               total=len(domain_df)):
    protein_domains.setdefault(uniprot_id, set()).add(domain)

# only keeping unique domains for each protein, stored as lists
protein_domains = {uniprot_id: list(domains) for uniprot_id, domains in protein_domains.items()}

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [17]:
# empty adjacency matrix: one row per COVID protein (index 'NCBI ID'), one
# zero-filled int32 column per column of `adj` except its first one
covid_adj = pd.DataFrame(
    np.zeros((num_covid_prot, adj.shape[1] - 1), dtype=np.int32),
    index=pd.Index(list(covid_seqs['NCBI ID']), name='NCBI ID'),
    columns=list(adj.columns[1:]),
)
covid_adj.head()

Unnamed: 0_level_0,Q8II92,P08337,O70038,P05091,P07024,Q8WWQ8,P32396,Q9BX79,P31153,P16070,...,P11233,P95780,Q07817,P14090,Q9RHZ6,Q9NNW7,P0AEG4,P04070,P50914,Q9BY07
NCBI ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YP_009725297.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YP_009725298.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YP_009725299.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YP_009725300.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YP_009725301.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# creating adjacency matrix
for k, v in tqdm(covid_protein_domains.items()):
    # Assigning through .loc with an unknown row or column label would silently
    # enlarge covid_adj, so it would no longer line up with covid_seqs / `adj`.
    # Labels that are not part of the matrix are therefore skipped.
    if k not in covid_adj.index:
        continue
    # build the COVID protein's domain set once instead of once per comparison
    covid_domains = set(v)
    # proteins that have at least 1 domain in common with covid protein k
    prot = [
        key for key, value in protein_domains.items()
        if key in covid_adj.columns and not covid_domains.isdisjoint(value)
    ]
    if prot:
        covid_adj.loc[k, prot] = 1

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [19]:
# the number of similar proteins for each covid protein
# (row sums of the 0/1 adjacency matrix)
covid_adj.sum(axis=1)

NCBI ID
YP_009725297.1      3
YP_009725298.1      3
YP_009725299.1    178
YP_009725300.1      3
YP_009725301.1      3
YP_009725302.1      3
YP_009725303.1      3
YP_009725304.1      5
YP_009725305.1      3
YP_009725306.1      3
YP_009725312.1      0
YP_009725307.1      1
YP_009725308.1    186
YP_009725309.1      1
YP_009725310.1      1
YP_009725311.1      4
dtype: int64

In [20]:
# adding domain features to vectorized features: the adjacency matrix is
# appended as extra columns to the right of the sequence-derived features.
# NOTE: rebinds `covid_prot_features`, so re-running this cell on its own
# appends the adjacency columns a second time.
covid_prot_features = sparse.hstack([covid_prot_features, sparse.coo_matrix(covid_adj)])
covid_prot_features

<16x13196 sparse matrix of type '<class 'numpy.int64'>'
	with 10100 stored elements in COOrdinate format>

## Drug Features

In [21]:
# Load the drug descriptor table without its 'SMILES' column.
# `pop` removes the 'Name' column from `desc` and keeps it aside in `drug_names`
# (it is added back further down when predictions are labelled).
desc = pd.read_csv("./drug_target_data/drug_descriptors/approved_drug_descriptors.csv").drop(columns=['SMILES'])
drug_names = desc.pop('Name')

In [22]:
# show the descriptor table (pandas truncates the display of large frames)
desc

Unnamed: 0,DrugBank ID,ALogPS_logP,ALogPS_logS,nA:(CDK2),nR:(CDK2),nN:(CDK2),nD:(CDK2),nC:(CDK2),nF:(CDK2),nQ:(CDK2),...,SYMS4:(Mersy),SYMS5X:(Mersy),SYMS5Y:(Mersy),SYMS5Z:(Mersy),SYMS5:(Mersy),SYMS6X:(Mersy),SYMS6Y:(Mersy),SYMS6Z:(Mersy),SYMS6:(Mersy),CHIR:(Mersy)
0,DB00007,1.04,-4.55,9.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.300,0.204,0.260,0.402,0.294,0.214,0.260,0.401,0.297,0.652
1,DB00014,0.30,-4.65,9.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.283,0.247,0.246,0.339,0.279,0.255,0.260,0.340,0.286,0.648
2,DB00035,-1.01,-3.99,7.0,1.0,1.0,0.0,1.0,4.0,1.0,...,0.297,0.272,0.314,0.305,0.297,0.272,0.308,0.309,0.296,0.670
3,DB00091,4.12,-5.09,10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.448,0.453,0.503,0.374,0.446,0.454,0.529,0.388,0.460,0.489
4,DB00104,0.42,-4.93,7.0,0.0,0.0,0.0,2.0,4.0,0.0,...,0.348,0.350,0.364,0.323,0.346,0.362,0.353,0.326,0.347,0.610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2299,DB15598,-0.24,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.706,0.781,0.605,0.655,0.690,0.792,0.617,0.672,0.703,0.210
2300,DB15617,-3.17,-0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.451,0.483,0.392,0.481,0.454,0.493,0.397,0.482,0.459,0.504
2301,DB15678,3.84,-3.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.676,0.585,0.566,0.756,0.647,0.601,0.582,0.766,0.661,0.237
2302,DB15685,3.03,-4.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.326,0.315,0.258,0.407,0.330,0.294,0.260,0.398,0.320,0.624


In [23]:
# Load the saved drug scaler and `drug_var` transformer.
scaler = load('models/drug_scaler.joblib')
drug_var = load('models/drug_var.joblib')

# Apply them in that order to every column of `desc` except the first one
# (the 'DrugBank ID' column in the table displayed above), then show the shape.
drug_desc = scaler.transform(desc.iloc[:, 1:])
drug_desc = drug_var.transform(drug_desc)
drug_desc.shape

(2304, 6388)

## Applying Lasso Feature Selection Models

In [24]:
# Load the saved lasso-based feature selectors for the protein and drug features.
prot_feat_selector = load('models/prot_lasso_feature_selector.joblib')
drug_feat_selector = load('models/drug_lasso_feature_selector.joblib')



In [25]:
# the loaded selectors won't work for some reason. This is the workaround:
# each selector's already-fitted estimator is wrapped in a fresh prefit
# SelectFromModel that carries the selection threshold.

from sklearn.feature_selection import SelectFromModel

prot_feat_selector = SelectFromModel(prot_feat_selector.estimator_, prefit=True, threshold=1e-20)
drug_feat_selector = SelectFromModel(drug_feat_selector.estimator_, prefit=True, threshold=1e-3)

# echo the rebuilt drug selector as the cell output
drug_feat_selector

SelectFromModel(estimator=Lasso(alpha=0.0001), prefit=True, threshold=0.001)

In [26]:
# selecting features with the lasso models
covid_prot_features = prot_feat_selector.transform(covid_prot_features)
drug_desc = drug_feat_selector.transform(drug_desc)

# adding the drug and protein ids back to the datasets as the first column.
# The selected features become DataFrames with integer column labels.
# NOTE: both names are rebound and `insert` modifies the frame in place, so
# this cell is not safe to re-run on its own.
covid_prot_features = pd.DataFrame.sparse.from_spmatrix(covid_prot_features)
covid_prot_features.insert(0, 'NCBI ID', covid_seqs['NCBI ID'])
drug_desc = pd.DataFrame(drug_desc)
drug_desc.insert(0, 'Drug ID', desc['DrugBank ID'])

In [27]:
# show the selected drug features together with their 'Drug ID' column
drug_desc

Unnamed: 0,Drug ID,0,1,2,3,4,5,6,7,8,...,1810,1811,1812,1813,1814,1815,1816,1817,1818,1819
0,DB00007,-1.178595,1.954138,1.977584,-5.621852,10.642382,11.499463,2.404775,-0.127535,-8.056285,...,0.64281,-0.423097,0.793447,-0.292313,6.572757,9.520198,-0.212083,-0.227748,-0.193224,-0.134968
1,DB00014,-1.918595,1.954138,1.977584,-6.951852,23.642382,11.499463,2.404775,-0.127535,-8.056285,...,0.64881,-0.383097,1.793447,-0.242313,7.592757,11.050198,-0.214083,-0.244748,-0.172224,-0.197968
2,DB00035,-3.228595,3.954138,1.977584,-6.891852,23.042382,3.499463,2.404775,-0.127535,7.943715,...,0.70081,0.126903,0.793447,0.267687,3.252757,4.640198,-0.206083,-0.238748,-0.218224,-0.231968
3,DB00091,1.901405,-0.045862,-0.022416,-4.481852,2.182382,-8.500537,-0.595225,-0.227535,-8.056285,...,0.22881,-0.953097,1.793447,-0.882313,5.682757,8.380198,-0.121083,-0.052748,0.021776,-0.162968
4,DB00104,-1.798595,3.954138,-0.022416,-3.971852,-0.767618,12.499463,1.404775,-0.127535,7.943715,...,0.17081,-1.114097,0.793447,-1.092313,3.622757,5.190198,-0.185083,-0.197748,-0.201224,-0.213968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2299,DB15598,-2.458595,-0.045862,-0.022416,-0.141852,-6.077618,-2.500537,-0.595225,-0.127535,-8.056285,...,-0.22209,-1.628097,-0.206553,-1.698313,0.888757,1.216198,0.318917,0.268252,0.205776,0.118032
2300,DB15617,-5.388595,-0.045862,-0.022416,-7.791852,33.842382,-8.500537,-0.595225,-0.127535,-8.056285,...,0.25081,0.266903,0.793447,0.337687,1.930757,2.730198,0.049917,-0.031748,-0.136224,-0.055968
2301,DB15678,1.621405,-0.045862,-0.022416,-2.331852,-6.616618,-8.500537,-0.595225,-0.327535,-8.056285,...,-0.26539,-0.953097,-0.206553,-0.992313,0.758757,1.050198,0.252917,0.070252,0.254776,0.219032
2302,DB15685,0.811405,-0.045862,-0.022416,1.108148,-1.477618,12.499463,0.404775,-0.227535,-8.056285,...,0.69381,-0.073097,0.793447,-0.032313,1.758757,2.452198,-0.234083,-0.195748,-0.114224,-0.129968


## Running Predictions

First we need to create a list of all possible drug–target interactions (DTIs) between the DrugBank drugs and the COVID-19 proteins. Then we will use the trained ensemble model to predict which of these drug–protein pairs are most likely to interact.

In [28]:
# every possible (protein id, drug id) pair, grouped by protein in the row
# order of covid_prot_features; the progress bar advances once per protein
dtis = [
    (protein_id, drug_id)
    for protein_id in tqdm(covid_prot_features['NCBI ID'])
    for drug_id in drug_desc['Drug ID']
]

print(f"Created {len(dtis)} possible DTI combinations")

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))


Created 36864 possible DTI combinations


In [29]:
# one row per candidate DTI: start from the id pairs, then attach the drug
# features (by 'Drug ID') followed by the protein features (by 'NCBI ID').
# Both feature tables use integer column labels, so labels present in both
# come out of the second merge with the suffixes _x (drug) and _y (protein).
dti_df = (
    pd.DataFrame({
        'NCBI ID': [pair[0] for pair in dtis],
        'Drug ID': [pair[1] for pair in dtis],
    })
    .merge(drug_desc, how='inner', on=['Drug ID'])
    .merge(covid_prot_features, how='inner', on=['NCBI ID'])
)

In [30]:
# show the combined table of candidate DTIs and their features
dti_df

Unnamed: 0,NCBI ID,Drug ID,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,...,1218_y,1219_y,1220_y,1221_y,1222_y,1223_y,1224_y,1225_y,1226_y,1227_y
0,YP_009725297.1,DB00007,-1.178595,1.954138,1.977584,-5.621852,10.642382,11.499463,2.404775,-0.127535,...,0,0,0,0,0,0,0,0,0,0
1,YP_009725297.1,DB00014,-1.918595,1.954138,1.977584,-6.951852,23.642382,11.499463,2.404775,-0.127535,...,0,0,0,0,0,0,0,0,0,0
2,YP_009725297.1,DB00035,-3.228595,3.954138,1.977584,-6.891852,23.042382,3.499463,2.404775,-0.127535,...,0,0,0,0,0,0,0,0,0,0
3,YP_009725297.1,DB00091,1.901405,-0.045862,-0.022416,-4.481852,2.182382,-8.500537,-0.595225,-0.227535,...,0,0,0,0,0,0,0,0,0,0
4,YP_009725297.1,DB00104,-1.798595,3.954138,-0.022416,-3.971852,-0.767618,12.499463,1.404775,-0.127535,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36859,YP_009725311.1,DB15598,-2.458595,-0.045862,-0.022416,-0.141852,-6.077618,-2.500537,-0.595225,-0.127535,...,0,0,0,0,0,0,0,0,0,1
36860,YP_009725311.1,DB15617,-5.388595,-0.045862,-0.022416,-7.791852,33.842382,-8.500537,-0.595225,-0.127535,...,0,0,0,0,0,0,0,0,0,1
36861,YP_009725311.1,DB15678,1.621405,-0.045862,-0.022416,-2.331852,-6.616618,-8.500537,-0.595225,-0.327535,...,0,0,0,0,0,0,0,0,0,1
36862,YP_009725311.1,DB15685,0.811405,-0.045862,-0.022416,1.108148,-1.477618,12.499463,0.404775,-0.227535,...,0,0,0,0,0,0,0,0,0,1


### Machine Learning Predictions

In [30]:
from tensorflow import keras

In [31]:
# Load the two saved models: the random forest (joblib file) and the CNN
# (loaded through Keras from the 'models/model2CNN' path).
forest = load('models/random_forest.joblib')
cnn = keras.models.load_model('models/model2CNN')



In [32]:
# model inputs: every column of dti_df after the first two
# (the 'NCBI ID' and 'Drug ID' columns are left out)
feat = dti_df.iloc[:, 2:]
feat.head()

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,1218_y,1219_y,1220_y,1221_y,1222_y,1223_y,1224_y,1225_y,1226_y,1227_y
0,-1.178595,1.954138,1.977584,-5.621852,10.642382,11.499463,2.404775,-0.127535,-8.056285,0.606547,...,0,0,0,0,0,0,0,0,0,0
1,-1.918595,1.954138,1.977584,-6.951852,23.642382,11.499463,2.404775,-0.127535,-8.056285,0.606547,...,0,0,0,0,0,0,0,0,0,0
2,-3.228595,3.954138,1.977584,-6.891852,23.042382,3.499463,2.404775,-0.127535,7.943715,1.706547,...,0,0,0,0,0,0,0,0,0,0
3,1.901405,-0.045862,-0.022416,-4.481852,2.182382,-8.500537,-0.595225,-0.227535,-8.056285,1.306547,...,0,0,0,0,0,0,0,0,0,0
4,-1.798595,3.954138,-0.022416,-3.971852,-0.767618,12.499463,1.404775,-0.127535,7.943715,1.406547,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# probability from column 1 of predict_proba (presumably the "interacts"
# class -- TODO confirm), kept as a column vector with one row per DTI
forest_pred = forest.predict_proba(feat)[:, 1].reshape(-1, 1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [34]:
def reshape_cnn_data(ds):
    """
    Converts a 2-D feature table into the 3-D array that is passed to the CNN

    # Params
    ds: DataFrame of shape (n_samples, n_features)

    # Returns
    numpy array of shape (n_samples, n_features, 1)
    """
    return ds.to_numpy().reshape((ds.shape[0], ds.shape[1], 1))


# `use_multiprocessing` only applies to generator / keras.utils.Sequence inputs
# and is no longer accepted by Model.predict in Keras 3, so it is not passed
# for this in-memory array.
cnn_pred = cnn.predict(reshape_cnn_data(feat), verbose=1)



In [35]:
# element-wise mean of the two models' predictions, flattened from an
# (n, 1) column to a 1-D array with one value per DTI
avg = np.mean([forest_pred, cnn_pred], axis=0)[:, 0]

Forest predictions

In [51]:
# ids of the DTIs whose random-forest probability is at least 0.90
forest_covid_pred = dti_df.loc[forest_pred.ravel() >= 0.90, ['NCBI ID', 'Drug ID']]

In [59]:
# adding drug names to descriptors so the predicted drugs can be identified by name
# ('Name' was popped out of `desc` into `drug_names` when the descriptors were loaded)
desc['Name'] = drug_names

# adding drug names to predictions: each 'Drug ID' is matched against the
# 'DrugBank ID' column of `desc`, and the duplicate key column is dropped afterwards
forest_covid_pred = forest_covid_pred.merge(desc[['Name', 'DrugBank ID']], how='inner', left_on='Drug ID', right_on='DrugBank ID').drop(columns=['DrugBank ID'])

In [60]:
# the distinct drugs among the random-forest predictions
set(forest_covid_pred['Drug ID'])

{'DB00143', 'DB00157', 'DB01133', 'DB03147', 'DB03247', 'DB12010'}

In [68]:
# all predicted interactions of a single drug (DB12010)
forest_covid_pred.loc[forest_covid_pred['Drug ID'] == 'DB12010']

Unnamed: 0,NCBI ID,Drug ID,Name
11,YP_009725297.1,DB12010,fostamatinib
12,YP_009725303.1,DB12010,fostamatinib
13,YP_009725304.1,DB12010,fostamatinib
14,YP_009725305.1,DB12010,fostamatinib
15,YP_009725306.1,DB12010,fostamatinib


Using the CNN predictions.

In [75]:
# ids of the DTIs whose CNN prediction is at least 0.97
covid_pred = dti_df.loc[cnn_pred.ravel() >= 0.97, ['NCBI ID', 'Drug ID']]

In [76]:
# adding drug names to predictions
# NOTE: relies on desc['Name'], which is only put back into `desc` by the
# forest-prediction cells above -- those must have been run first.
covid_pred = covid_pred.merge(desc[['Name', 'DrugBank ID']], how='inner', left_on='Drug ID', right_on='DrugBank ID').drop(columns=['DrugBank ID'])

In [77]:
# the distinct (Drug ID, Name) pairs among the CNN predictions, and how many there are
covid_drugs = covid_pred[['Drug ID', 'Name']].drop_duplicates()
len(covid_drugs)

29

In [78]:
# writing drugs to file (tab-separated, without the index column)
# NOTE(review): assumes the 'results' directory already exists -- nothing in
# the cells shown here creates it.
covid_drugs.to_csv('results/drugs_thresh97.tsv', sep='\t', index=False)

In [79]:
# adding protein names to dtis
covid_pred = covid_pred.merge(covid_seqs[['NCBI ID', 'nsp']], how='inner', on='NCBI ID')

# writing dtis to file
covid_pred.drop(columns=['NCBI ID']).to_csv('results/dtis_thresh97.tsv', sep='\t', index=False)