In [1]:
import requests
import xmltodict
from rdkit import Chem
from Bio import SeqIO
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import random

In [23]:
# larger storage locations
larg_stor = "/media/biofarmaka/545ed5cd-03d4-4e08-bd8d-f43a31b40413/download/rom"

# Data Acquisitions

In [3]:
# read protein data
proteins = pd.read_csv("dataset/omni_prot.csv")
proteins

Unnamed: 0,uniprot_id,protein_name,gene_symbol
0,Q7L5D6,guided entry of tail-anchored proteins factor 4,GET4
1,Q9HA92,radical S-adenosyl methionine domain containing 1,RSAD1
2,Q9P2J8,zinc finger protein 624,ZNF624
3,P21359,neurofibromin 1,NF1
4,Q6ZSC3,RNA binding motif protein 43,RBM43
...,...,...,...
7094,Q9Y6F6,IRAG1 antisense RNA 1,IRAG1
7095,P10914,IRF1 antisense RNA 1,IRF1
7096,Q8TDW7,Fat3,FAT3
7097,P20823,HNF1A antisense RNA 1,HNF1A


In [4]:
# remove duplicates
proteins.drop_duplicates(subset=["uniprot_id"], inplace=True)
proteins.reset_index(drop=True, inplace=True)
proteins

Unnamed: 0,uniprot_id,protein_name,gene_symbol
0,Q7L5D6,guided entry of tail-anchored proteins factor 4,GET4
1,Q9HA92,radical S-adenosyl methionine domain containing 1,RSAD1
2,Q9P2J8,zinc finger protein 624,ZNF624
3,P21359,neurofibromin 1,NF1
4,Q6ZSC3,RNA binding motif protein 43,RBM43
...,...,...,...
5863,Q9Y6F6,IRAG1 antisense RNA 1,IRAG1
5864,P10914,IRF1 antisense RNA 1,IRF1
5865,Q8TDW7,Fat3,FAT3
5866,P20823,HNF1A antisense RNA 1,HNF1A


In [5]:
# list of uniprot ids
uni_ids = proteins["uniprot_id"].tolist()
len(uni_ids)

5868

In [6]:
# function to get interactions data from BDB
def get_interactions_frm_bdb(uniprot_ids:list, affinity_threshold:int)->pd.DataFrame:
    
    """
    this function performs interactions data retrieval from BindingDB web services
    
    INPUT:
    uniprot_ids: list of uniprot id of proteins that you want to search the interactions
    affinity_threshold: threshold value used to cut off results based on binding affinity score
    
    OUTPUT:
    a Pandas Dataframe object consists of:
    - protein uniprot id
    - canonical SMILES of corresponding compound
    - isomeric SMILES of corresponding compound
    - binding affinity score type
    - binding affinity scores
    
    """
    
    protein_id = []
    canonical_smiles = []
    aff_types = []
    affinities = []
    iso_smiles = []
    
    for uni_id in tqdm(uniprot_ids, desc="protein loop"):
        
        url = f"https://bindingdb.org/axis2/services/BDBService/getLigandsByUniprot?uniprot={uni_id};{affinity_threshold}"
                
        try:
            
            response = requests.get(url)
        
            xml_response = xmltodict.parse(response.content)
            
            response_length = len(xml_response["bdb:getLigandsByUniprotResponse"]["bdb:affinities"])

            if response_length != 0:

                for i in range(response_length):

                    bdb_smiles = xml_response["bdb:getLigandsByUniprotResponse"]["bdb:affinities"][i]["bdb:smiles"].split("|")[0].replace(" ", "")
                    iso_smiles.append(bdb_smiles)
                    
                    try:
                        mol = Chem.MolFromSmiles(bdb_smiles)                        
                        can_smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
                        canonical_smiles.append(can_smiles)
                    except:
                        canonical_smiles.append(np.NaN)

                    
                    aff_type = xml_response["bdb:getLigandsByUniprotResponse"]["bdb:affinities"][i]["bdb:affinity_type"]
                    aff_types.append(aff_type)

                    affinity = xml_response["bdb:getLigandsByUniprotResponse"]["bdb:affinities"][i]["bdb:affinity"]
                    affinities.append(affinity)

                    protein_id.append(uni_id)

            else:
                print("affinities are of 0 length")
                canonical_smiles.append(np.NaN)
                aff_types.append(np.NaN)
                affinities.append(np.NaN)
                protein_id.append(uni_id)
                iso_smiles.append(np.NaN)
            
        except (KeyError, IndexError):
            
#             print("No Hit!")
            canonical_smiles.append(np.NaN)
            aff_types.append(np.NaN)
            affinities.append(np.NaN)
            protein_id.append(uni_id)
            iso_smiles.append(np.NaN)
        
        except requests.exceptions.HTTPError as e:
            
            print(e)
            canonical_smiles.append(np.NaN)
            aff_types.append(np.NaN)
            affinities.append(np.NaN)
            protein_id.append(uni_id)
            iso_smiles.append(np.NaN)
            
        except Exception as e:
            
            print(e)
            canonical_smiles.append(np.NaN)
            aff_types.append(np.NaN)
            affinities.append(np.NaN)
            protein_id.append(uni_id)
            iso_smiles.append(np.NaN)
                
    interactions_data = {
        "uniprot_id": protein_id,
        "canonical_smiles": canonical_smiles,
        "isomeric_smiles": iso_smiles,
        "affinity_type": aff_types,
        "affinity_score": affinities,
    }
    
    return interactions_data

In [9]:
# run data acquisition
# affinity threshold = 10 micro Molar

interactions_dict = get_interactions_frm_bdb(uniprot_ids=uni_ids, affinity_threshold=10)

protein loop:   0%|          | 0/5868 [00:00<?, ?it/s]

RDKit ERROR: [12:34:37] Explicit valence for atom # 2 N, 4, is greater than permitted
RDKit ERROR: [12:36:06] SMILES Parse Error: syntax error while parsing: [#6]-[#8]-c1ccc(cc1-[#7]-[#6](=O)-[#7]-c1ccc(-[#8]-[#6]-[#6]-[#7]-2-[#6]-[#6]-[#8]-[#6]-[#6]-2)c2ccccc12)[Si;v4]([#6])([#6])[#6]
RDKit ERROR: [12:36:06] SMILES Parse Error: Failed parsing SMILES '[#6]-[#8]-c1ccc(cc1-[#7]-[#6](=O)-[#7]-c1ccc(-[#8]-[#6]-[#6]-[#7]-2-[#6]-[#6]-[#8]-[#6]-[#6]-2)c2ccccc12)[Si;v4]([#6])([#6])[#6]' for input: '[#6]-[#8]-c1ccc(cc1-[#7]-[#6](=O)-[#7]-c1ccc(-[#8]-[#6]-[#6]-[#7]-2-[#6]-[#6]-[#8]-[#6]-[#6]-2)c2ccccc12)[Si;v4]([#6])([#6])[#6]'
RDKit ERROR: [12:36:06] SMILES Parse Error: syntax error while parsing: [#6]-c1ccc(cc1)-n1nc(cc1-[#7]-[#6](=O)-[#7]-c1ccc(-[#8]-[#6]-[#6]-[#7]-2-[#6]-[#6]-[#8]-[#6]-[#6]-2)c2ccccc12)[Si;v4]([#6])([#6])[#6]
RDKit ERROR: [12:36:06] SMILES Parse Error: Failed parsing SMILES '[#6]-c1ccc(cc1)-n1nc(cc1-[#7]-[#6](=O)-[#7]-c1ccc(-[#8]-[#6]-[#6]-[#7]-2-[#6]-[#6]-[#8]-[#6]-[#6]-2)c

RDKit ERROR: [12:39:47] SMILES Parse Error: syntax error while parsing: [#6]-c1ccc([Te;v2][#6]-c2ccc(cc2)S([#7])(=O)=O)cc1
RDKit ERROR: [12:39:47] SMILES Parse Error: Failed parsing SMILES '[#6]-c1ccc([Te;v2][#6]-c2ccc(cc2)S([#7])(=O)=O)cc1' for input: '[#6]-c1ccc([Te;v2][#6]-c2ccc(cc2)S([#7])(=O)=O)cc1'
RDKit ERROR: [12:39:47] SMILES Parse Error: syntax error while parsing: [#6]-[#8]-c1ccc([Te;v2][#6]-c2ccc(cc2)S([#7])(=O)=O)cc1
RDKit ERROR: [12:39:47] SMILES Parse Error: Failed parsing SMILES '[#6]-[#8]-c1ccc([Te;v2][#6]-c2ccc(cc2)S([#7])(=O)=O)cc1' for input: '[#6]-[#8]-c1ccc([Te;v2][#6]-c2ccc(cc2)S([#7])(=O)=O)cc1'
RDKit ERROR: [12:39:47] SMILES Parse Error: syntax error while parsing: [#6]-c1ccccc1[Te;v2][#6]-c1ccc(cc1)S([#7])(=O)=O
RDKit ERROR: [12:39:47] SMILES Parse Error: Failed parsing SMILES '[#6]-c1ccccc1[Te;v2][#6]-c1ccc(cc1)S([#7])(=O)=O' for input: '[#6]-c1ccccc1[Te;v2][#6]-c1ccc(cc1)S([#7])(=O)=O'
RDKit ERROR: [12:39:47] SMILES Parse Error: syntax error while parsing: [

RDKit ERROR: [12:40:36] SMILES Parse Error: syntax error while parsing: [#7]-[#6]-[#6]-[#6]-[#6]-[#6@@H](-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6]-[#6]-[#6]-[#6]-[#6]-[#7]-[#6](=O)-[#6]-[#6]-[#6](=O)-[#7]-c1ncc([se;v2]1)-c1ccc(cc1)-[#6](-[#8])=O)-[#6](-[#8])=O
RDKit ERROR: [12:40:36] SMILES Parse Error: Failed parsing SMILES '[#7]-[#6]-[#6]-[#6]-[#6]-[#6@@H](-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6@@H](-[#6]-[#6](-[#8])=O)-[#7]-[#6](=O)-[#6]-[#6]-[#6]-[#6]-[#6]-[#7]-[#6](=O)-[#6]-[#6]-[#6](=O)-[#7]-c1ncc([se;v2]1)-c1ccc(cc1)-[#6](-[#8])=O)-[#6](-[#8])=O' for input:

RDKit ERROR: 
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulized atoms: 1 3 6
RDKit ERROR: 
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulized atoms: 1 3 6
RDKit ERROR: 
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulized atoms: 1 3 6
RDKit ERROR: 
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulized atoms: 2 4 7
RDKit ERROR: 
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulized atoms: 1 3 6
RDKit ERROR: 
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulized atoms: 1 3 6
RDKit ERROR: 
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulized atoms: 1 3 6
RDKit ERROR: 
RDKit ERROR: [12:49:48] Explicit valence for atom # 3 F, 3, is greater than permitted
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulized atoms: 34 35 38
RDKit ERROR: 
RDKit ERROR: [12:49:48] Explicit valence for atom # 4 N, 4, is greater than permitted
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulized atoms: 1 3 6
RDKit ERROR: 
RDKit ERROR: [12:49:48] Can't kekulize mol.  Unkekulize

RDKit ERROR: [13:24:08] SMILES Parse Error: syntax error while parsing: [#7]S(=O)(=O)c1ccc(-[#6][Se;v2]c2ccc(-[#7]-[#6](=O)-[#6]-[#7]-3-[#6](=O)-[#7]C4([#6]-[#6]-[#6]-[#6]-[#6]4)[#6]-3=O)cc2)cc1
RDKit ERROR: [13:24:08] SMILES Parse Error: Failed parsing SMILES '[#7]S(=O)(=O)c1ccc(-[#6][Se;v2]c2ccc(-[#7]-[#6](=O)-[#6]-[#7]-3-[#6](=O)-[#7]C4([#6]-[#6]-[#6]-[#6]-[#6]4)[#6]-3=O)cc2)cc1' for input: '[#7]S(=O)(=O)c1ccc(-[#6][Se;v2]c2ccc(-[#7]-[#6](=O)-[#6]-[#7]-3-[#6](=O)-[#7]C4([#6]-[#6]-[#6]-[#6]-[#6]4)[#6]-3=O)cc2)cc1'
RDKit ERROR: [13:24:08] SMILES Parse Error: syntax error while parsing: [#6]-[#6]-1-[#6]-[#6]-[#6]C2([#6]-1)[#7]-[#6](=O)-[#7](-[#6]-[#6](=O)-[#7]-c1ccc([Se;v2][#6]-c3ccc(cc3)S([#7])(=O)=O)cc1)-[#6]2=O
RDKit ERROR: [13:24:08] SMILES Parse Error: Failed parsing SMILES '[#6]-[#6]-1-[#6]-[#6]-[#6]C2([#6]-1)[#7]-[#6](=O)-[#7](-[#6]-[#6](=O)-[#7]-c1ccc([Se;v2][#6]-c3ccc(cc3)S([#7])(=O)=O)cc1)-[#6]2=O' for input: '[#6]-[#6]-1-[#6]-[#6]-[#6]C2([#6]-1)[#7]-[#6](=O)-[#7](-[#6]-[#6](

RDKit ERROR: [13:25:48] Can't kekulize mol.  Unkekulized atoms: 40 41 42 43 44
RDKit ERROR: 
RDKit ERROR: [13:25:48] Can't kekulize mol.  Unkekulized atoms: 29 30 32 33 35
RDKit ERROR: 
RDKit ERROR: [13:25:48] Can't kekulize mol.  Unkekulized atoms: 17 18 20 21 23
RDKit ERROR: 
RDKit ERROR: [13:28:07] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 7 9 10
RDKit ERROR: 
RDKit ERROR: [13:28:07] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [13:28:07] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 9 10 11 12 26 27 28
RDKit ERROR: 
RDKit ERROR: [13:28:07] Can't kekulize mol.  Unkekulized atoms: 15 16 17 19 20 21 22 23 24
RDKit ERROR: 
RDKit ERROR: [13:28:14] Explicit valence for atom # 2 N, 4, is greater than permitted
RDKit ERROR: [13:32:09] Explicit valence for atom # 19 C, 6, is greater than permitted
RDKit ERROR: [13:32:09] Explicit valence for atom # 4 C, 6, is greater than permitted
RDKit ERROR: [13:32:09] Explicit valence for atom # 8 C, 6, is greater t

RDKit ERROR: [13:59:01] Explicit valence for atom # 29 F, 4, is greater than permitted
RDKit ERROR: [13:59:54] Explicit valence for atom # 17 N, 4, is greater than permitted
RDKit ERROR: [14:00:08] Explicit valence for atom # 26 N, 4, is greater than permitted
RDKit ERROR: [14:01:15] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12
RDKit ERROR: 
RDKit ERROR: [14:01:15] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13
RDKit ERROR: 
RDKit ERROR: [14:01:15] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12
RDKit ERROR: 
RDKit ERROR: [14:01:15] Can't kekulize mol.  Unkekulized atoms: 8 9 10 14 15
RDKit ERROR: 
RDKit ERROR: [14:01:15] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12
RDKit ERROR: 
RDKit ERROR: [14:01:15] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13
RDKit ERROR: 
RDKit ERROR: [14:01:15] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13
RDKit ERROR: 
RDKit ERROR: [14:01:15] Can't kekulize mol.  Unkekulized atoms: 7 8 9 10 11
RDKit ERROR: 
RDKit ERR

RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
RDKit ERROR: 
RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
RDKit ERROR: 
RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
RDKit ERROR: 
RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
RDKit ERROR: 
RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 25 26 27 28 29 30 32 33 34
RDKit ERROR: 
RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
RDKit ERROR: 
RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
RDKit ERROR: 
RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
RDKit ERROR: 
RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 25 26 27 28 29 30 32 33 34
RDKit ERROR: 
RDKit ERROR: [14:21:34] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 8 9
RDKit ERROR: 


In [10]:
interactions_df = pd.DataFrame(interactions_dict)
print(interactions_df.shape)

(145199, 5)


In [11]:
interactions_df.isna().sum()

uniprot_id             1
canonical_smiles    5781
isomeric_smiles     5318
affinity_type       5318
affinity_score      5318
dtype: int64

In [12]:
interactions_df.dropna(subset=["uniprot_id", "isomeric_smiles"], inplace=True)
interactions_df.reset_index(drop=True, inplace=True)
interactions_df

Unnamed: 0,uniprot_id,canonical_smiles,isomeric_smiles,affinity_type,affinity_score
0,Q9UEE5,CNC1CC2OC(C)(C1OC)n1c3ccccc3c3c4c(c5c6ccccc6n2...,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,IC50,3.0
1,Q9UEE5,CCN(CC)CCNC(=O)c1c(C)[nH]c(C=C2C(=O)Nc3ccc(F)c...,CCN(CC)CCNC(=O)c1c(C)[nH]c(\C=C2/C(=O)Nc3ccc(F...,Kd,1
2,Q9UEE5,COc1ccc(-c2cnc3scc(NC(=O)C4CCCCC4)c3c2)cc1OC,COc1ccc(cc1OC)-c1cnc2scc(NC(=O)C3CCCCC3)c2c1,Kd,4.9
3,Q9UEE5,CC12OC(CC1(O)CO)n1c3ccccc3c3c4c(c5c6ccccc6n2c5...,C[C@]12O[C@H](C[C@]1(O)CO)n1c3ccccc3c3c4C(=O)N...,Kd,3.4
4,Q9UEE5,O=C(c1ccc(C=Cc2n[nH]c3ccccc23)cc1)N1CCNCC1,O=C(N1CCNCC1)c1ccc(\C=C\c2n[nH]c3ccccc23)cc1,Kd,2.9
...,...,...,...,...,...
139876,P10826,CC1(C)CCC(C)(C)c2cc(-c3cc(-c4ccc(C(=O)O)cc4)cc...,CC1(C)CCC(C)(C)c2cc(ccc12)-c1cc(ccc1OCCCCO)-c1...,EC50,6.8
139877,P10826,Cc1ccc(C)c2oc(-c3nc(-c4ccc(C(=O)O)c(F)c4)no3)cc12,Cc1ccc(C)c2oc(cc12)-c1nc(no1)-c1ccc(C(O)=O)c(F)c1,EC50,2.20
139878,P10826,Cc1ccc(C)c2oc(-c3nc(-c4ccc(C(=O)O)cc4)no3)cc12,Cc1ccc(C)c2oc(cc12)-c1nc(no1)-c1ccc(cc1)C(O)=O,EC50,1.94
139879,P10826,O=C(O)c1ccc(-c2noc(-c3cc4c(C(F)(F)F)ccc(F)c4o3...,OC(=O)c1ccc(cc1)-c1noc(n1)-c1cc2c(ccc(F)c2o1)C...,EC50,2.50


In [13]:
print("number of unique protein that has interactions data", interactions_df["uniprot_id"].nunique())
print("number of unique compound that has interactions with cancer proteins", interactions_df["isomeric_smiles"].nunique())

number of unique protein that has interactions data 550
number of unique compound that has interactions with cancer proteins 106981


In [14]:
interactions_df.to_csv("dataset/phase3_all_in.csv", index=False)

# Generate negative samples

In [16]:
positive_df = interactions_df.drop(["affinity_type", "affinity_score"], axis=1).copy()

In [17]:
# function to generate negative samples
def generate_decoy_pairs(positive_df:pd.DataFrame)->list:

    decoy_pairs = []

    for i in tqdm(range(len(positive_df))):

        drug = random.choice(positive_df["isomeric_smiles"].tolist())
        target = random.choice(positive_df["uniprot_id"].tolist())
        decoy_pairs.append((target, drug))

    return decoy_pairs

In [18]:
decoy_pairs = generate_decoy_pairs(positive_df)

  0%|          | 0/139881 [00:00<?, ?it/s]

In [20]:
decoy_df = pd.DataFrame(decoy_pairs)
decoy_df.columns = ["uniprot_id", "isomeric_smiles"]
decoy_df["class"] = [int(0) for i in range(len(decoy_df))]
decoy_df

Unnamed: 0,uniprot_id,isomeric_smiles,class
0,P42345,C[C@]12C[C@@H](C3=C4CCC(=O)C=C4CC[C@H]3[C@@H]1...,0
1,P36897,NC(=O)CN1CCC(CC1)c1ccc(Nc2ncc3ccc(-c4cnccn4)n3...,0
2,P34972,CC(C)([C@@H](c1ccccc1)c1ccc2n(ncc2c1)-c1ccc(F)...,0
3,P08581,Cc1nc2cccc(-c3cc4c(CCNC4=O)[nH]3)c2nc1N[C@H]1C...,0
4,P03372,O[C@H]1CCN(C1)c1ncc(cc1-c1cncnc1)C(=O)Nc1ccc(O...,0
...,...,...,...
139876,P11309,CO[C@H]1\C=C\C[C@H](C)C[S@@](=O)(NC(=O)c2cnc(O...,0
139877,P12931,[H][C@@]1(CC[C@@H](CC1)N1CC(C1)OC(F)F)[C@@H](C...,0
139878,Q9NZQ7,[#6]\[#6](-[#6])=[#6]\[#6]-c1c(-[#8])cc(-[#8])...,0
139879,Q06187,Nc1nccn2c(nc(-c3ccc(cc3)C(O)(C(F)F)c3cccc(c3)C...,0


In [22]:
# give class label to positive data as well
positive_df["class"] = [int(1) for i in range(len(positive_df))]
positive_df

Unnamed: 0,uniprot_id,canonical_smiles,isomeric_smiles,class
0,Q9UEE5,CNC1CC2OC(C)(C1OC)n1c3ccccc3c3c4c(c5c6ccccc6n2...,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,1
1,Q9UEE5,CCN(CC)CCNC(=O)c1c(C)[nH]c(C=C2C(=O)Nc3ccc(F)c...,CCN(CC)CCNC(=O)c1c(C)[nH]c(\C=C2/C(=O)Nc3ccc(F...,1
2,Q9UEE5,COc1ccc(-c2cnc3scc(NC(=O)C4CCCCC4)c3c2)cc1OC,COc1ccc(cc1OC)-c1cnc2scc(NC(=O)C3CCCCC3)c2c1,1
3,Q9UEE5,CC12OC(CC1(O)CO)n1c3ccccc3c3c4c(c5c6ccccc6n2c5...,C[C@]12O[C@H](C[C@]1(O)CO)n1c3ccccc3c3c4C(=O)N...,1
4,Q9UEE5,O=C(c1ccc(C=Cc2n[nH]c3ccccc23)cc1)N1CCNCC1,O=C(N1CCNCC1)c1ccc(\C=C\c2n[nH]c3ccccc23)cc1,1
...,...,...,...,...
139876,P10826,CC1(C)CCC(C)(C)c2cc(-c3cc(-c4ccc(C(=O)O)cc4)cc...,CC1(C)CCC(C)(C)c2cc(ccc12)-c1cc(ccc1OCCCCO)-c1...,1
139877,P10826,Cc1ccc(C)c2oc(-c3nc(-c4ccc(C(=O)O)c(F)c4)no3)cc12,Cc1ccc(C)c2oc(cc12)-c1nc(no1)-c1ccc(C(O)=O)c(F)c1,1
139878,P10826,Cc1ccc(C)c2oc(-c3nc(-c4ccc(C(=O)O)cc4)no3)cc12,Cc1ccc(C)c2oc(cc12)-c1nc(no1)-c1ccc(cc1)C(O)=O,1
139879,P10826,O=C(O)c1ccc(-c2noc(-c3cc4c(C(F)(F)F)ccc(F)c4o3...,OC(=O)c1ccc(cc1)-c1noc(n1)-c1cc2c(ccc(F)c2o1)C...,1


In [27]:
phase3_df = pd.concat([positive_df, decoy_df])
phase3_df.drop(["canonical_smiles"], axis=1, inplace=True)
phase3_df.reset_index(drop=True, inplace=True)
phase3_df

Unnamed: 0,uniprot_id,isomeric_smiles,class
0,Q9UEE5,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,1
1,Q9UEE5,CCN(CC)CCNC(=O)c1c(C)[nH]c(\C=C2/C(=O)Nc3ccc(F...,1
2,Q9UEE5,COc1ccc(cc1OC)-c1cnc2scc(NC(=O)C3CCCCC3)c2c1,1
3,Q9UEE5,C[C@]12O[C@H](C[C@]1(O)CO)n1c3ccccc3c3c4C(=O)N...,1
4,Q9UEE5,O=C(N1CCNCC1)c1ccc(\C=C\c2n[nH]c3ccccc23)cc1,1
...,...,...,...
279757,P11309,CO[C@H]1\C=C\C[C@H](C)C[S@@](=O)(NC(=O)c2cnc(O...,0
279758,P12931,[H][C@@]1(CC[C@@H](CC1)N1CC(C1)OC(F)F)[C@@H](C...,0
279759,Q9NZQ7,[#6]\[#6](-[#6])=[#6]\[#6]-c1c(-[#8])cc(-[#8])...,0
279760,Q06187,Nc1nccn2c(nc(-c3ccc(cc3)C(O)(C(F)F)c3cccc(c3)C...,0


In [30]:
# remove duplicates
phase3_df.drop_duplicates(inplace=True)

# Fetch FASTA for each protein

In [31]:
# function to grab fasta from uniprot
def get_fasta(uniprot_ids:list)->pd.DataFrame:
    
    fasta = []
    
    # get unique id
    uniprot_ids = set(uniprot_ids)
    
    for protein in tqdm((uniprot_ids)):
        
        url = f"https://www.uniprot.org/uniprot/{protein}.fasta"
        
        try:
            
            response = requests.get(url)
            
            with open("fasta.fasta", "w") as f:
                
                f.write(response.content.decode("UTF-8"))
                f.close()
                
            fasta_read = SeqIO.read("fasta.fasta", "fasta")
            fasta.append(fasta_read)
        
        except Exception as e:
            
            print(e)
            fasta.append(np.NaN)
            
    fasta_df = {
        "uniprot_id": uniprot_ids,
        "fasta": fasta
    }
    
    protein_name = []
    fasta_seq = []

    for i in tqdm(range(len(fasta_df["fasta"]))):

        protein_name.append(fasta_df["fasta"][i].id.split("|")[2])

        fasta_seq.append(str(fasta_df["fasta"][i].seq))
    
    
    return protein_name, fasta_seq

In [32]:
protein_name, fasta_seq = get_fasta(phase3_df["uniprot_id"])

  0%|          | 0/550 [00:00<?, ?it/s]

  0%|          | 0/550 [00:00<?, ?it/s]

In [33]:
phase3_fasta_df = pd.DataFrame([protein_name, fasta_seq]).T.reset_index(drop=True)
phase3_fasta_df.columns = ["protein_name", "fasta"]
phase3_fasta_df["uniprot_id"] = phase3_df["uniprot_id"].unique()
phase3_fasta_df

Unnamed: 0,protein_name,fasta,uniprot_id
0,NOD2_HUMAN,MGEEGGSASHDEEERASVLLGHSPGCEMCSQEAFQAQRSQLVELLV...,Q9UEE5
1,BRD3_HUMAN,MSTATTVAPAGIPATPGPVNPPPPEVSNPSKPGRKTNQLQYMQNVV...,P36507
2,PSA3_HUMAN,MSSIGTGYDLSASTFSPDGRVFQVEYAMKAVENSSTAIGIRCKDGV...,P25054
3,SIR1_HUMAN,MADEAALALQPGGSPSAAGADREAASSPAGEPLRKRPRRDGPGLER...,P09467
4,AK1C3_HUMAN,MDSKHQCVKLNDGHFMPVLGFGTYAPPEVPRSKALEVTKLAIEAGF...,Q03164
...,...,...,...
545,PPIA_HUMAN,MVNPTVFFDIAVDGEPLGRVSFELFADKVPKTAENFRALSTGEKGF...,P40763
546,PAK2_HUMAN,MSDNGELEDKPPAPPVRMSSTIFSTGGKDPLSANHSLKPLPSVPEE...,P33261
547,ROCK2_HUMAN,MSRPPPTGKMPGAPETAPGDGAGASRQRKLEALIRDPRSPINVESL...,P21709
548,IMDH1_HUMAN,MADYLISGGTGYVPEDGLTAQQLFASADGLTYNDFLILPGFIDFIA...,P22607


In [40]:
phase3_fasta_df.to_csv("dataset/phase3_fasta.csv", index=False)

In [37]:
phase3_df.to_csv("dataset/phase3_df.csv", index=False)