In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
# Every data path used later in this notebook lives under /content/drive/MyDrive/protein_props,
# so this cell has to run before any of the loading cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Set seeds
# Seed NumPy's global RNG and TensorFlow's global RNG with the same fixed value (42)
# so that random operations drawing from these generators are repeatable.
import numpy as np
import tensorflow as tf
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Fetching PCP properties of druggable and non-druggable proteins
# Loads three files from Drive:
#   * protein_props.json              -> `protein_data` (parsed JSON object)
#   * druggable_proteins.txt          -> `druggable_proteins` (one entry per line)
#   * approved_druggable_proteins.txt -> `approved_druggable_proteins` (one entry per line)
# and prints how many entries each one contains.
import json

data_file_path = "/content/drive/MyDrive/protein_props/features/protein_props.json"
druggable_proteins_file_path = "/content/drive/MyDrive/protein_props/druggable_proteins.txt"
approved_druggable_proteins_file_path = "/content/drive/MyDrive/protein_props/approved_druggable_proteins.txt"

with open(data_file_path, 'r') as f:
    protein_data = json.load(f)

print("Total number of uniprot human verified proteins:", len(protein_data))

# splitlines() yields one list element per line of the file, without the line terminator.
with open(druggable_proteins_file_path, 'r') as f:
    druggable_proteins = f.read().splitlines()

with open(approved_druggable_proteins_file_path, 'r') as f:
    approved_druggable_proteins = f.read().splitlines()

print("Number of druggable proteins:", len(druggable_proteins))
print("Number of approved druggable proteins:", len(approved_druggable_proteins))


Total number of uniprot human verified proteins: 20434
Number of druggable proteins: 3345
Number of approved druggable proteins: 2652


In [None]:
import pandas as pd

# Load the property table and transpose it so that the JSON's top-level keys become the
# row index (these are the IDs matched against the druggable lists below).
# Reuses `data_file_path` from the loading cell instead of repeating the literal path.
properties = pd.read_json(data_file_path).transpose()

# Binary labels: 1 when the row's index value appears in the corresponding ID list, else 0.
# Index.isin does a hashed lookup, avoiding a linear scan of the Python list for every row.
is_druggable = properties.index.isin(druggable_proteins).astype(int).tolist()
is_approved_druggable = properties.index.isin(approved_druggable_proteins).astype(int).tolist()

properties["is_druggable"] = is_druggable
properties["is_approved_druggable"] = is_approved_druggable

### All physicochemical properties (32)

In [None]:
# Expand the nested physicochemical properties into flat numeric columns.
# Works on a copy so `properties` itself is left untouched and this cell can be re-run.
PCP_properties = properties.copy()
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# One list of values per amino acid, in row order; each row of 'Amino Acid Percent'
# is indexed by the one-letter code.
amino_acid_percent = {
    aa: [percent[aa] for percent in PCP_properties['Amino Acid Percent']]
    for aa in amino_acids
}

# Build all 20 columns as one frame and concatenate once, rather than calling
# pd.concat inside the loop (which copies the whole table on every iteration).
amino_acid_columns = pd.DataFrame(
    {f"Amino Acid Percent {aa}": amino_acid_percent[aa] for aa in amino_acids},
    index=PCP_properties.index,
)
PCP_properties = pd.concat([PCP_properties, amino_acid_columns], axis=1)

# 'Molar Extinction Coefficient' holds a pair per row; split it into two columns.
for position, suffix in enumerate(("1", "2")):
    PCP_properties[f"Molar Extinction Coefficient {suffix}"] = [
        pair[position] for pair in PCP_properties['Molar Extinction Coefficient']
    ]

# 'Secondary Structure' holds a triple per row, unpacked here as helix / turn / sheet.
for position, suffix in enumerate(("helix", "turn", "sheet")):
    PCP_properties[f"Secondary Structure {suffix}"] = [
        triple[position] for triple in PCP_properties['Secondary Structure']
    ]

# Drop the nested / non-numeric source columns now that they have been expanded (or are unused).
PCP_properties = PCP_properties.drop(
    columns=['Amino Acid Count', 'Amino Acid Percent', "Molar Extinction Coefficient",
             "Flexibility", "Secondary Structure", 'Sequence']
)

# Cast the remaining scalar columns to explicit numeric dtypes.
float_columns = ['Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7']
PCP_properties = PCP_properties.astype(
    {'Sequence Length': int, **{column: float for column in float_columns}}
)

In [None]:
PCP_properties.describe()

Unnamed: 0,Sequence Length,Molecular Weight,GRAVY,Isoelectric Point,Instability Index,Aromaticity,Charge at 7,is_druggable,is_approved_druggable,Amino Acid Percent A,...,Amino Acid Percent S,Amino Acid Percent T,Amino Acid Percent V,Amino Acid Percent W,Amino Acid Percent Y,Molar Extinction Coefficient 1,Molar Extinction Coefficient 2,Secondary Structure helix,Secondary Structure turn,Secondary Structure sheet
count,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,...,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0
mean,558.350152,62188.38,-0.3436,7.329772,49.419399,0.079209,-1.912736,0.163698,0.129784,0.072052,...,0.080244,0.05186,0.059924,0.013143,0.027888,59385.39,60156.46,0.320547,0.289585,0.335513
std,606.567158,67084.17,0.430772,1.754309,13.373352,0.030165,22.825295,0.37001,0.336074,0.027747,...,0.027054,0.018497,0.01962,0.009426,0.015154,65547.8,66260.62,0.054092,0.053513,0.069216
min,2.0,260.2869,-3.4325,4.050028,-42.94,0.0,-447.417559,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,249.0,27751.25,-0.604473,5.802119,40.892744,0.058824,-8.026765,0.0,0.0,0.053946,...,0.062886,0.041353,0.047138,0.0065,0.018116,24870.0,25218.75,0.289855,0.255047,0.292308
50%,415.0,46198.1,-0.362563,7.059653,48.131515,0.077367,0.213363,0.0,0.0,0.06834,...,0.077181,0.050691,0.059211,0.011538,0.026743,44460.0,45185.0,0.318111,0.283897,0.337271
75%,670.0,74822.71,-0.13138,8.885194,56.548763,0.097046,7.268414,0.0,0.0,0.085911,...,0.094057,0.060364,0.071983,0.017857,0.035912,74830.0,75775.0,0.349491,0.318875,0.374579
max,34350.0,3815983.0,2.608,11.999968,196.7175,0.357143,457.297097,1.0,1.0,0.307229,...,0.416603,0.6,0.188525,0.232877,0.321429,4051510.0,4083510.0,0.676471,0.7794,0.76


### GDPC Encodings (25)

In [None]:
# Load the GDPC encodings JSON and transpose so that the JSON's top-level keys become the row index.
# NOTE(review): the variable is spelled `gpdc_encodings` (not "gdpc"); later cells use this
# spelling, so it must stay consistent.
with open("/content/drive/MyDrive/protein_props/features/gdpc_encodings.json", 'r') as file:
    data = json.load(file)
gpdc_encodings = pd.DataFrame(data).transpose()

In [None]:
gpdc_encodings.describe()

Unnamed: 0,aliphatic_aliphatic,aliphatic_positive,aliphatic_negative,aliphatic_uncharged,aliphatic_aromatic,positive_aliphatic,positive_positive,positive_negative,positive_uncharged,positive_aromatic,...,uncharged_aliphatic,uncharged_positive,uncharged_negative,uncharged_uncharged,uncharged_aromatic,aromatic_aliphatic,aromatic_positive,aromatic_negative,aromatic_uncharged,aromatic_aromatic
count,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,...,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0
mean,0.138621,0.049996,0.063575,0.109844,0.029314,0.049012,0.023822,0.02497,0.041089,0.010577,...,0.108275,0.040454,0.05469,0.095738,0.023414,0.028625,0.010876,0.012984,0.023918,0.007841
std,0.049362,0.01598,0.020858,0.024028,0.015646,0.015561,0.015797,0.012335,0.017442,0.006715,...,0.022742,0.015806,0.018008,0.046529,0.011503,0.015752,0.00719,0.009044,0.01144,0.00782
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.106515,0.04043,0.050812,0.095385,0.018971,0.03967,0.014085,0.017284,0.030691,0.00625,...,0.094513,0.030986,0.043796,0.069182,0.015789,0.018018,0.006211,0.007619,0.016447,0.002507
50%,0.132626,0.049491,0.06262,0.109375,0.02725,0.048512,0.02115,0.024011,0.03871,0.00978,...,0.107817,0.038765,0.05364,0.087875,0.0224,0.026144,0.00994,0.01203,0.023107,0.006024
75%,0.164589,0.058824,0.074766,0.123457,0.037037,0.057592,0.029817,0.0311,0.047957,0.014045,...,0.121429,0.04785,0.064103,0.112468,0.029787,0.036254,0.014493,0.017073,0.030485,0.01087
max,0.529936,0.208333,0.444444,0.5,0.241935,0.2,0.375,0.222222,0.333333,0.066667,...,0.333333,0.666667,0.333333,0.987342,0.170213,0.2125,0.066667,0.2125,0.159785,0.125


In [None]:
gpdc_encodings.index

Index(['A0A087X1C5', 'A0A0B4J2F0', 'A0A0B4J2F2', 'A0A0C5B5G6', 'A0A0K2S4Q6',
       'A0A0U1RRE5', 'A0A1B0GTW7', 'A0AV02', 'A0AV96', 'A0AVF1',
       ...
       'Q9UF83', 'Q9UFV3', 'Q9UHT4', 'Q9UHU1', 'Q9UI25', 'Q9UI54', 'Q9UI72',
       'Q9Y3F1', 'Q9Y6C7', 'Q9Y6Z2'],
      dtype='object', length=20434)

### PPI properties including network properties (11)

In [None]:
# Network-level PPI properties: promote the CSV's 'Unnamed: 0' column to the row index.
ppi_network = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/ppi_network_properties.csv"
).set_index('Unnamed: 0')

# Interaction counts (JSON, transposed) joined column-wise with the network properties;
# pd.concat(axis=1) aligns the two tables on their row index.
ppi = pd.concat(
    [
        pd.read_json("/content/drive/MyDrive/protein_props/features/ppi.json").transpose(),
        ppi_network,
    ],
    axis=1,
)
ppi.describe()

Unnamed: 0,binary_count,binary_experimental_count,xeno_count,xeno_experimental_count,degree_binary,degree_xeno,degree_all,avg_degree_nbr_binary,avg_degree_nbr_xeno,avg_degree_nbr_all,strongly_connected_component_sizes_all
count,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0
mean,7.015171,25.981991,0.154497,0.564109,7.873006,0.0,7.873006,48.557789,0.0,48.557789,1.732358
std,22.13183,88.840284,0.721995,3.744942,23.033573,0.0,23.033573,77.659993,0.0,77.659993,0.728205
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,3.0,0.0,0.0,1.0,0.0,1.0,14.666667,0.0,14.666667,2.0
75%,5.0,19.0,0.0,0.0,6.0,0.0,6.0,72.0,0.0,72.0,2.0
max,854.0,6331.0,21.0,341.0,822.0,0.0,822.0,822.0,0.0,822.0,8.0


In [None]:
ppi.index

Index(['P24821', 'Q96PT4', 'P62879', 'Q01664', 'Q9GZK3', 'Q9NSA3', 'Q8WVI7',
       'Q6NUQ4', 'Q5T215', 'Q7RTY9',
       ...
       'Q03933', 'Q6ZNE5', 'Q8N684', 'P0C0E4', 'Q6S5H5', 'O75461', 'Q92911',
       'Q9BQK8', 'Q99685', 'Q9ULZ9'],
      dtype='object', length=20434)

### PTM Counts and Glycosylation counts (11)

In [None]:
# Glycosylation counts, with the CSV's 'Unnamed: 0' column promoted to the row index.
glycolisation = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/glycosylation.csv"
).set_index('Unnamed: 0')

# PTM counts, indexed the same way.
ptm = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/PTM_counts.csv"
).set_index("Unnamed: 0")

# Column-wise join of both tables, aligned on the row index.
ptm_counts = pd.concat([ptm, glycolisation], axis=1)
ptm_counts.describe()

Unnamed: 0,Glycosylation,Cross-link,Modified residue,Signal,Disulfide bond,O-linked,N-linked,C-linked,N-beta-linked,S-linked,O-alpha-linked
count,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0
mean,0.887051,0.335421,2.700548,0.176764,0.899432,0.092297,0.789762,0.003866,0.000783,0.000147,0.000196
std,4.829675,1.917996,6.874887,0.381478,4.783239,3.790505,2.65723,0.164466,0.039566,0.012116,0.01399
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,547.0,58.0,282.0,1.0,159.0,519.0,102.0,15.0,4.0,1.0,1.0


In [None]:
ptm_counts.index

Index(['A0A087X1C5', 'A0A0B4J2F0', 'A0A0B4J2F2', 'A0A0C5B5G6', 'A0A0K2S4Q6',
       'A0A0U1RRE5', 'A0A1B0GTW7', 'A0AV02', 'A0AV96', 'A0AVF1',
       ...
       'Q9UF83', 'Q9UFV3', 'Q9UHT4', 'Q9UHU1', 'Q9UI25', 'Q9UI54', 'Q9UI72',
       'Q9Y3F1', 'Q9Y6C7', 'Q9Y6Z2'],
      dtype='object', name='Unnamed: 0', length=20434)

### Subcellular locations (50)

In [None]:
# Build a 0/1 indicator table of subcellular location groups, one row per protein in PCP_properties.
with open("/content/drive/MyDrive/protein_props/features/subcellular_locations2.json", 'r') as file:
    data = json.load(file)

# Every distinct "group" label found under any entry's "general" list.
unique_groups = set()
for entry in data.values():
    if "general" in entry:
        for general_entry in entry["general"]:
            if "group" in general_entry: unique_groups.add(general_entry["group"])

# Sort the labels so the indicator columns come out in the same order on every run.
# Iterating a set of strings directly depends on per-process hash randomisation, which
# would make the feature column order differ between kernel restarts.
unique_groups_list = sorted(unique_groups)

rows = []
for protein_id in PCP_properties.index:
    # Start with all groups absent, then flag each group listed for this protein.
    row = {group: 0 for group in unique_groups_list}
    if protein_id in data:
        for entry in data[protein_id].get("general", []):
            if "group" in entry and entry["group"] in unique_groups:
                row[entry["group"]] = 1
    row["protein_id"] = protein_id
    rows.append(row)

# Build the frame once from the list of row dicts and index it by protein ID.
subcellular_data = pd.DataFrame(rows).set_index("protein_id")
subcellular_data.describe()

Unnamed: 0,nucleus,mitochondrion,lateral cell membrane,cleavage furrow,rough endoplasmic reticulum,target cell membrane,vacuole,cell junction,zymogen granule,vacuole membrane,...,lipid droplet,cell projection,early endosome,chromosome,cytoplasmic vesicle,endomembrane system,recycling endosome,peroxisome,endosome,golgo apparatus
count,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,...,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0
mean,0.264804,0.059558,0.000734,0.001909,0.001272,4.9e-05,4.9e-05,0.017618,0.000489,0.000245,...,0.002985,0.047372,0.01292,0.027258,0.03318,0.003964,0.006411,0.004894,0.013262,0.044631
std,0.441239,0.236671,0.027084,0.043647,0.035649,0.006996,0.006996,0.13156,0.022117,0.015641,...,0.054557,0.212439,0.112931,0.16284,0.179111,0.062837,0.079813,0.069786,0.114398,0.206498
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
subcellular_data.index

Index(['A0A087X1C5', 'A0A0B4J2F0', 'A0A0B4J2F2', 'A0A0C5B5G6', 'A0A0K2S4Q6',
       'A0A0U1RRE5', 'A0A1B0GTW7', 'A0AV02', 'A0AV96', 'A0AVF1',
       ...
       'Q9UF83', 'Q9UFV3', 'Q9UHT4', 'Q9UHU1', 'Q9UI25', 'Q9UI54', 'Q9UI72',
       'Q9Y3F1', 'Q9Y6C7', 'Q9Y6Z2'],
      dtype='object', name='protein_id', length=20434)

### Top 20 Domains (20)

In [None]:
# Top-20 domain counts, with the CSV's 'Unnamed: 0' column promoted to the row index.
domains = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/data_top20_updated.csv"
).set_index('Unnamed: 0')
domains.describe()

Unnamed: 0,rrm,acidic residues,ph,krab,pdz,btb,nuclear localization signal,fibronectin type-iii,disordered,ig-like v-type,ef-hand,sh3,ig-like,pro residues,protein kinase,ig-like c2-type,basic and acidic residues,basic residues,egf-like,polar residues
count,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0
mean,0.019869,0.051385,0.015709,0.018352,0.013262,0.008222,0.027307,0.041548,1.296075,0.009445,0.030195,0.014926,0.032788,0.134775,0.024665,0.047421,0.421846,0.033131,0.038808,0.587452
std,0.211491,0.26786,0.143363,0.136393,0.213273,0.095568,0.193463,1.073926,2.263518,0.114777,0.32501,0.167087,1.15432,0.600772,0.159768,0.628413,1.224349,0.290856,0.585984,1.542403
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,6.0,7.0,5.0,2.0,13.0,2.0,5.0,132.0,111.0,5.0,17.0,6.0,152.0,16.0,2.0,44.0,46.0,21.0,24.0,59.0


In [None]:
domains.index

Index(['P24821', 'Q96PT4', 'P62879', 'Q01664', 'Q9GZK3', 'Q9NSA3', 'Q8WVI7',
       'Q6NUQ4', 'Q5T215', 'Q7RTY9',
       ...
       'Q03933', 'Q6ZNE5', 'Q8N684', 'P0C0E4', 'Q6S5H5', 'O75461', 'Q92911',
       'Q9BQK8', 'Q99685', 'Q9ULZ9'],
      dtype='object', name='Unnamed: 0', length=20434)

### Flexibility Sequence Properties (14)

In [None]:
# Flexibility summary statistics, with the CSV's 'Unnamed: 0' column promoted to the row index.
flexibility = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/flexibility_properties.csv"
).set_index('Unnamed: 0')

flexibility.describe()

Unnamed: 0,Mean,Mode,Min,Max,Variance,Median,Standard Deviation,Range,Min gap,Max gap,Average gap,Min 2-hop gap,Max 2-hop gap,Average 2-hop gap
count,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0,20434.0
mean,1.00123,0.979936,0.93632,1.070449,0.000725,1.001221,0.02678,0.134128,-0.068259,0.067175,2.3e-05,-0.067454,0.067869,4.5e-05
std,0.018425,0.032271,0.01778,0.020998,0.000154,0.018994,0.002829,0.016069,0.007706,0.00735,0.000262,0.008893,0.009057,0.000495
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.091821,-0.002548,-0.004178,-0.099524,0.0,-0.008369
25%,0.996485,0.960667,0.929429,1.062893,0.000631,0.996167,0.025126,0.124551,-0.073286,0.062893,-4.9e-05,-0.073321,0.062405,-8.4e-05
50%,1.001614,0.977667,0.936012,1.071048,0.000707,1.001714,0.026596,0.135036,-0.068976,0.067643,8e-06,-0.067667,0.068143,1.6e-05
75%,1.007368,0.998583,0.942571,1.079107,0.000799,1.008012,0.02827,0.144774,-0.064098,0.071988,7.8e-05,-0.062012,0.073762,0.00014
max,1.055859,1.102,1.032952,1.102,0.002813,1.060298,0.053042,0.187321,0.006607,0.095595,0.016839,0.033679,0.106393,0.033679


In [None]:
flexibility.index

Index(['A0A087X1C5', 'A0A0B4J2F0', 'A0A0B4J2F2', 'A0A0C5B5G6', 'A0A0K2S4Q6',
       'A0A0U1RRE5', 'A0A1B0GTW7', 'A0AV02', 'A0AV96', 'A0AVF1',
       ...
       'Q9UF83', 'Q9UFV3', 'Q9UHT4', 'Q9UHU1', 'Q9UI25', 'Q9UI54', 'Q9UI72',
       'Q9Y3F1', 'Q9Y6C7', 'Q9Y6Z2'],
      dtype='object', name='Unnamed: 0', length=20434)

### Latent Values based on Autoencoder (20)

In [None]:
# Read the latent-value CSV and transpose it, so the CSV's column headers become the row index
# (presumably one column per protein ID in the file — confirm against the CSV).
latent_data = pd.read_csv("/content/drive/MyDrive/protein_props/features/latent_values.csv").transpose()
# Relabel the transposed columns as 1-based "Latent_Value_<k>"; `i+1` relies on the existing
# column labels being integers.
latent_data.columns = [f"Latent_Value_{i+1}" for i in latent_data.columns]
latent_data.describe()

Unnamed: 0,Latent_Value_1,Latent_Value_2,Latent_Value_3,Latent_Value_4,Latent_Value_5,Latent_Value_6,Latent_Value_7,Latent_Value_8,Latent_Value_9,Latent_Value_10,Latent_Value_11,Latent_Value_12,Latent_Value_13,Latent_Value_14,Latent_Value_15,Latent_Value_16,Latent_Value_17,Latent_Value_18,Latent_Value_19,Latent_Value_20
count,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0
mean,-18.548981,-43.189425,44.344207,-10.203095,46.527308,9.329222,-12.469526,-21.468993,4.116744,39.570006,13.893222,-59.296527,-21.11976,-11.754344,-14.021903,11.369287,-38.191789,-65.911123,20.612659,-20.375037
std,1.073874,2.477214,2.567414,0.687608,2.704706,0.622698,0.786841,1.321244,0.451873,2.285824,0.885935,3.392822,1.299799,0.762044,0.862525,0.750094,2.207514,3.778273,1.249143,1.204186
min,-26.003214,-60.713062,22.67439,-14.44237,24.50558,4.332309,-17.551315,-30.17763,1.639067,19.859274,6.069322,-83.4129,-29.784056,-16.586477,-19.726072,5.56581,-53.62563,-92.69067,10.444728,-28.690975
25%,-19.185312,-44.62215,42.898323,-10.625556,44.9905,8.948679,-12.945388,-22.270542,3.816242,38.26978,13.372448,-61.264755,-21.902708,-12.22035,-14.546921,10.909314,-39.47019,-68.10032,19.891426,-21.091385
50%,-18.567122,-43.23134,44.377804,-10.210994,46.572582,9.339145,-12.478144,-21.49518,4.120439,39.610523,13.891561,-59.335037,-21.125488,-11.764125,-14.030611,11.374349,-38.22998,-65.99886,20.619267,-20.389463
75%,-17.931555,-41.805725,45.842484,-9.784028,48.127113,9.717904,-12.00793,-20.690805,4.415192,40.90257,14.448854,-57.377953,-20.357666,-11.303124,-13.518088,11.833316,-36.94591,-63.77441,21.348589,-19.682299
max,-9.688256,-22.169565,62.373188,-5.027713,65.472336,13.056457,-6.163255,-10.87966,6.626209,55.670376,19.680548,-30.717695,-11.023568,-5.909817,-7.196007,16.240753,-19.311134,-34.13452,29.01001,-10.412269


In [None]:
latent_data.index

Index(['A0A087X1C5', 'A0A0B4J2F0', 'A0A0B4J2F2', 'A0A0C5B5G6', 'A0A0K2S4Q6',
       'A0A0U1RRE5', 'A0A1B0GTW7', 'A0AV02', 'A0AV96', 'A0AVF1',
       ...
       'Q9UF83', 'Q9UFV3', 'Q9UHT4', 'Q9UHU1', 'Q9UI25', 'Q9UI54', 'Q9UI72',
       'Q9Y3F1', 'Q9Y6C7', 'Q9Y6Z2'],
      dtype='object', length=20273)

### Final Data

In [None]:
# Join all feature tables column-wise; pd.concat(axis=1) aligns them on the row index, so the
# differing row orders of the individual tables do not matter.
# NOTE: .dropna() silently removes every row that has a missing value in ANY table
# (e.g. index values absent from one of the inputs), so final_data can have fewer rows
# than PCP_properties.
final_data = pd.concat([PCP_properties,gpdc_encodings, ptm_counts, ppi, subcellular_data, domains, flexibility, latent_data], axis = 1).dropna()
final_data.describe()


Unnamed: 0,Sequence Length,Molecular Weight,GRAVY,Isoelectric Point,Instability Index,Aromaticity,Charge at 7,is_druggable,is_approved_druggable,Amino Acid Percent A,...,Latent_Value_11,Latent_Value_12,Latent_Value_13,Latent_Value_14,Latent_Value_15,Latent_Value_16,Latent_Value_17,Latent_Value_18,Latent_Value_19,Latent_Value_20
count,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,...,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0,20273.0
mean,526.63148,58688.065646,-0.34305,7.340538,49.419916,0.0793,-1.206217,0.163962,0.130025,0.072119,...,13.893222,-59.296527,-21.11976,-11.754344,-14.021903,11.369287,-38.191789,-65.911123,20.612659,-20.375037
std,423.918637,47109.249519,0.431795,1.754249,13.396249,0.03021,19.022544,0.37025,0.336339,0.027805,...,0.885935,3.392822,1.299799,0.762044,0.862525,0.750094,2.207514,3.778273,1.249143,1.204186
min,2.0,260.2869,-3.4325,4.050028,-42.94,0.0,-288.111306,0.0,0.0,0.0,...,6.069322,-83.4129,-29.784056,-16.586477,-19.726072,5.56581,-53.62563,-92.69067,10.444728,-28.690975
25%,248.0,27598.4564,-0.604545,5.809081,40.875362,0.058824,-7.801158,0.0,0.0,0.053953,...,13.372448,-61.264755,-21.902708,-12.22035,-14.546921,10.909314,-39.47019,-68.10032,19.891426,-21.091385
50%,411.0,45879.0027,-0.363146,7.094779,48.133675,0.07745,0.309019,0.0,0.0,0.068376,...,13.891561,-59.335037,-21.125488,-11.764125,-14.030611,11.374349,-38.22998,-65.99886,20.619267,-20.389463
75%,662.0,73813.6435,-0.130112,8.890851,56.553822,0.097222,7.336941,0.0,0.0,0.085981,...,14.448854,-57.377953,-20.357666,-11.303124,-13.518088,11.833316,-36.94591,-63.77441,21.348589,-19.682299
max,2997.0,340256.9948,2.608,11.999968,196.7175,0.357143,409.140542,1.0,1.0,0.307229,...,19.680548,-30.717695,-11.023568,-5.909817,-7.196007,16.240753,-19.311134,-34.13452,29.01001,-10.412269


In [None]:
# Names of all model input columns: every column of final_data except the two label columns.
features_list = final_data.columns.drop(['is_druggable','is_approved_druggable']).tolist()
print(features_list)
print(len(features_list))

['Sequence Length', 'Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7', 'Amino Acid Percent A', 'Amino Acid Percent C', 'Amino Acid Percent D', 'Amino Acid Percent E', 'Amino Acid Percent F', 'Amino Acid Percent G', 'Amino Acid Percent H', 'Amino Acid Percent I', 'Amino Acid Percent K', 'Amino Acid Percent L', 'Amino Acid Percent M', 'Amino Acid Percent N', 'Amino Acid Percent P', 'Amino Acid Percent Q', 'Amino Acid Percent R', 'Amino Acid Percent S', 'Amino Acid Percent T', 'Amino Acid Percent V', 'Amino Acid Percent W', 'Amino Acid Percent Y', 'Molar Extinction Coefficient 1', 'Molar Extinction Coefficient 2', 'Secondary Structure helix', 'Secondary Structure turn', 'Secondary Structure sheet', 'aliphatic_aliphatic', 'aliphatic_positive', 'aliphatic_negative', 'aliphatic_uncharged', 'aliphatic_aromatic', 'positive_aliphatic', 'positive_positive', 'positive_negative', 'positive_uncharged', 'positive_aromatic', 'negative_aliphatic', 'neg

In [None]:
#for splitting of data
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN, SMOTE

def data_splitting(x_sample, y_sample, class_size=600, scale=False):
  """Split into a class-balanced test set and an ADASYN-oversampled training set.

  Args:
    x_sample: pandas feature table, row-aligned with `y_sample`.
    y_sample: pandas label vector; 1 is treated as druggable, 0 as non-druggable.
    class_size: number of rows of EACH class held out for the test set
      (passed to `train_test_split` as `test_size`). Defaults to 600, the
      value that used to be hard-coded.
    scale: if True, standardise the features with a `StandardScaler` that is
      fitted on the training rows only and then applied to both train and test.
      Defaults to False, which leaves the features unscaled as before.

  Returns:
    (X_train, X_test, y_train, y_test), where the training pair is the output of
    `ADASYN.fit_resample` and the test pair contains `class_size` rows per class.
  """
  druggable_mask = (y_sample == 1)  # Assuming 1 represents druggable
  non_druggable_mask = (y_sample == 0)  # Assuming 0 represents non-druggable

  druggable_X, druggable_y = x_sample[druggable_mask], y_sample[druggable_mask]
  non_druggable_X, non_druggable_y = x_sample[non_druggable_mask], y_sample[non_druggable_mask]

  # Hold out the same number of rows from each class so the test set is balanced.
  druggable_X_remaining, druggable_X_test, druggable_y_remaining, druggable_y_test = train_test_split(
      druggable_X, druggable_y, test_size=class_size, random_state=123)
  non_druggable_X_remaining, non_druggable_X_test, non_druggable_y_remaining, non_druggable_y_test = train_test_split(
      non_druggable_X, non_druggable_y, test_size=class_size, random_state=123)

  X_test = pd.concat((druggable_X_test, non_druggable_X_test))
  y_test = pd.concat((druggable_y_test, non_druggable_y_test))

  X_train = pd.concat((druggable_X_remaining, non_druggable_X_remaining))
  y_train = pd.concat((druggable_y_remaining, non_druggable_y_remaining))

  # Shuffle the training and testing data
  X_train, y_train = shuffle(X_train, y_train, random_state=123)
  X_test, y_test = shuffle(X_test, y_test, random_state=123)

  if scale:
    # Fit on the training rows only and reuse those statistics for the test rows;
    # re-fitting on the test set would standardise it with its own mean/variance.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

  # Oversample the training set only; the test set is left untouched.
  ada = ADASYN(random_state=42)
  X_train, y_train = ada.fit_resample(X_train, y_train)
  return X_train, X_test, y_train, y_test


In [None]:
# Features are every column except the two label columns; the target is `is_druggable`
# (`is_approved_druggable` is dropped from the inputs and not used as a label here).
X_train, X_test, y_train, y_test = data_splitting(final_data.drop(columns = ['is_approved_druggable','is_druggable']), final_data['is_druggable'])

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((32685, 183), (1200, 183), (32685,), (1200,))

### Artificial Neural Network

In [None]:
# set seed
# Re-seed TensorFlow's and NumPy's global RNGs (42) immediately before the model cell, so the
# generator state at that point does not depend on how many random draws earlier cells made.
import tensorflow as tf
tf.random.set_seed(42)
import numpy as np
np.random.seed(42)


In [None]:
tf.__version__, np.__version__, pd.__version__

('2.15.0', '1.25.2', '2.0.3')

In [None]:
### NEURAL NETWORK
# Trains a single-hidden-layer binary classifier and reports weighted metrics on the
# training rows and on the held-out test set, then plots the learning curves.

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

EPOCHS = 50
# Must be smaller than EPOCHS, otherwise EarlyStopping can never trigger.
EARLY_STOPPING_PATIENCE = 10
VALIDATION_FRACTION = 0.1


def _weighted_metrics(fitted_model, features, labels):
  """Predict with a 0.5 threshold and return (metrics dict, thresholded predictions)."""
  predictions = (fitted_model.predict(features) >= 0.5).astype(int)
  scores = {
    'accuracy' : accuracy_score(labels,predictions),
    'precision_weighted' : precision_score(labels,predictions,average = 'weighted'),
    'recall_weighted' : recall_score(labels,predictions,average = 'weighted'),
    'f1_weighted' : f1_score(labels,predictions,average = 'weighted'),
  }
  return scores, predictions


# Hold out part of the TRAINING data for early stopping / learning curves. The test set must
# not be passed as validation_data: monitoring val_loss on it would let it influence which
# weights are kept, so the reported test metrics would no longer be an unbiased estimate.
# NOTE(review): X_train is already oversampled, so the validation rows may include synthetic
# samples; ideally the validation split is taken before resampling.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=VALIDATION_FRACTION, stratify=y_train, random_state=42)

model = tf.keras.Sequential([
    layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01), input_shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.15),

    layers.Dense(1, activation='sigmoid')
])

# Compile model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=EARLY_STOPPING_PATIENCE, restore_best_weights=True)]
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

history = model.fit(X_fit, y_fit, validation_data = (X_val,y_val), epochs = EPOCHS, callbacks=callbacks)

print("Model Summary:")
model.summary()

print("Model Evaluation: Train Data")
metrics, y_pred = _weighted_metrics(model, X_fit, y_fit)
print(metrics)
print("Model Evaluation: Test Data")
# The test set is only touched here, after training has finished.
metrics, y_pred = _weighted_metrics(model, X_test, y_test)
print(metrics)

fig,ax = plt.subplots(figsize = (40,20), ncols = 2)
ax[0].plot(history.history['accuracy'],color = 'black', label = 'Training Accuracy')
ax[0].plot(history.history['val_accuracy'],color = 'green', label = 'Validation Accuracy')
ax[0].set_ylabel('Accuracy')
ax[0].set_xlabel('Epochs')
ax[0].set_title('Accuracy Variation')
ax[0].legend()

ax[1].plot(history.history['loss'],color = 'black', label = 'Training Loss')
ax[1].plot(history.history['val_loss'],color = 'green', label = 'Validation Loss')
ax[1].set_ylabel('Loss')
ax[1].set_xlabel('Epochs')
ax[1].set_title('Loss Variation')
ax[1].legend()
plt.show()


In [None]:
# # save model
# model.save('/content/drive/MyDrive/protein_props/models/model1_ann.h5')