In [None]:
# Mount Google Drive at /content/drive so the data files under MyDrive can be read.
# force_remount=True is passed so the mount is redone even in an already-mounted session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Setting seeds for reproducibility
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import json
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import ADASYN, SMOTE
import pandas as pd
# Seed NumPy's and TensorFlow's global random generators with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Locations of the raw inputs on the mounted Drive.
data_file_path = "/content/drive/MyDrive/protein_props/features/protein_props.json"
druggable_proteins_file_path = "/content/drive/MyDrive/protein_props/NEW_WORK/druggable_proteins.txt"
investigational_proteins_file_path = "/content/drive/MyDrive/protein_props/NEW_WORK/investigational_proteins.txt"

# Property records, one top-level JSON entry per protein.
with open(data_file_path) as f:
    protein_data = json.load(f)

print("Total number of uniprot human verified proteins:", len(protein_data))

# Each text file holds one protein identifier per line:
# the first lists approved druggable proteins, the second investigational ones.
with open(druggable_proteins_file_path) as f:
    approved_druggable_proteins = f.read().splitlines()

with open(investigational_proteins_file_path) as f:
    investigational_proteins = f.read().splitlines()

# "Druggable" = approved + investigational (plain concatenation, no de-duplication).
druggable_proteins = [*approved_druggable_proteins, *investigational_proteins]

print("Number of druggable proteins:", len(druggable_proteins))
print("Number of druggable approved proteins:", len(approved_druggable_proteins))


# Fetching feature data for all proteins. The JSON is read from the same
# `data_file_path` used above (no second hard-coded copy of the path); after the
# transpose each row is one protein, indexed by its identifier.
properties = pd.read_json(data_file_path).transpose()

# Binary label columns: 1 when the protein identifier appears in the list, else 0.
# Sets make each membership test O(1); testing against the raw lists scans
# thousands of entries for every one of the ~20k rows.
druggable_protein_set = set(druggable_proteins)
approved_druggable_protein_set = set(approved_druggable_proteins)
is_druggable = [int(i in druggable_protein_set) for i in properties.index]
is_approved_druggable = [int(i in approved_druggable_protein_set) for i in properties.index]

properties["is_druggable"] = is_druggable
properties["is_approved_druggable"] = is_approved_druggable

# Physico-chemical property table: expand the dict-/list-valued columns of
# `properties` into flat numeric columns and drop the raw containers.
PCP_properties = properties.copy()
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# One list of per-protein values for each amino-acid letter.
amino_acid_percent = {
    aa: [percent_by_aa[aa] for percent_by_aa in PCP_properties['Amino Acid Percent']]
    for aa in amino_acids
}
# Build all 20 columns in one frame and concatenate once, instead of copying the
# whole table on every iteration of a concat-in-a-loop.
amino_acid_percent_columns = pd.DataFrame(
    {f"Amino Acid Percent {aa}": amino_acid_percent[aa] for aa in amino_acids},
    index=PCP_properties.index,
)
PCP_properties = pd.concat([PCP_properties, amino_acid_percent_columns], axis=1)

# List-valued columns -> one scalar column per position (names in position order).
list_valued_columns = {
    'Molar Extinction Coefficient': ["Molar Extinction Coefficient 1", "Molar Extinction Coefficient 2"],
    'Secondary Structure': ["Secondary Structure helix", "Secondary Structure turn", "Secondary Structure sheet"],
}
for source_column, new_names in list_valued_columns.items():
  for position, new_name in enumerate(new_names):
    PCP_properties[new_name] = pd.Series(
        [values[position] for values in PCP_properties[source_column]],
        index=PCP_properties.index,
    )

PCP_properties = PCP_properties.drop(columns=['Amino Acid Count', 'Amino Acid Percent', "Molar Extinction Coefficient", "Flexibility", "Secondary Structure", 'Sequence'])

# Give the remaining scalar columns explicit numeric dtypes.
PCP_properties['Sequence Length'] = PCP_properties['Sequence Length'].astype(int)
PCP_properties[['Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7']] = PCP_properties[['Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7']].astype(float)

# Encodings from gdpc_encodings.json, transposed so that the JSON's top-level keys become rows.
# (The variable keeps its existing "gpdc" spelling.)
with open("/content/drive/MyDrive/protein_props/features/gdpc_encodings.json", 'r') as file:
    data = json.load(file)
gpdc_encodings = pd.DataFrame(data).T

# PPI features: the JSON table (transposed) joined column-wise with the network-property CSV,
# whose first, unnamed column supplies the row index.
ppi = pd.read_json("/content/drive/MyDrive/protein_props/features/ppi.json").T
ppi_network = (
    pd.read_csv("/content/drive/MyDrive/protein_props/features/ppi_network_properties.csv")
    .set_index('Unnamed: 0')
)
ppi = pd.concat([ppi, ppi_network], axis=1)

# Glycosylation and PTM count tables, each indexed by its first (unnamed) CSV column,
# then joined column-wise into `ptm_counts`.
glycolisation = (
    pd.read_csv("/content/drive/MyDrive/protein_props/features/glycosylation.csv")
    .set_index('Unnamed: 0')
)
ptm = (
    pd.read_csv("/content/drive/MyDrive/protein_props/features/PTM_counts.csv")
    .set_index("Unnamed: 0")
)
ptm_counts = pd.concat([ptm, glycolisation], axis=1)

with open("/content/drive/MyDrive/protein_props/features/subcellular_locations2.json", 'r') as file:
    data = json.load(file)

# Every distinct "group" label found under any protein's "general" entries.
unique_groups = {
    general_entry["group"]
    for entry in data.values()
    for general_entry in entry.get("general", [])
    if "group" in general_entry
}

# Sort the labels so the indicator columns come out in the same order on every
# run. Iterating a set of strings directly follows hash order, which is not
# stable across interpreter sessions, so the feature column order would differ
# between kernel restarts.
unique_groups_list = sorted(unique_groups, key=str)

# One row per protein in PCP_properties: 1 for each group the protein is
# annotated with, 0 otherwise (all zeros when the protein is absent from the JSON).
rows = []
for protein_id in PCP_properties.index:
    row = dict.fromkeys(unique_groups_list, 0)
    for entry in data.get(protein_id, {}).get("general", []):
        if "group" in entry:
            row[entry["group"]] = 1
    row["protein_id"] = protein_id
    rows.append(row)

subcellular_data = pd.DataFrame(rows).set_index("protein_id")

# Domain and flexibility feature tables, each indexed by its first (unnamed) CSV column.
domains = (
    pd.read_csv("/content/drive/MyDrive/protein_props/features/data_top20_updated.csv")
    .set_index('Unnamed: 0')
)

flexibility = (
    pd.read_csv("/content/drive/MyDrive/protein_props/features/flexibility_properties.csv")
    .set_index('Unnamed: 0')
)

# Latent values: transpose so the CSV's columns become rows, then give the
# positional columns 1-based names.
latent_data = pd.read_csv("/content/drive/MyDrive/protein_props/features/latent_values.csv").T
latent_data = latent_data.rename(columns=lambda i: f"Latent_Value_{i+1}")

# Join every feature table on the row index; dropna() keeps only rows with no
# missing value in any column.
feature_tables = [PCP_properties, gpdc_encodings, ptm_counts, ppi, subcellular_data, domains, flexibility, latent_data]
final_data = pd.concat(feature_tables, axis=1).dropna()

# Model inputs = every column except the two label columns.
features_list = list(final_data.columns.drop(['is_druggable', 'is_approved_druggable']))
print(features_list)
print(len(features_list))


Total number of uniprot human verified proteins: 20434
Number of druggable proteins: 2915
Number of druggable approved proteins: 2233
['Sequence Length', 'Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7', 'Amino Acid Percent A', 'Amino Acid Percent C', 'Amino Acid Percent D', 'Amino Acid Percent E', 'Amino Acid Percent F', 'Amino Acid Percent G', 'Amino Acid Percent H', 'Amino Acid Percent I', 'Amino Acid Percent K', 'Amino Acid Percent L', 'Amino Acid Percent M', 'Amino Acid Percent N', 'Amino Acid Percent P', 'Amino Acid Percent Q', 'Amino Acid Percent R', 'Amino Acid Percent S', 'Amino Acid Percent T', 'Amino Acid Percent V', 'Amino Acid Percent W', 'Amino Acid Percent Y', 'Molar Extinction Coefficient 1', 'Molar Extinction Coefficient 2', 'Secondary Structure helix', 'Secondary Structure turn', 'Secondary Structure sheet', 'aliphatic_aliphatic', 'aliphatic_positive', 'aliphatic_negative', 'aliphatic_uncharged', 'aliphatic_aromatic',

In [None]:
# with open("newly_approved.csv") as f:
#   newly_approved = f.read().splitlines()
# len(newly_approved)

# # convert into fasta format
# string_info = ""
# for protein in newly_approved:
#   string_info += f">{protein}\n"
#   string_info += f"{properties.loc[protein]['Sequence']}\n"

# with open("newly_approved.fasta", "w") as f:
#   f.write(string_info)

In [None]:
# Sizes of the approved and investigational protein lists.
len(approved_druggable_proteins), len(investigational_proteins)

(2233, 682)

In [None]:
# Train Test Splitting
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import ADASYN, SMOTE

def get_data(x_sample, y_sample):
  """Convert features and labels to NumPy arrays and return them as an (X, y) tuple."""
  features = np.array(x_sample)
  labels = np.array(y_sample)
  return features, labels

def data_splitting(x_sample, y_sample, mode="default", scaler="none", class_size=600):
  """Hold out a class-balanced test set and prepare the remaining training set.

  `class_size` samples of each class are held out for the test set; all other
  samples form the shuffled training set, which can then be oversampled and/or
  scaled.

  Args:
    x_sample: Feature table, one row per sample. Non-pandas input is wrapped in
      a DataFrame.
    y_sample: Labels aligned with `x_sample`; 1 is treated as druggable and 0 as
      non-druggable. Array-like input is wrapped in a Series that shares
      `x_sample`'s index.
    mode: "default" (no resampling), "adasyn" or "smote" to oversample the
      training set.
    scaler: "none", "std" (StandardScaler) or "minmax" (MinMaxScaler). The
      scaler is fitted on the training set only and then applied to the test set.
    class_size: Per-class test-set size, passed to `train_test_split` as
      `test_size`.

  Returns:
    Tuple (X_train, X_test, y_train, y_test). When a scaler is applied the X
    parts are the arrays returned by the scaler.

  Raises:
    ValueError: If `mode` or `scaler` is not one of the supported names
      (previously such values were silently ignored).
  """
  # Fail fast on typos instead of silently skipping resampling / scaling.
  if mode not in ("default", "adasyn", "smote"):
    raise ValueError(f"Unknown mode {mode!r}; expected 'default', 'adasyn' or 'smote'")
  if scaler not in ("none", "std", "minmax"):
    raise ValueError(f"Unknown scaler {scaler!r}; expected 'none', 'std' or 'minmax'")

  # The pd.concat calls below require pandas objects; accept array-likes too.
  if not isinstance(x_sample, (pd.DataFrame, pd.Series)):
    x_sample = pd.DataFrame(x_sample)
  if not isinstance(y_sample, (pd.Series, pd.DataFrame)):
    y_sample = pd.Series(np.asarray(y_sample), index=x_sample.index)

  druggable_indices = (y_sample == 1)  # Assuming 1 represents druggable
  non_druggable_indices = (y_sample == 0)  # Assuming 0 represents non-druggable

  druggable_X = x_sample[druggable_indices]
  druggable_y = y_sample[druggable_indices]

  non_druggable_X = x_sample[non_druggable_indices]
  non_druggable_y = y_sample[non_druggable_indices]

  # Split each class separately so the test set has exactly `class_size` of each.
  druggable_X_remaining, druggable_X_test, druggable_y_remaining, druggable_y_test = train_test_split(druggable_X, druggable_y, test_size=class_size, random_state=123)
  non_druggable_X_remaining, non_druggable_X_test, non_druggable_y_remaining, non_druggable_y_test = train_test_split(non_druggable_X, non_druggable_y, test_size=class_size, random_state=123)

  X_test = pd.concat((druggable_X_test, non_druggable_X_test))
  y_test = pd.concat((druggable_y_test, non_druggable_y_test))
  X_train = pd.concat((druggable_X_remaining, non_druggable_X_remaining))
  y_train = pd.concat((druggable_y_remaining, non_druggable_y_remaining))
  X_train, y_train = shuffle(X_train, y_train, random_state=123)

  # Oversampling touches the training set only; the test set stays untouched.
  # NOTE(review): resampling runs on unscaled features (scaling happens after) —
  # confirm this order is intended.
  if mode == "adasyn":
    X_train, y_train = ADASYN(random_state=42).fit_resample(X_train, y_train)
  elif mode == "smote":
    X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

  scaler_classes = {"std": StandardScaler, "minmax": MinMaxScaler}
  if scaler in scaler_classes:
    # Separate local name: the `scaler` argument is no longer rebound to an object.
    fitted_scaler = scaler_classes[scaler]()
    X_train = fitted_scaler.fit_transform(X_train)
    X_test = fitted_scaler.transform(X_test)

  return X_train, X_test, y_train, y_test


In [None]:
# new_column = is_druggable + is_approved_druggable, so it takes the values
#   0 -> in neither list, 1 -> druggable but not approved, 2 -> approved druggable.
# rem_new_data removes the 1s, i.e. it keeps only the approved-druggable and the
# non-druggable proteins.
new_data = final_data.assign(
    new_column=final_data['is_druggable'] + final_data['is_approved_druggable']
)
print(np.bincount(new_data['new_column']))
keep_rows = new_data['new_column'].ne(1)
rem_new_data = new_data.loc[keep_rows]
rem_new_data.shape, np.bincount(rem_new_data['new_column'])

[17377   677  2219]


((19596, 186), array([17377,     0,  2219]))

### Predicting Druggability Index (DI) using Partition Method

In [None]:
# Feature table (kept as a DataFrame so the protein index survives) and the
# approved-druggable label as a NumPy array.
X = rem_new_data[features_list]
y = np.array(rem_new_data["is_approved_druggable"])
X.shape, y.shape

((19596, 183), (19596,))

In [None]:
# Class counts: index 0 = label 0, index 1 = label 1.
np.bincount(y)

array([17377,  2219])

In [None]:
X_druggable = X[y == 1]
X_non_druggable = X[y == 0]

# Number of partitions ~ negative:positive ratio, so each negative partition is
# roughly the size of the positive set. Never fewer than one partition.
n_partitions = max(1, round(len(X_non_druggable) / len(X_druggable)))

# Split row positions rather than the DataFrame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning. The chunk sizes and row order are unchanged, and each partition
# is still a DataFrame that keeps its protein index.
partition_positions = np.array_split(np.arange(len(X_non_druggable)), n_partitions)
X_non_druggable_partitions = [X_non_druggable.iloc[positions] for positions in partition_positions]

print(f"Splitting into {len(X_non_druggable_partitions)} partitions")
print("Sizes of partitions")
for i, partition in enumerate(X_non_druggable_partitions):
  print(f"Partition {i}: {len(partition)}")

Splitting into 8 partitions
Sizes of partitions
Partition 0: 2173
Partition 1: 2172
Partition 2: 2172
Partition 3: 2172
Partition 4: 2172
Partition 5: 2172
Partition 6: 2172
Partition 7: 2172


  return bound(*args, **kwds)


In [None]:
import xgboost as xgb

# One classifier per negative partition: every row of X_druggable (label 1)
# against the rows of that partition (label 0).
xgb_models = []
positive_rows = np.array(X_druggable)
for partition in X_non_druggable_partitions:
  negative_rows = np.array(partition)
  X_combined = np.concatenate((positive_rows, negative_rows))
  y_combined = np.concatenate((np.ones(len(positive_rows)), np.zeros(len(negative_rows))))
  xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
  xgb_model.fit(X_combined, y_combined)
  xgb_models.append(xgb_model)


### Druggability Index for Non-Approved (Investigational) Druggable Proteins

In [None]:
# Rows with new_column == 1: flagged druggable but not approved.
non_approved_druggable = new_data.loc[new_data["new_column"].eq(1)]
non_approved_druggable.shape

(677, 186)

In [None]:
# Features and labels of the non-approved druggable proteins as NumPy arrays.
# y_test is the is_approved_druggable column of rows selected with new_column == 1.
X_test, y_test = get_data(non_approved_druggable[features_list], non_approved_druggable["is_approved_druggable"])
X_test.shape, y_test.shape

((677, 183), (677,))

In [None]:
# Row labels of the non-approved druggable table, used later as the "Protein" column.
protein_names = non_approved_druggable.index
len(protein_names)

677

In [None]:
# Score X_test with every partition model: one row per model, one column per protein.
# `predictions` holds the class labels, `probabilities` the class-1 probability.
predictions = np.array([model.predict(X_test) for model in xgb_models])
probabilities = np.array([model.predict_proba(X_test)[:, 1] for model in xgb_models])

predictions.shape, probabilities.shape

((8, 677), (8, 677))

In [None]:
# Vote across models: average the per-model labels and round.
# np.round sends an exact 0.5 to the nearest even value, so a tied vote becomes 0.
# This overwrites `predictions` (per-model matrix -> per-protein vector), so run it
# once per run of the previous cell.
predictions = np.round(np.mean(predictions, axis=0))
predictions.shape

(677,)

In [None]:
# Per-protein mean of the class-1 probability over all models.
mean_probabilities = probabilities.mean(axis=0)
mean_probabilities.shape

(677,)

In [None]:
# Assemble the result table: one probability column per model, then the mean
# probability and the voted label, indexed by protein.
data = {"Protein": protein_names}
data.update(
    (f"Probability_Partition_{i+1}", probs) for i, probs in enumerate(probabilities)
)
data["Mean_Probability"] = mean_probabilities
data["Majority_Prediction"] = predictions

df = pd.DataFrame(data).set_index("Protein")
df.head()

Unnamed: 0_level_0,Probability_Partition_1,Probability_Partition_2,Probability_Partition_3,Probability_Partition_4,Probability_Partition_5,Probability_Partition_6,Probability_Partition_7,Probability_Partition_8,Mean_Probability,Majority_Prediction
Protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A8MPY1,0.997942,0.987718,0.992988,0.995396,0.969573,0.884285,0.512737,0.989705,0.916293,1.0
O00182,0.071415,0.50756,0.047426,0.136873,0.598508,0.028446,0.393107,0.219679,0.250377,0.0
O00187,0.510667,0.894427,0.995067,0.971319,0.975857,0.98351,0.999856,0.999987,0.916336,1.0
O00303,0.367701,0.293249,0.762984,0.620618,0.210906,0.99796,0.999951,0.999252,0.656578,1.0
O00459,0.4275,0.111523,0.923128,0.334195,0.360499,0.897659,0.999425,0.999839,0.631721,0.0


In [None]:
# Save the table built in the previous cell (current contents of `df`).
df.to_csv("/content/drive/MyDrive/protein_props/NEW_WORK/XGB_DI_investigational.csv")

### Druggability Index for the Non-Druggable Training Set

In [None]:
# Score each negative partition with every model EXCEPT the one trained on it.
# NOTE(review): row k of `probabilities` is the k-th model other than model i, so
# the same row position refers to different models for different partitions.
protein_names_all, prediction_chunks, probability_chunks = [], [], []
for i, partition in enumerate(X_non_druggable_partitions):
  protein_names = partition.index
  partition_values = np.array(partition)
  other_models = [model for j, model in enumerate(xgb_models) if j != i]
  predictions = np.array([model.predict(partition_values) for model in other_models])
  probabilities = np.array([model.predict_proba(partition_values)[:, 1] for model in other_models])
  # Vote: mean of the per-model labels, rounded.
  predictions = np.round(np.mean(predictions, axis=0))
  print(predictions.shape, probabilities.shape)

  protein_names_all.extend(protein_names)
  prediction_chunks.append(predictions)
  probability_chunks.append(probabilities)

# Stitch the partitions back together: proteins run along the last axis.
predictions_all = np.concatenate(prediction_chunks)
probabilities_all = np.concatenate(probability_chunks, axis=1)

(2173,) (7, 2173)
(2172,) (7, 2172)
(2172,) (7, 2172)
(2172,) (7, 2172)
(2172,) (7, 2172)
(2172,) (7, 2172)
(2172,) (7, 2172)
(2172,) (7, 2172)


In [None]:
# Copy the accumulated results into fresh arrays and average the probabilities per protein.
predictions_all, probabilities_all = np.array(predictions_all), np.array(probabilities_all)
mean_probabilities_all = probabilities_all.mean(axis=0)
len(protein_names_all), predictions_all.shape, probabilities_all.shape, mean_probabilities_all.shape

(17377, (17377,), (7, 17377), (17377,))

In [None]:
# Assemble the result table for the non-druggable proteins: one column per row of
# `probabilities_all`, then the mean probability and the voted label.
data = {"Protein": protein_names_all}
data.update(
    (f"Probability_Partition_{i+1}", probs) for i, probs in enumerate(probabilities_all)
)
data["Mean_Probability"] = mean_probabilities_all
data["Majority_Prediction"] = predictions_all

df = pd.DataFrame(data).set_index("Protein")
df.head()

Unnamed: 0_level_0,Probability_Partition_1,Probability_Partition_2,Probability_Partition_3,Probability_Partition_4,Probability_Partition_5,Probability_Partition_6,Probability_Partition_7,Mean_Probability,Majority_Prediction
Protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A0A087X1C5,0.791163,0.95444,0.722492,0.816621,0.810045,0.952112,0.999797,0.86381,1.0
A0A0B4J2F0,0.590943,0.677374,0.956024,0.685269,0.174851,0.748101,0.93729,0.681408,1.0
A0A0B4J2F2,0.991516,0.913039,0.982178,0.980435,0.944048,0.937387,0.96961,0.959745,1.0
A0A0C5B5G6,0.538607,0.764433,0.880157,0.258897,0.165899,0.216751,0.680989,0.500819,1.0
A0A0K2S4Q6,0.643794,0.357663,0.585669,0.592733,0.013564,0.012757,0.014309,0.317212,0.0


In [None]:
# Fraction of these proteins whose voted label is 0.
1 - np.mean(predictions_all)

0.6784830523105254

In [None]:
# Save the table built above (current contents of `df`).
df.to_csv("/content/drive/MyDrive/protein_props/NEW_WORK/XGB_DI_non_druggable.csv")

### Druggability Index for the Approved Druggable Training Set

In [None]:
# Score the X_druggable rows with every model.
# NOTE(review): every model was fitted on these same rows, so these scores are
# computed on training data.
protein_names = X_druggable.index
druggable_values = np.array(X_druggable)
predictions = np.array([model.predict(druggable_values) for model in xgb_models])
probabilities = np.array([model.predict_proba(druggable_values)[:, 1] for model in xgb_models])
# Vote: mean of the per-model labels, rounded.
predictions = np.round(np.mean(predictions, axis=0))
print(predictions.shape, probabilities.shape)
mean_probabilities = np.mean(probabilities, axis=0)
mean_probabilities.shape

(2219,) (8, 2219)


(2219,)

In [None]:
# Assemble the result table for the druggable proteins: one probability column per
# model, then the mean probability and the voted label.
data = {"Protein": protein_names}
data.update(
    (f"Probability_Partition_{i+1}", probs) for i, probs in enumerate(probabilities)
)
data["Mean_Probability"] = mean_probabilities
data["Majority_Prediction"] = predictions

df = pd.DataFrame(data).set_index("Protein")
df.head()

Unnamed: 0_level_0,Probability_Partition_1,Probability_Partition_2,Probability_Partition_3,Probability_Partition_4,Probability_Partition_5,Probability_Partition_6,Probability_Partition_7,Probability_Partition_8,Mean_Probability,Majority_Prediction
Protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A5X5Y0,0.979762,0.996044,0.97038,0.984403,0.968371,0.986189,0.996193,0.999945,0.985161,1.0
O00141,0.990602,0.992262,0.99951,0.999903,0.998989,0.999877,0.999997,0.999999,0.997642,1.0
O00142,0.974878,0.935909,0.954329,0.92668,0.937883,0.899439,0.936555,0.988121,0.944224,1.0
O00180,0.95826,0.971161,0.977702,0.942896,0.976066,0.997179,0.999386,0.999992,0.97783,1.0
O00204,0.92076,0.870546,0.939341,0.943727,0.93025,0.908852,0.998772,0.999966,0.939027,1.0


In [None]:
# Save the table built above (current contents of `df`).
df.to_csv("/content/drive/MyDrive/protein_props/NEW_WORK/XGB_DI_druggable.csv")