In [2]:
# Mount Google Drive at /content/drive so that the data files referenced below
# (under /content/drive/MyDrive/protein_props/) can be opened from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Set seeds
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import json
import pandas as pd
# Seed NumPy's global RNG and TensorFlow's global RNG with the same fixed value.
# NOTE(review): the sklearn / imblearn / xgboost calls further down pass their own
# explicit random_state values rather than relying on these global seeds.
np.random.seed(42)
tf.random.set_seed(42)

In [15]:
# Fetching PCP properties of druggable and non-druggable proteins
data_file_path = "/content/drive/MyDrive/protein_props/features/protein_props.json"
druggable_proteins_file_path = "/content/drive/MyDrive/protein_props/druggable_proteins.txt"
approved_druggable_proteins_file_path = "/content/drive/MyDrive/protein_props/approved_druggable_proteins.txt"

# Raw property records, loaded here only to report how many entries the file holds.
with open(data_file_path, 'r') as json_handle:
    protein_data = json.load(json_handle)
print("Total number of uniprot human verified proteins:", len(protein_data))

# Each text file is read whole and split into one list entry per line.
protein_id_lists = []
for list_path in (druggable_proteins_file_path, approved_druggable_proteins_file_path):
    with open(list_path, 'r') as list_handle:
        protein_id_lists.append(list_handle.read().splitlines())
druggable_proteins, approved_druggable_proteins = protein_id_lists

print("Number of druggable proteins:", len(druggable_proteins))
print("Number of approved druggable proteins:", len(approved_druggable_proteins))




# Property table, transposed so that the protein IDs form the row index
# (the index values are what get matched against the ID lists below).
properties = pd.read_json(data_file_path).transpose()

# Use sets for the membership tests: checking `x in some_list` re-scans the whole list
# for every protein, whereas a set lookup is constant time.
druggable_lookup = set(druggable_proteins)
approved_druggable_lookup = set(approved_druggable_proteins)

# 1/0 labels, one per row of `properties`, in index order.
is_druggable = [1 if protein_id in druggable_lookup else 0 for protein_id in properties.index]
is_approved_druggable = [1 if protein_id in approved_druggable_lookup else 0 for protein_id in properties.index]

properties["is_druggable"] = is_druggable
properties["is_approved_druggable"] = is_approved_druggable

# Flatten the nested columns of `properties` into one scalar column per value.
PCP_properties = properties.copy()

# Amino-acid composition: one "Amino Acid Percent <letter>" column per residue letter.
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
amino_acid_percent = {
    aa: [percent_by_residue[aa] for percent_by_residue in PCP_properties['Amino Acid Percent']]
    for aa in amino_acids
}
# Build all 20 columns as a single frame and concatenate once; concatenating inside the
# loop would copy the whole table once per amino acid.
amino_acid_percent_frame = pd.DataFrame(
    {f"Amino Acid Percent {aa}": amino_acid_percent[aa] for aa in amino_acids},
    index=PCP_properties.index,
)
PCP_properties = pd.concat([PCP_properties, amino_acid_percent_frame], axis=1)

# Split the two-element molar extinction entries into separate columns.
molar_extinction = PCP_properties['Molar Extinction Coefficient']
PCP_properties["Molar Extinction Coefficient 1"] = [pair[0] for pair in molar_extinction]
PCP_properties["Molar Extinction Coefficient 2"] = [pair[1] for pair in molar_extinction]

# Split the three-element secondary structure entries (positions 0, 1, 2) into separate columns.
secondary_structure = PCP_properties['Secondary Structure']
PCP_properties["Secondary Structure helix"] = [fractions[0] for fractions in secondary_structure]
PCP_properties["Secondary Structure turn"] = [fractions[1] for fractions in secondary_structure]
PCP_properties["Secondary Structure sheet"] = [fractions[2] for fractions in secondary_structure]

# Remove the nested / non-numeric source columns now that they have been expanded.
PCP_properties = PCP_properties.drop(
    columns=['Amino Acid Count', 'Amino Acid Percent', "Molar Extinction Coefficient", "Flexibility", "Secondary Structure", 'Sequence']
)

# Cast the remaining scalar property columns to numeric dtypes.
PCP_properties['Sequence Length'] = PCP_properties['Sequence Length'].astype(int)
float_columns = ['Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7']
PCP_properties[float_columns] = PCP_properties[float_columns].astype(float)

# GDPC encodings: parse the JSON, build a frame from it and flip rows/columns.
# NOTE(review): presumably the JSON is keyed by protein ID, so the transpose puts IDs on the index — confirm.
with open("/content/drive/MyDrive/protein_props/features/gdpc_encodings.json", 'r') as gdpc_handle:
    gdpc_raw = json.load(gdpc_handle)
gpdc_encodings = pd.DataFrame(gdpc_raw).T

# Protein-protein interaction features: the transposed JSON table joined column-wise with
# the network-property CSV, whose 'Unnamed: 0' column supplies the row labels.
ppi_json = pd.read_json("/content/drive/MyDrive/protein_props/features/ppi.json").T
ppi_network = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/ppi_network_properties.csv"
).set_index('Unnamed: 0')
ppi = pd.concat([ppi_json, ppi_network], axis=1)

# Glycosylation and PTM count tables; in both, the 'Unnamed: 0' column becomes the index.
glycolisation = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/glycosylation.csv"
).set_index('Unnamed: 0')
ptm = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/PTM_counts.csv"
).set_index("Unnamed: 0")

# Side-by-side join of the two tables on their shared index.
ptm_counts = pd.concat([ptm, glycolisation], axis=1)

# Subcellular location: one 0/1 indicator column per location "group" found in the file.
with open("/content/drive/MyDrive/protein_props/features/subcellular_locations2.json", 'r') as file:
    data = json.load(file)

# Every distinct "group" value that appears under any entry's "general" list.
unique_groups = {
    general_entry["group"]
    for entry in data.values()
    for general_entry in entry.get("general", [])
    if "group" in general_entry
}

# Sort the groups so the indicator columns (and therefore the feature order) are identical
# on every run; iterating a set of strings directly follows hash order, which Python
# randomizes per process.
unique_groups_list = sorted(unique_groups, key=str)

# One row per protein in PCP_properties; proteins absent from the file keep all zeros.
rows = []
for protein_id in PCP_properties.index:
    row = dict.fromkeys(unique_groups_list, 0)
    for entry in data.get(protein_id, {}).get("general", []):
        # Any group seen here is already in unique_groups, since both come from `data`.
        if "group" in entry:
            row[entry["group"]] = 1
    row["protein_id"] = protein_id
    rows.append(row)

subcellular_data = pd.DataFrame(rows).set_index("protein_id")

# Domain features, indexed by the CSV's 'Unnamed: 0' column.
domains = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/data_top20_updated.csv"
).set_index('Unnamed: 0')

# Flexibility features, indexed the same way.
flexibility = pd.read_csv(
    "/content/drive/MyDrive/protein_props/features/flexibility_properties.csv"
).set_index('Unnamed: 0')

# Latent values: transpose the CSV and give the columns 1-based "Latent_Value_<n>" names.
latent_data = pd.read_csv("/content/drive/MyDrive/protein_props/features/latent_values.csv").T
latent_data.columns = [f"Latent_Value_{col + 1}" for col in latent_data.columns]

# Join every feature table column-wise on the index, then keep only rows with no missing value.
feature_tables = [PCP_properties, gpdc_encodings, ptm_counts, ppi, subcellular_data, domains, flexibility, latent_data]
final_data = pd.concat(feature_tables, axis=1).dropna()

# Feature names = every column except the two label columns.
features_list = list(final_data.columns.drop(['is_druggable', 'is_approved_druggable']))
print(features_list)
print(len(features_list))


Total number of uniprot human verified proteins: 20434
Number of druggable proteins: 3345
Number of approved druggable proteins: 2652
['Sequence Length', 'Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7', 'Amino Acid Percent A', 'Amino Acid Percent C', 'Amino Acid Percent D', 'Amino Acid Percent E', 'Amino Acid Percent F', 'Amino Acid Percent G', 'Amino Acid Percent H', 'Amino Acid Percent I', 'Amino Acid Percent K', 'Amino Acid Percent L', 'Amino Acid Percent M', 'Amino Acid Percent N', 'Amino Acid Percent P', 'Amino Acid Percent Q', 'Amino Acid Percent R', 'Amino Acid Percent S', 'Amino Acid Percent T', 'Amino Acid Percent V', 'Amino Acid Percent W', 'Amino Acid Percent Y', 'Molar Extinction Coefficient 1', 'Molar Extinction Coefficient 2', 'Secondary Structure helix', 'Secondary Structure turn', 'Secondary Structure sheet', 'aliphatic_aliphatic', 'aliphatic_positive', 'aliphatic_negative', 'aliphatic_uncharged', 'aliphatic_aromatic',

In [32]:
#for splitting of data
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import ADASYN, SMOTE

def data_splitting(x_sample, y_sample, mode="default", scaler="none", class_size=600):
  """Split features and labels into a training set and a class-balanced test set.

  ``class_size`` rows with label 1 and ``class_size`` rows with label 0 are held out
  as the test set. All remaining rows of both classes form the training set, which is
  shuffled and then optionally oversampled and/or scaled.

  Args:
    x_sample: pandas DataFrame of features, row-aligned with ``y_sample``.
    y_sample: pandas Series of labels; 1 is treated as druggable, 0 as non-druggable.
    mode: "default" (no resampling), "adasyn" or "smote". Resampling is applied to the
      training set only.
    scaler: "none", "std" (StandardScaler) or "minmax" (MinMaxScaler). The scaler is
      fitted on the (possibly resampled) training set and applied to both sets.
    class_size: number of rows of each class placed in the test set (default 600).

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)``. When a scaler is used, X_train and
    X_test are the scaler's outputs rather than the input DataFrames.

  Raises:
    ValueError: if ``mode`` or ``scaler`` is not one of the values listed above.
      Previously an unrecognized value was silently treated as "do nothing".
  """
  # Validate the options first so a misspelled value cannot silently skip a step.
  if mode not in ("default", "adasyn", "smote"):
    raise ValueError(f"Unknown mode {mode!r}; expected 'default', 'adasyn' or 'smote'")
  if scaler not in ("none", "std", "minmax"):
    raise ValueError(f"Unknown scaler {scaler!r}; expected 'none', 'std' or 'minmax'")

  druggable_indices = (y_sample == 1)  # Assuming 1 represents druggable
  non_druggable_indices = (y_sample == 0)  # Assuming 0 represents non-druggable

  druggable_X = x_sample[druggable_indices]
  druggable_y = y_sample[druggable_indices]

  non_druggable_X = x_sample[non_druggable_indices]
  non_druggable_y = y_sample[non_druggable_indices]

  # An integer test_size is an absolute row count, so each class contributes
  # exactly class_size rows to the test set.
  druggable_X_remaining, druggable_X_test, druggable_y_remaining, druggable_y_test = train_test_split(
      druggable_X, druggable_y, test_size=class_size, random_state=123)
  non_druggable_X_remaining, non_druggable_X_test, non_druggable_y_remaining, non_druggable_y_test = train_test_split(
      non_druggable_X, non_druggable_y, test_size=class_size, random_state=123)

  X_test = pd.concat((druggable_X_test, non_druggable_X_test))
  y_test = pd.concat((druggable_y_test, non_druggable_y_test))
  X_train = pd.concat((druggable_X_remaining, non_druggable_X_remaining))
  y_train = pd.concat((druggable_y_remaining, non_druggable_y_remaining))
  # Only the training rows are shuffled; the test set stays as druggable rows followed
  # by non-druggable rows.
  X_train, y_train = shuffle(X_train, y_train, random_state=123)

  # Optional oversampling of the training set.
  if mode == "adasyn":
    X_train, y_train = ADASYN(random_state=42).fit_resample(X_train, y_train)
  elif mode == "smote":
    X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

  # Optional scaling; kept in a separate name so the `scaler` argument is not overwritten.
  if scaler == "std":
    feature_scaler = StandardScaler()
  elif scaler == "minmax":
    feature_scaler = MinMaxScaler()
  else:
    feature_scaler = None

  if feature_scaler is not None:
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

  return X_train, X_test, y_train, y_test


In [33]:
# Split with the default options (mode="default", scaler="none"): no resampling and no
# scaling, so the training set keeps its original class imbalance.
X_train, X_test, y_train, y_test = data_splitting(final_data[features_list], final_data['is_druggable'])
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((19073, 183), (1200, 183), (19073,), (1200,))

In [53]:
import numpy as np
# Number of label-0 and label-1 rows in the training split and in the test split.
np.bincount(y_train), np.bincount(y_test)

(array([16349,  2724]), array([600, 600]))

### Experimentation to understand the sensitivity of XGBoost

In [46]:
# XGBOOST Model
# Baseline: classifier fitted on the imbalanced training split, with no hyper-parameters
# set other than the objective and the seed.
import xgboost as xgb
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=27)
xgb_model.fit(X_train, y_train)

In [47]:
y_pred = xgb_model.predict(X_test)

# Row selectors for each true class of the test split; scoring on one class only gives
# the fraction of that class the model labels correctly.
test_is_druggable = (y_test == 1)
test_is_non_druggable = (y_test == 0)

print("Accuracy_total:", xgb_model.score(X_test, y_test))
print("Accuracy_druggable:", xgb_model.score(X_test[test_is_druggable], y_test[test_is_druggable]))
print("Accuracy_non-druggable:", xgb_model.score(X_test[test_is_non_druggable], y_test[test_is_non_druggable]))

Accuracy_total: 0.6358333333333334
Accuracy_druggable: 0.305
Accuracy_non-druggable: 0.9666666666666667


In [48]:
# Same three accuracies, measured on the data the model was fitted on.
train_is_druggable = (y_train == 1)
train_is_non_druggable = (y_train == 0)

print("Training_Accuracy_total:", xgb_model.score(X_train, y_train))
print("Training_Accuracy_druggable:", xgb_model.score(X_train[train_is_druggable], y_train[train_is_druggable]))
print("Training_Accuracy_non-druggable:", xgb_model.score(X_train[train_is_non_druggable], y_train[train_is_non_druggable]))

Training_Accuracy_total: 0.9929219315262413
Training_Accuracy_druggable: 0.9515418502202643
Training_Accuracy_non-druggable: 0.9998165025383815


In [49]:
# 5-fold cross-validation of a fresh classifier on the (imbalanced) training split.
# NOTE(review): no `scoring` is passed, so the estimator's default score is used —
# presumably accuracy. With 16349 of the 19073 training labels equal to 0, always
# predicting 0 would already score about 0.857, so read these numbers against that baseline.
xgb_model2 = xgb.XGBClassifier(objective='binary:logistic', random_state=27)
# cross validate
from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb_model2, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)

Cross-validation scores: [0.87260813 0.87916121 0.87365662 0.8765076  0.87860514]


In [50]:
# Same 5-fold cross-validation, but run on the balanced 600/600 test split, for comparison
# with the imbalanced-training result. This overwrites the `scores` from that run.
xgb_model3 = xgb.XGBClassifier(objective='binary:logistic', random_state=27)
scores = cross_val_score(xgb_model3, X_test, y_test, cv=5)
print("Cross-validation scores:", scores)

Cross-validation scores: [0.7125     0.70416667 0.75416667 0.70416667 0.69583333]


In [51]:
# Role reversal experiment: fit a fresh classifier on the balanced test split
# (it is scored against the larger training split afterwards).
xgb_model4 = xgb.XGBClassifier(objective='binary:logistic', random_state=27)
xgb_model4.fit(X_test, y_test)


In [52]:
# Roles are reversed in this experiment: xgb_model4 was fitted on (X_test, y_test), so the
# "Training_*" lines score it on that balanced set and the "Test_*" lines score it on
# (X_train, y_train), which it has not seen.
print("Training_accuracy_total", xgb_model4.score(X_test, y_test))
print("Training_accuracy_druggable", xgb_model4.score(X_test[y_test == 1], y_test[y_test == 1]))
print("Training_accuracy_non-druggable", xgb_model4.score(X_test[y_test == 0], y_test[y_test == 0]))

print("Test_accuracy_total", xgb_model4.score(X_train, y_train))
print("Test_accuracy_druggable", xgb_model4.score(X_train[y_train == 1], y_train[y_train == 1]))
print("Test_accuracy_non-druggable", xgb_model4.score(X_train[y_train == 0], y_train[y_train == 0]))

Training_accuracy_total 1.0
Trainign_accuracy_druggable 1.0
Training_accuracy_non-druggable 1.0
Test_accuracy_total 0.7155665076285849
Test_accuracy_druggable 0.723568281938326
Test_accuracy_non-druggable 0.7142332864395375


### Feature Selection Scores

In [59]:
# Convert all four splits to NumPy arrays for the array-based indexing used below.
X_train, X_test, y_train, y_test = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

In [74]:
# On entire data: stack the training and test splits back into one feature matrix and
# one label vector (rows of X_train first, then rows of X_test).
X_combined = np.concatenate([X_train, X_test], axis=0)
y_combined = np.concatenate([y_train, y_test], axis=0)

# shuffling (features and labels are permuted together)
X_combined, y_combined = shuffle(X_combined, y_combined, random_state=123)
X_combined.shape, y_combined.shape

((20273, 183), (20273,))

In [75]:
# Model fitted on the whole (imbalanced) data set; the accuracies printed below are
# measured on that same data, i.e. they are training accuracies.
xgb_model_fs1 = xgb.XGBClassifier(objective='binary:logistic', random_state=27)
xgb_model_fs1.fit(X_combined, y_combined)

combined_is_druggable = (y_combined == 1)
combined_is_non_druggable = (y_combined == 0)

print("Accuracy_Total:", xgb_model_fs1.score(X_combined, y_combined))
print("Accuracy_Druggable:", xgb_model_fs1.score(X_combined[combined_is_druggable], y_combined[combined_is_druggable]))
print("Accuracy_Non_Druggable", xgb_model_fs1.score(X_combined[combined_is_non_druggable], y_combined[combined_is_non_druggable]))

Accuracy_Total: 0.9866817935184728
Accuracy_Druggable: 0.924187725631769
Accuracy_Non_Druggable 0.998937990441914


In [76]:
# Per-feature importance scores of the model fitted on the entire data set.
fs1_scores = xgb_model_fs1.feature_importances_

In [68]:
### Now we know XGBoost is sensitive to skewness

In [77]:
# Number of label-0 and label-1 rows in the combined data set.
np.bincount(y_combined)

array([16949,  3324])

In [78]:
# Separate the feature rows by label.
X_combined_druggable = X_combined[y_combined == 1]
X_combined_non_druggable = X_combined[y_combined == 0]

# Cut the non-druggable rows into 5 consecutive chunks of near-equal size, so that each
# chunk is roughly as large as the druggable class.
X_combined_non_druggable_partitions = np.array_split(X_combined_non_druggable, 5)

print("Number of partitions:", len(X_combined_non_druggable_partitions))
print(*(chunk.shape for chunk in X_combined_non_druggable_partitions), sep="\n")

Number of partitions: 5
(3390, 183)
(3390, 183)
(3390, 183)
(3390, 183)
(3389, 183)


In [80]:
# Train one classifier per non-druggable partition: every model sees all druggable rows
# plus a single partition of non-druggable rows.
xgb_models = []
training_metrics = {}
n_druggable_rows = X_combined_druggable.shape[0]
for i, partition in enumerate(X_combined_non_druggable_partitions):
  # Druggable rows first (label 1), then this partition's rows (label 0), then shuffle.
  X_train_new = np.concatenate((X_combined_druggable, partition))
  y_train_new = np.concatenate((np.ones(n_druggable_rows), np.zeros(partition.shape[0])))
  X_train_new, y_train_new = shuffle(X_train_new, y_train_new, random_state=123)

  xgb_model_fs2 = xgb.XGBClassifier(objective='binary:logistic', random_state=27)
  xgb_model_fs2.fit(X_train_new, y_train_new)
  xgb_models.append(xgb_model_fs2)

  # Accuracy on the data the model was just fitted on, overall and per class.
  fitted_is_druggable = (y_train_new == 1)
  fitted_is_non_druggable = (y_train_new == 0)
  training_metrics[f"partition_{i}"] = {
      "accuracy_total" : xgb_model_fs2.score(X_train_new, y_train_new),
      "accuracy_druggable": xgb_model_fs2.score(X_train_new[fitted_is_druggable], y_train_new[fitted_is_druggable]),
      "accuracy_non-druggable": xgb_model_fs2.score(X_train_new[fitted_is_non_druggable], y_train_new[fitted_is_non_druggable])
  }

In [81]:
# For each partition model, measure accuracy on the non-druggable rows it was NOT fitted
# on, i.e. every other partition. All of those rows have true label 0. The loop bounds
# follow the actual models and partitions instead of repeating the literal 5.
test_metrics = {}
for i, model in enumerate(xgb_models):
  remaining_non_druggable_partitions = np.concatenate(
      [other for j, other in enumerate(X_combined_non_druggable_partitions) if j != i]
  )
  test_metrics[f"partition_{i}"] = {
      "accuracy_non_druggable": model.score(remaining_non_druggable_partitions, np.zeros(remaining_non_druggable_partitions.shape[0]))
  }


In [84]:
# Show the per-partition accuracies: on the fitted data (first dict) and on the
# held-out non-druggable rows (second dict).
training_metrics, test_metrics

({'partition_0': {'accuracy_total': 1.0,
   'accuracy_druggable': 1.0,
   'accuracy_non-druggable': 1.0},
  'partition_1': {'accuracy_total': 0.9995531724754245,
   'accuracy_druggable': 0.9996991576413959,
   'accuracy_non-druggable': 0.9994100294985251},
  'partition_2': {'accuracy_total': 0.9997021149836163,
   'accuracy_druggable': 0.9996991576413959,
   'accuracy_non-druggable': 0.9997050147492625},
  'partition_3': {'accuracy_total': 1.0,
   'accuracy_druggable': 1.0,
   'accuracy_non-druggable': 1.0},
  'partition_4': {'accuracy_total': 1.0,
   'accuracy_druggable': 1.0,
   'accuracy_non-druggable': 1.0}},
 {'partition_0': {'accuracy_non_druggable': 0.7558079504388229},
  'partition_1': {'accuracy_non_druggable': 0.7569879784644885},
  'partition_2': {'accuracy_non_druggable': 0.7559554539420311},
  'partition_3': {'accuracy_non_druggable': 0.7523416181134301},
  'partition_4': {'accuracy_non_druggable': 0.7589970501474926}})

In [86]:
def get_dict_form(feature_scores, feature_names=None):
  """Pair each feature name with its score and return the pairs as a dict.

  Args:
    feature_scores: iterable of scores, one per feature, in feature order.
    feature_names: optional iterable of names to pair with the scores. When omitted,
      the notebook-level ``features_list`` is used, as before.

  Returns:
    Dict mapping feature name -> score.

  Raises:
    ValueError: if the number of names differs from the number of scores. ``zip`` would
      otherwise silently drop the surplus entries.
  """
  names = list(features_list if feature_names is None else feature_names)
  scores = list(feature_scores)
  if len(names) != len(scores):
    raise ValueError(f"Got {len(scores)} scores for {len(names)} feature names")
  return dict(zip(names, scores))

In [87]:
# Feature-importance dicts: the full-data model first, then the five partition models
# under the 1-based keys "Partition_1" .. "Partition_5".
model_fs_scores = {"All_Data": get_dict_form(xgb_model_fs1.feature_importances_)}
model_fs_scores.update(
    (f"Partition_{k + 1}", get_dict_form(xgb_models[k].feature_importances_))
    for k in range(5)
)

In [90]:
# Tabulate the scores (one column per model, one row per feature) and write them to
# fs.csv in the current working directory.
df = pd.DataFrame(model_fs_scores)
df.to_csv("fs.csv")