In [30]:
# Attach Google Drive to the Colab runtime so files under the mount point can be read and written.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Imports used by the cells below, then fixed seeds for NumPy and TensorFlow.
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [32]:
# Load the per-protein property table and flag every protein as druggable / approved druggable.
data_file_path = "/content/drive/MyDrive/protein_props/features/protein_props.json"
druggable_proteins_file_path = "/content/drive/MyDrive/protein_props/druggable_proteins.txt"
approved_druggable_proteins_file_path = "/content/drive/MyDrive/protein_props/approved_druggable_proteins.txt"

with open(data_file_path, 'r') as f:
    protein_data = json.load(f)

print("Total number of uniprot human verified proteins:", len(protein_data))

# Each text file holds one identifier per line.
with open(druggable_proteins_file_path, 'r') as f:
    druggable_proteins = f.read().splitlines()

with open(approved_druggable_proteins_file_path, 'r') as f:
    approved_druggable_proteins = f.read().splitlines()

print("Number of druggable proteins:", len(druggable_proteins))
print("Number of approved druggable proteins:", len(approved_druggable_proteins))

# Reuse data_file_path instead of repeating the literal path; the transpose turns the
# JSON's top-level keys into the row index that the labels below are matched against.
properties = pd.read_json(data_file_path).transpose()

# Membership is tested once per index label, so look up in sets (constant time)
# rather than rescanning the full identifier lists for every row.
druggable_lookup = set(druggable_proteins)
approved_druggable_lookup = set(approved_druggable_proteins)
is_druggable = [1 if i in druggable_lookup else 0 for i in properties.index]
is_approved_druggable = [1 if i in approved_druggable_lookup else 0 for i in properties.index]

properties["is_druggable"] = is_druggable
properties["is_approved_druggable"] = is_approved_druggable

# Flatten the nested PCP columns (dict- and list-valued cells) into one scalar column per component.
PCP_properties = properties.copy()
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# For each amino-acid letter, collect that letter's value from every row's 'Amino Acid Percent' mapping.
amino_acid_percent = {
    aa: [percent_by_letter[aa] for percent_by_letter in PCP_properties['Amino Acid Percent']]
    for aa in amino_acids
}
# Append all letter columns with a single concat instead of re-copying the whole frame once per letter.
amino_acid_columns = pd.DataFrame(amino_acid_percent, index = PCP_properties.index).add_prefix("Amino Acid Percent ")
PCP_properties = pd.concat([PCP_properties, amino_acid_columns], axis = 1)

# 'Molar Extinction Coefficient' cells are indexable pairs; give each position its own column.
for position, label in enumerate(("1", "2")):
  PCP_properties[f"Molar Extinction Coefficient {label}"] = pd.Series(
      [values[position] for values in PCP_properties['Molar Extinction Coefficient']],
      index = PCP_properties.index)

# 'Secondary Structure' cells are indexable triples; positions 0/1/2 become helix/turn/sheet.
for position, label in enumerate(("helix", "turn", "sheet")):
  PCP_properties[f"Secondary Structure {label}"] = pd.Series(
      [values[position] for values in PCP_properties['Secondary Structure']],
      index = PCP_properties.index)

# Remove the nested / raw columns now that their components have been expanded (or are not used as features).
PCP_properties = PCP_properties.drop(columns = ['Amino Acid Count','Amino Acid Percent',"Molar Extinction Coefficient","Flexibility","Secondary Structure",'Sequence'])

# Give the remaining scalar columns explicit numeric dtypes.
PCP_properties['Sequence Length'] = PCP_properties['Sequence Length'].astype(int)
float_columns = ['Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7']
PCP_properties[float_columns] = PCP_properties[float_columns].astype(float)

# GDPC encodings: load the JSON mapping and transpose it into a frame.
with open("/content/drive/MyDrive/protein_props/features/gdpc_encodings.json", 'r') as gdpc_file:
    data = json.load(gdpc_file)
gpdc_encodings = pd.DataFrame(data).T

# PPI features: the transposed JSON table joined column-wise with the network-property CSV.
ppi_network = pd.read_csv("/content/drive/MyDrive/protein_props/features/ppi_network_properties.csv").set_index('Unnamed: 0')
ppi = pd.concat(
    [pd.read_json("/content/drive/MyDrive/protein_props/features/ppi.json").T, ppi_network],
    axis = 1,
)

# PTM features: PTM counts joined column-wise with the glycosylation table.
# In both CSVs the 'Unnamed: 0' column is moved into the index.
glycolisation = pd.read_csv("/content/drive/MyDrive/protein_props/features/glycosylation.csv").set_index('Unnamed: 0')
ptm = pd.read_csv("/content/drive/MyDrive/protein_props/features/PTM_counts.csv").set_index("Unnamed: 0")
ptm_counts = pd.concat([ptm, glycolisation], axis = 1)

# One-hot encode the "general" subcellular-location groups for every protein in PCP_properties.
with open("/content/drive/MyDrive/protein_props/features/subcellular_locations2.json", 'r') as file:
    data = json.load(file)

# Every distinct "group" value found under any protein's "general" entries.
unique_groups = set()
for entry in data.values():
    for general_entry in entry.get("general", []):
        if "group" in general_entry:
            unique_groups.add(general_entry["group"])

# Sort so the one-hot column order is identical on every run: iterating a set of strings
# directly can change order between interpreter sessions (string hashing is randomized),
# which would silently reorder the feature columns built from this table.
# Assumes the group labels are mutually comparable (e.g. all strings) — TODO confirm.
unique_groups_list = sorted(unique_groups)

rows = []
for protein_id in PCP_properties.index:
    # Start with every group switched off, then switch on the groups listed for this protein.
    row = dict.fromkeys(unique_groups_list, 0)
    for entry in data.get(protein_id, {}).get("general", []):
        # Any group seen here was collected above, so no extra membership check is needed.
        if "group" in entry:
            row[entry["group"]] = 1
    row["protein_id"] = protein_id
    rows.append(row)

subcellular_data = pd.DataFrame(rows).set_index("protein_id")

# Domain and flexibility feature tables; each CSV's 'Unnamed: 0' column is moved into the index.
domains = pd.read_csv("/content/drive/MyDrive/protein_props/features/data_top20_updated.csv").set_index('Unnamed: 0')

flexibility = pd.read_csv("/content/drive/MyDrive/protein_props/features/flexibility_properties.csv").set_index('Unnamed: 0')

# Latent values: transpose the CSV, then rename each column label i to "Latent_Value_{i+1}".
latent_data = pd.read_csv("/content/drive/MyDrive/protein_props/features/latent_values.csv").T
latent_data = latent_data.rename(columns = lambda i: f"Latent_Value_{i+1}")

# Join every feature table column-wise on the index and keep only rows with no missing value.
feature_tables = [PCP_properties, gpdc_encodings, ptm_counts, ppi, subcellular_data, domains, flexibility, latent_data]
final_data = pd.concat(feature_tables, axis = 1).dropna()

# Every column except the two label columns is a model feature.
features_list = final_data.columns.drop(['is_druggable','is_approved_druggable']).tolist()
print(features_list)
print(len(features_list))


Total number of uniprot human verified proteins: 20434
Number of druggable proteins: 3345
Number of approved druggable proteins: 2652
['Sequence Length', 'Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7', 'Amino Acid Percent A', 'Amino Acid Percent C', 'Amino Acid Percent D', 'Amino Acid Percent E', 'Amino Acid Percent F', 'Amino Acid Percent G', 'Amino Acid Percent H', 'Amino Acid Percent I', 'Amino Acid Percent K', 'Amino Acid Percent L', 'Amino Acid Percent M', 'Amino Acid Percent N', 'Amino Acid Percent P', 'Amino Acid Percent Q', 'Amino Acid Percent R', 'Amino Acid Percent S', 'Amino Acid Percent T', 'Amino Acid Percent V', 'Amino Acid Percent W', 'Amino Acid Percent Y', 'Molar Extinction Coefficient 1', 'Molar Extinction Coefficient 2', 'Secondary Structure helix', 'Secondary Structure turn', 'Secondary Structure sheet', 'aliphatic_aliphatic', 'aliphatic_positive', 'aliphatic_negative', 'aliphatic_uncharged', 'aliphatic_aromatic',

In [33]:
#for splitting of data
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import ADASYN, SMOTE

def get_data(x_sample, y_sample):
  """Return ``(x_sample, y_sample)`` with each argument converted through ``np.array``."""
  features, labels = (np.array(sample) for sample in (x_sample, y_sample))
  return features, labels

def data_splitting(x_sample, y_sample, mode="default", scaler="none", class_size=600):
  """Split samples into a class-balanced test set and a training set of everything else.

  For each class (label 1 and label 0) ``class_size`` rows are held out for testing;
  the remaining rows of both classes form the training set, which is shuffled.
  Optional resampling is applied to the training set only, and an optional scaler is
  fitted on the training set and then applied to both sets.

  Args:
    x_sample: Feature table. It is indexed with boolean masks and combined with
      ``pd.concat``, so a pandas DataFrame is expected.
    y_sample: Labels aligned with ``x_sample`` (pandas Series of 1s and 0s expected).
    mode: "default" (no resampling), "adasyn" or "smote". ``None`` means "default".
    scaler: "none" (no scaling), "std" (StandardScaler) or "minmax" (MinMaxScaler).
      ``None`` means "none".
    class_size: Passed as ``test_size`` to ``train_test_split`` for each class.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)``. When a scaler is used, the two
    feature sets are whatever the scaler's transform returns.

  Raises:
    ValueError: If ``mode`` or ``scaler`` is not one of the supported options.
  """
  # Reject unknown options up front: a misspelt option used to fall through every
  # branch and silently skip resampling / scaling.
  if mode is None:
    mode = "default"
  if scaler is None:
    scaler = "none"
  if mode not in ("default", "adasyn", "smote"):
    raise ValueError(f"Unknown mode {mode!r}; expected 'default', 'adasyn' or 'smote'.")
  if scaler not in ("none", "std", "minmax"):
    raise ValueError(f"Unknown scaler {scaler!r}; expected 'none', 'std' or 'minmax'.")

  druggable_indices = (y_sample == 1)  # Assuming 1 represents druggable
  non_druggable_indices = (y_sample == 0)  # Assuming 0 represents non-druggable

  druggable_X = x_sample[druggable_indices]
  druggable_y = y_sample[druggable_indices]

  non_druggable_X = x_sample[non_druggable_indices]
  non_druggable_y = y_sample[non_druggable_indices]

  # Split each class separately so the test set gets exactly class_size rows per class.
  druggable_X_remaining, druggable_X_test, druggable_y_remaining, druggable_y_test = train_test_split(druggable_X, druggable_y, test_size=class_size, random_state=123)
  non_druggable_X_remaining, non_druggable_X_test, non_druggable_y_remaining, non_druggable_y_test = train_test_split(non_druggable_X, non_druggable_y, test_size=class_size, random_state=123)

  X_test = pd.concat((druggable_X_test, non_druggable_X_test))
  y_test = pd.concat((druggable_y_test, non_druggable_y_test))
  X_train = pd.concat((druggable_X_remaining, non_druggable_X_remaining))
  y_train = pd.concat((druggable_y_remaining, non_druggable_y_remaining))
  X_train, y_train = shuffle(X_train, y_train, random_state=123)

  # Oversampling touches the training rows only; the test set is left as drawn.
  if mode == "adasyn":
    X_train, y_train = ADASYN(random_state=42).fit_resample(X_train, y_train)
  elif mode == "smote":
    X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

  # Fit the scaler on the training rows and reuse it for the test rows. A separate
  # name keeps the ``scaler`` option string from being overwritten by the object.
  if scaler != "none":
    feature_scaler = StandardScaler() if scaler == "std" else MinMaxScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

  return X_train, X_test, y_train, y_test


In [47]:
# Hold out 600 rows per class for testing; no resampling and no scaling.
X_train, X_test, y_train, y_test = data_splitting(
    x_sample=final_data[features_list],
    y_sample=final_data['is_druggable'],
    mode="default",
    scaler="none",
    class_size=600,
)
tuple(split.shape for split in (X_train, X_test, y_train, y_test))


((19073, 183), (1200, 183), (19073,), (1200,))

In [48]:
import numpy as np
# Count of each label value in the train and test labels (position k holds the count of label k).
tuple(np.bincount(labels) for labels in (y_train, y_test))

(array([16349,  2724]), array([600, 600]))

### Feature Selection Scores Using the Partition Method

In [50]:
# Convert all four splits to plain NumPy arrays.
X_train, y_train = get_data(X_train, y_train)
X_test, y_test = get_data(X_test, y_test)

In [51]:
# On entire data
X_combined = np.concatenate((X_train, X_test))
y_combined = np.concatenate((y_train, y_test))

# shuffling
X_combined, y_combined = shuffle(X_combined, y_combined, random_state=123)
X_combined.shape, y_combined.shape

((20273, 183), (20273,))

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
# Count of each label value in the combined labels (position k holds the count of label k).
np.bincount(y_combined)

array([16949,  3324])

In [54]:
# Separate the two classes, then cut the label-0 rows into 5 chunks of near-equal size.
druggable_mask = y_combined == 1
non_druggable_mask = y_combined == 0
X_combined_druggable = X_combined[druggable_mask]
X_combined_non_druggable = X_combined[non_druggable_mask]

X_combined_non_druggable_partitions = np.array_split(X_combined_non_druggable, 5)

print("Number of partitions:", len(X_combined_non_druggable_partitions))
for chunk in X_combined_non_druggable_partitions:
  print(chunk.shape)

Number of partitions: 5
(3390, 183)
(3390, 183)
(3390, 183)
(3390, 183)
(3389, 183)


In [55]:
# Train one random forest per label-0 partition, each time paired with all label-1 rows,
# and record its accuracy on the rows it was trained on (overall and per class).
rf_models = []
training_metrics = {}
n_druggable = X_combined_druggable.shape[0]
for i, partition in enumerate(X_combined_non_druggable_partitions):
  # Label-1 rows first, then this partition's label-0 rows; labels are built to match.
  X_train_new = np.concatenate((X_combined_druggable, partition))
  y_train_new = np.concatenate((np.ones(n_druggable), np.zeros(partition.shape[0])))
  X_train_new, y_train_new = shuffle(X_train_new, y_train_new, random_state=123)

  rf_model = RandomForestClassifier(random_state=27)
  rf_model.fit(X_train_new, y_train_new)
  rf_models.append(rf_model)

  positives = y_train_new == 1
  negatives = y_train_new == 0
  training_metrics[f"partition_{i}"] = {
      "accuracy_total" : rf_model.score(X_train_new, y_train_new),
      "accuracy_druggable": rf_model.score(X_train_new[positives], y_train_new[positives]),
      "accuracy_non-druggable": rf_model.score(X_train_new[negatives], y_train_new[negatives])
  }

In [56]:
# For the model at position i, measure accuracy on every label-0 partition except partition i.
# The loops follow rf_models and the partition list instead of a hard-coded count of 5.
test_metrics = {}
for i, model in enumerate(rf_models):
  other_partitions = [
      partition
      for j, partition in enumerate(X_combined_non_druggable_partitions)
      if j != i
  ]
  remaining_non_druggable_partitions = np.concatenate(other_partitions)
  # All of these rows carry label 0, so the expected labels are all zeros.
  test_metrics[f"partition_{i}"] = {
      "accuracy_non_druggable": model.score(remaining_non_druggable_partitions, np.zeros(remaining_non_druggable_partitions.shape[0]))
  }


In [57]:
# Display both metric dictionaries side by side.
training_metrics, test_metrics

({'partition_0': {'accuracy_total': 1.0,
   'accuracy_druggable': 1.0,
   'accuracy_non-druggable': 1.0},
  'partition_1': {'accuracy_total': 0.9997021149836163,
   'accuracy_druggable': 0.9996991576413959,
   'accuracy_non-druggable': 0.9997050147492625},
  'partition_2': {'accuracy_total': 1.0,
   'accuracy_druggable': 1.0,
   'accuracy_non-druggable': 1.0},
  'partition_3': {'accuracy_total': 1.0,
   'accuracy_druggable': 1.0,
   'accuracy_non-druggable': 1.0},
  'partition_4': {'accuracy_total': 1.0,
   'accuracy_druggable': 1.0,
   'accuracy_non-druggable': 1.0}},
 {'partition_0': {'accuracy_non_druggable': 0.7434914079209382},
  'partition_1': {'accuracy_non_druggable': 0.7432701526661258},
  'partition_2': {'accuracy_non_druggable': 0.7362637362637363},
  'partition_3': {'accuracy_non_druggable': 0.736779998524965},
  'partition_4': {'accuracy_non_druggable': 0.7342920353982301}})

In [58]:
def get_dict_form(feature_scores):
  """Map each name in the module-level ``features_list`` to the score at the same position.

  Pairing stops at the end of the shorter of the two sequences.
  """
  return dict(zip(features_list, feature_scores))

In [59]:
# Element-wise mean of the per-model importances, over however many models rf_models holds
# (previously five models were added by hand and divided by a literal 5).
mean_feature_importances = sum(model.feature_importances_ for model in rf_models) / len(rf_models)

In [60]:
# One entry per trained model ("Partition_1", "Partition_2", ...) followed by their average,
# built from rf_models rather than five copy-pasted lines.
model_fs_scores = {
    f"Partition_{number}": get_dict_form(model.feature_importances_)
    for number, model in enumerate(rf_models, start=1)
}
model_fs_scores["Partition_Average"] = get_dict_form(mean_feature_importances)

In [61]:
# Tabulate the scores (rows: features, columns: partitions) and write them to Drive.
df = pd.DataFrame.from_dict(model_fs_scores, orient="columns")
df.to_csv(path_or_buf="/content/drive/MyDrive/protein_props/feature_scores/model_rf_feature_scores.csv")

In [62]:
# Preview the first rows of the feature-score table.
df.head()

Unnamed: 0,Partition_1,Partition_2,Partition_3,Partition_4,Partition_5,Partition_Average
Sequence Length,0.010309,0.009615,0.010339,0.008944,0.010734,0.009988
Molecular Weight,0.010871,0.010221,0.011254,0.011452,0.010724,0.010904
GRAVY,0.014522,0.015677,0.015875,0.014839,0.014297,0.015042
Isoelectric Point,0.008066,0.008584,0.007153,0.007637,0.008204,0.007929
Instability Index,0.023471,0.019416,0.019763,0.021853,0.021915,0.021283


### Experimenting with an Increasing Number of Top Features, Ranked by the Partition-Average Feature Selection Scores

In [63]:
from sklearn.ensemble import RandomForestClassifier

def evaluate_model(features, class_size=2700):
  """Train a random forest on a class-balanced sample and score it on the remaining rows.

  ``data_splitting`` returns ``(X_train, X_test, y_train, y_test)``, where the *test*
  part holds ``class_size`` rows per class. The roles are swapped here: that balanced
  part is used for training and the larger remainder for testing.
  NOTE(review): confirm this swap is intended rather than an unpacking slip.

  Args:
    features: Column names of ``final_data`` to use as model inputs.
    class_size: Rows per class in the balanced training sample. Defaults to 2700,
      the value that was previously hard-coded.

  Returns:
    dict of overall and per-class (label 1 / label 0) accuracy on the training rows
    and on the test rows.
  """
  X_remainder, X_balanced, y_remainder, y_balanced = data_splitting(final_data[features], final_data["is_druggable"], mode="default", scaler="none", class_size=class_size)
  # Spell out the role swap instead of hiding it in the unpacking order.
  X_train, y_train = X_balanced, y_balanced
  X_test, y_test = X_remainder, y_remainder

  rf_model = RandomForestClassifier(random_state=27)
  rf_model.fit(X_train, y_train)

  def _class_accuracy(X, y, label):
    # Accuracy restricted to the rows whose true label equals ``label``.
    selected = y == label
    return rf_model.score(X[selected], y[selected])

  return {
      "training_accuracy": rf_model.score(X_train, y_train),
      "training_accuracy_druggable": _class_accuracy(X_train, y_train, 1),
      "training_accuracy_non_druggable": _class_accuracy(X_train, y_train, 0),
      "test_accuracy": rf_model.score(X_test, y_test),
      "test_accuracy_druggable": _class_accuracy(X_test, y_test, 1),
      "test_accuracy_non_druggable": _class_accuracy(X_test, y_test, 0)
  }


In [65]:
# Rank features by their saved Partition_Average score, then evaluate the top-1, top-2, ... subsets.
fs_scores = pd.read_csv("/content/drive/MyDrive/protein_props/feature_scores/model_rf_feature_scores.csv")

# Pair each score with the feature name stored in the CSV's first column rather than
# zipping against features_list by position: if the file was written with a different
# column order, positional pairing would attach scores to the wrong features.
# A stored name that is missing from final_data now surfaces as a KeyError in evaluate_model.
fs_scores_partition_avg = dict(zip(fs_scores.iloc[:, 0], fs_scores["Partition_Average"].values))
fs_scores_partition_avg = dict(sorted(fs_scores_partition_avg.items(), key=lambda item: item[1], reverse=True))

# Build the ranked name list once instead of on every loop iteration.
ranked_features = list(fs_scores_partition_avg)

metrics_with_fs_scores_partition_avg = {}
for i in range(1, len(ranked_features)+1):
  features = ranked_features[:i]
  metrics_with_fs_scores_partition_avg[i] = evaluate_model(features)
  print(f"{i} Done")

1 Done
2 Done
3 Done
4 Done
5 Done
6 Done
7 Done
8 Done
9 Done
10 Done
11 Done
12 Done
13 Done
14 Done
15 Done
16 Done
17 Done
18 Done
19 Done
20 Done
21 Done
22 Done
23 Done
24 Done
25 Done
26 Done
27 Done
28 Done
29 Done
30 Done
31 Done
32 Done
33 Done
34 Done
35 Done
36 Done
37 Done
38 Done
39 Done
40 Done
41 Done
42 Done
43 Done
44 Done
45 Done
46 Done
47 Done
48 Done
49 Done
50 Done
51 Done
52 Done
53 Done
54 Done
55 Done
56 Done
57 Done
58 Done
59 Done
60 Done
61 Done
62 Done
63 Done
64 Done
65 Done
66 Done
67 Done
68 Done
69 Done
70 Done
71 Done
72 Done
73 Done
74 Done
75 Done
76 Done
77 Done
78 Done
79 Done
80 Done
81 Done
82 Done
83 Done
84 Done
85 Done
86 Done
87 Done
88 Done
89 Done
90 Done
91 Done
92 Done
93 Done
94 Done
95 Done
96 Done
97 Done
98 Done
99 Done
100 Done
101 Done
102 Done
103 Done
104 Done
105 Done
106 Done
107 Done
108 Done
109 Done
110 Done
111 Done
112 Done
113 Done
114 Done
115 Done
116 Done
117 Done
118 Done
119 Done
120 Done
121 Done
122 Done
123 Done
1

In [66]:
# One row per feature count, one column per metric; write the table to Drive.
df = pd.DataFrame(metrics_with_fs_scores_partition_avg).T
df.to_csv(path_or_buf="/content/drive/MyDrive/protein_props/feature_scores/feature_improvement_metrics_rf_partition_avg.csv")

In [67]:
# Preview the first rows of the per-feature-count metrics table.
df.head()

Unnamed: 0,training_accuracy,training_accuracy_druggable,training_accuracy_non_druggable,test_accuracy,test_accuracy_druggable,test_accuracy_non_druggable
1,0.603333,0.632963,0.573704,0.585625,0.663462,0.582216
2,0.99963,0.999259,1.0,0.564311,0.589744,0.563197
3,0.999815,0.99963,1.0,0.646675,0.612179,0.648186
4,0.999815,0.99963,1.0,0.654206,0.661859,0.65387
5,0.999815,0.99963,1.0,0.656962,0.658654,0.656888
