In [None]:
# Mount Google Drive at /content/drive (Colab only). Later cells read their
# input files from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Set seeds
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import json
import pandas as pd
# Seed the NumPy and TensorFlow global random generators with one shared value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Load the PCP (physicochemical property) table and label each protein as
# druggable / approved-druggable from the two identifier lists.
data_file_path = "/content/drive/MyDrive/protein_props.json"
druggable_proteins_file_path = "/content/drive/MyDrive/druggable_proteins.txt"
approved_druggable_proteins_file_path = "/content/drive/MyDrive/approved_druggable_proteins.txt"

with open(data_file_path, 'r') as f:
    protein_data = json.load(f)

print("Total number of uniprot human verified proteins:", len(protein_data))

# One protein identifier per line in each list file.
with open(druggable_proteins_file_path, 'r') as f:
    druggable_proteins = f.read().splitlines()

with open(approved_druggable_proteins_file_path, 'r') as f:
    approved_druggable_proteins = f.read().splitlines()

print("Number of druggable proteins:", len(druggable_proteins))
print("Number of approved druggable proteins:", len(approved_druggable_proteins))

# Read the same file through data_file_path instead of repeating the literal
# path; transposed so that each row is one protein.
properties = pd.read_json(data_file_path).transpose()

# Membership is tested once per protein, so use sets (O(1) lookups) rather
# than scanning the full lists for every row.
druggable_ids = set(druggable_proteins)
approved_druggable_ids = set(approved_druggable_proteins)
is_druggable = [1 if protein_id in druggable_ids else 0 for protein_id in properties.index]
is_approved_druggable = [1 if protein_id in approved_druggable_ids else 0 for protein_id in properties.index]

properties["is_druggable"] = is_druggable
properties["is_approved_druggable"] = is_approved_druggable

# Flatten the nested PCP columns (dict / sequence valued) into one scalar
# column per component. Column order matters: later cells select features by
# position, so new columns are appended in the same order as before.
PCP_properties = properties.copy()
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

# One list of percentages per amino-acid letter, in row order.
amino_acid_percent = {
    aa: [percent_by_aa[aa] for percent_by_aa in PCP_properties['Amino Acid Percent']]
    for aa in amino_acids
}
# Build all 20 columns at once and concatenate a single time; concatenating
# inside the loop copied the whole frame on every iteration.
amino_acid_columns = pd.DataFrame(
    amino_acid_percent, index=PCP_properties.index
).add_prefix("Amino Acid Percent ")
PCP_properties = pd.concat([PCP_properties, amino_acid_columns], axis=1)

# 'Molar Extinction Coefficient' holds two values per protein.
for position, suffix in enumerate(("1", "2")):
    PCP_properties[f"Molar Extinction Coefficient {suffix}"] = pd.Series(
        [values[position] for values in PCP_properties['Molar Extinction Coefficient']],
        index=PCP_properties.index,
    )

# 'Secondary Structure' holds three values per protein, labelled here in the
# order helix, turn, sheet.
for position, suffix in enumerate(("helix", "turn", "sheet")):
    PCP_properties[f"Secondary Structure {suffix}"] = pd.Series(
        [values[position] for values in PCP_properties['Secondary Structure']],
        index=PCP_properties.index,
    )

# Drop the nested / non-numeric source columns now that they are expanded.
PCP_properties = PCP_properties.drop(
    columns=['Amino Acid Count', 'Amino Acid Percent', "Molar Extinction Coefficient",
             "Flexibility", "Secondary Structure", 'Sequence']
)

# Cast the remaining scalar property columns to numeric dtypes.
PCP_properties['Sequence Length'] = PCP_properties['Sequence Length'].astype(int)
float_columns = ['Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7']
PCP_properties[float_columns] = PCP_properties[float_columns].astype(float)

# GDPC encodings: JSON loaded into a frame and transposed, like the PCP table.
with open("/content/drive/MyDrive/BDDF_Research/gdpc_encodings.json", 'r') as gdpc_file:
    gpdc_encodings = pd.DataFrame(json.load(gdpc_file)).transpose()

# PPI features: the JSON table (transposed) side by side with the network
# properties CSV. The CSV's 'Unnamed: 0' column becomes its row index
# (presumably the protein id written out as an unnamed index - TODO confirm).
ppi_network = pd.read_csv(
    "/content/drive/MyDrive/BDDF_Research/ppi_network_properties.csv"
).set_index('Unnamed: 0')
ppi = pd.concat(
    [pd.read_json("/content/drive/MyDrive/ppi.json").transpose(), ppi_network],
    axis=1,
)

# PTM counts and glycosylation counts, both indexed by their 'Unnamed: 0'
# column and joined column-wise.
glycolisation = pd.read_csv("/content/drive/MyDrive/glycosylation.csv").set_index('Unnamed: 0')
ptm = pd.read_csv("/content/drive/MyDrive/PTM_counts.csv").set_index("Unnamed: 0")
ptm_counts = pd.concat([ptm, glycolisation], axis=1)

# Build a 0/1 indicator table: one row per protein in PCP_properties, one
# column per subcellular-location "group" found in the JSON file.
with open("/content/drive/MyDrive/subcellular_locations2.json", 'r') as file:
    data = json.load(file)

# Every distinct "group" value appearing under any protein's "general" list.
unique_groups = {
    general_entry["group"]
    for entry in data.values()
    if "general" in entry
    for general_entry in entry["general"]
    if "group" in general_entry
}

# Sort the groups so the column order is the same on every run. Iterating a
# set of strings directly does not guarantee a stable order across interpreter
# sessions, and later cells pick features by column position.
# NOTE(review): masks hard-coded elsewhere in this notebook were produced with
# whatever order the set had at that time - regenerate them after this change.
unique_groups_list = sorted(unique_groups, key=str)

rows = []
for protein_id in PCP_properties.index:
    # Start with every group absent, then flag the groups listed for this protein.
    row = dict.fromkeys(unique_groups_list, 0)
    for entry in data.get(protein_id, {}).get("general", []):
        if "group" in entry:
            row[entry["group"]] = 1
    row["protein_id"] = protein_id
    rows.append(row)

subcellular_data = pd.DataFrame(rows).set_index("protein_id")

# Domain and flexibility tables, each indexed by its 'Unnamed: 0' column.
domains = pd.read_csv(
    "/content/drive/MyDrive/BDDF_Research/data_top20_updated.csv"
).set_index('Unnamed: 0')

flexibility = pd.read_csv(
    "/content/drive/MyDrive/BDDF_Research/flexibility_properties.csv"
).set_index('Unnamed: 0')

# Latent values: transpose the CSV, then name the resulting integer-labelled
# columns Latent_Value_1, Latent_Value_2, ...
latent_data = (
    pd.read_csv("/content/drive/MyDrive/BDDF_Research/latent_values.csv")
    .transpose()
    .rename(columns=lambda position: f"Latent_Value_{position+1}")
)

# Join every feature table column-wise on the row index and keep only the
# rows that have no missing value in any column.
feature_tables = [PCP_properties, gpdc_encodings, ptm_counts, ppi, subcellular_data, domains, flexibility, latent_data]
final_data = pd.concat(feature_tables, axis=1).dropna()

y = final_data['is_approved_druggable']

# Every column except the two label columns is a model feature.
features_list = list(final_data.columns.drop(['is_druggable', 'is_approved_druggable']))
print(features_list)
print(len(features_list))

Total number of uniprot human verified proteins: 20434
Number of druggable proteins: 3345
Number of approved druggable proteins: 2652
['Sequence Length', 'Molecular Weight', 'GRAVY', 'Isoelectric Point', 'Instability Index', 'Aromaticity', 'Charge at 7', 'Amino Acid Percent A', 'Amino Acid Percent C', 'Amino Acid Percent D', 'Amino Acid Percent E', 'Amino Acid Percent F', 'Amino Acid Percent G', 'Amino Acid Percent H', 'Amino Acid Percent I', 'Amino Acid Percent K', 'Amino Acid Percent L', 'Amino Acid Percent M', 'Amino Acid Percent N', 'Amino Acid Percent P', 'Amino Acid Percent Q', 'Amino Acid Percent R', 'Amino Acid Percent S', 'Amino Acid Percent T', 'Amino Acid Percent V', 'Amino Acid Percent W', 'Amino Acid Percent Y', 'Molar Extinction Coefficient 1', 'Molar Extinction Coefficient 2', 'Secondary Structure helix', 'Secondary Structure turn', 'Secondary Structure sheet', 'aliphatic_aliphatic', 'aliphatic_positive', 'aliphatic_negative', 'aliphatic_uncharged', 'aliphatic_aromatic',

In [None]:
#for splitting of data
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import ADASYN, SMOTE

def get_data(x_sample, y_sample):
  """Convert a feature container and a label container to NumPy arrays.

  Args:
    x_sample: array-like of features (e.g. a DataFrame or nested list).
    y_sample: array-like of labels.

  Returns:
    A ``(features, labels)`` tuple of ``np.ndarray`` objects built with
    ``np.array``.
  """
  features = np.array(x_sample)
  labels = np.array(y_sample)
  return features, labels

def data_splitting(x_sample, y_sample, mode="default", scaler="none", class_size=600):
  """Hold out a class-balanced test set, then optionally resample and scale.

  ``class_size`` samples are held out from each class (label 1 = druggable,
  label 0 = non-druggable) as the test set; everything else is shuffled into
  the training set.

  Args:
    x_sample: pandas DataFrame of features (indexed with a boolean mask and
      recombined with ``pd.concat``).
    y_sample: pandas Series of 0/1 labels aligned with ``x_sample``.
    mode: training-set resampling. ``"default"`` does nothing, ``"adasyn"``
      applies ADASYN, ``"smote"`` applies SMOTE (both with random_state=42).
    scaler: feature scaling fitted on the training set and applied to both
      sets. ``"none"`` does nothing, ``"std"`` uses StandardScaler,
      ``"minmax"`` uses MinMaxScaler.
    class_size: passed as ``test_size`` to ``train_test_split`` for each
      class, so an int is a per-class sample count.

  Returns:
    ``(X_train, X_test, y_train, y_test)``. When a scaler is applied the two
    feature sets are whatever the scaler's transform returns rather than the
    original DataFrames.

  Warns:
    UserWarning: if ``mode`` or ``scaler`` is not one of the recognised
      values. The step is skipped (as it always was), but no longer silently.
  """
  import warnings  # local import: only needed for the option checks below

  # Boolean row masks for the two classes.
  druggable_indices = (y_sample == 1)
  non_druggable_indices = (y_sample == 0)

  druggable_X = x_sample[druggable_indices]
  druggable_y = y_sample[druggable_indices]

  non_druggable_X = x_sample[non_druggable_indices]
  non_druggable_y = y_sample[non_druggable_indices]

  # Hold out class_size samples from each class with a fixed split seed.
  druggable_X_remaining, druggable_X_test, druggable_y_remaining, druggable_y_test = train_test_split(
      druggable_X, druggable_y, test_size=class_size, random_state=123)
  non_druggable_X_remaining, non_druggable_X_test, non_druggable_y_remaining, non_druggable_y_test = train_test_split(
      non_druggable_X, non_druggable_y, test_size=class_size, random_state=123)

  X_test = pd.concat((druggable_X_test, non_druggable_X_test))
  y_test = pd.concat((druggable_y_test, non_druggable_y_test))
  X_train = pd.concat((druggable_X_remaining, non_druggable_X_remaining))
  y_train = pd.concat((druggable_y_remaining, non_druggable_y_remaining))
  X_train, y_train = shuffle(X_train, y_train, random_state=123)

  # Optional over-sampling of the training set only.
  resamplers = {"adasyn": ADASYN, "smote": SMOTE}
  if mode in resamplers:
    X_train, y_train = resamplers[mode](random_state=42).fit_resample(X_train, y_train)
  elif mode != "default":
    warnings.warn(
        f"data_splitting: unrecognised mode {mode!r}; no resampling applied "
        f"(expected 'default', 'adasyn' or 'smote')."
    )

  # Optional scaling. A separate local name keeps the `scaler` argument intact.
  scalers = {"std": StandardScaler, "minmax": MinMaxScaler}
  if scaler in scalers:
    fitted_scaler = scalers[scaler]()
    X_train = fitted_scaler.fit_transform(X_train)
    X_test = fitted_scaler.transform(X_test)
  elif scaler != "none":
    warnings.warn(
        f"data_splitting: unrecognised scaler {scaler!r}; no scaling applied "
        f"(expected 'none', 'std' or 'minmax')."
    )

  return X_train, X_test, y_train, y_test

In [None]:
# new_column = is_druggable + is_approved_druggable, so it is 0 when neither
# flag is set, 2 when both are set and 1 when exactly one is set.
# rem_new_data keeps the 0 and 2 rows (non-druggable and approved druggable);
# the 1 rows (druggable but not approved, assuming every approved protein is
# also listed as druggable) are left out of training.
new_data = final_data.assign(
    new_column=final_data['is_druggable'] + final_data['is_approved_druggable']
)
rem_new_data = new_data.loc[new_data['new_column'] != 1]
rem_new_data.shape, np.bincount(rem_new_data['new_column'])

((19585, 186), array([16949,     0,  2636]))

In [None]:
# Feature frame and label array for the filtered proteins (shape check only).
X = rem_new_data[features_list]
y = np.array(rem_new_data["is_approved_druggable"])
X.shape, y.shape

((19585, 183), (19585,))

In [None]:
# Hold out 600 proteins per class for testing; no resampling and no scaling.
# NOTE(review): this call previously passed mode="scaler", which is not a mode
# data_splitting recognises, so it already behaved like mode="default". The
# effective behaviour is now spelled out. If feature scaling was intended,
# pass scaler="std" or scaler="minmax" instead.
X_train, X_test, y_train, y_test = data_splitting(
    rem_new_data[features_list],
    rem_new_data["is_approved_druggable"],
    mode="default",
    class_size=600,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((18385, 183), (1200, 183), (18385,), (1200,))

In [None]:

# Split the training rows by class, then cut the (larger) non-druggable set
# into consecutive chunks so that each chunk is roughly the size of the
# druggable set.
X_train_druggable = X_train[y_train == 1]
X_train_non_druggable = X_train[y_train == 0]

# Integer ratio of class sizes; at least one partition so array_split never
# receives 0 sections.
n_partitions = max(1, len(X_train_non_druggable) // len(X_train_druggable))

# Split row *positions* rather than the frame itself: calling np.array_split
# directly on a DataFrame goes through the deprecated DataFrame.swapaxes and
# emits a FutureWarning on recent pandas. The chunk boundaries are the same.
row_chunks = np.array_split(np.arange(len(X_train_non_druggable)), n_partitions)
if hasattr(X_train_non_druggable, "iloc"):
  X_train_non_druggable_partitions = [X_train_non_druggable.iloc[rows] for rows in row_chunks]
else:
  # Plain ndarray (e.g. when data_splitting applied a scaler).
  X_train_non_druggable_partitions = [X_train_non_druggable[rows] for rows in row_chunks]
print(f"Splitting into {len(X_train_non_druggable_partitions)} partitions")

Splitting into 8 partitions


  return bound(*args, **kwds)


In [None]:

import xgboost as xgb

# Train one XGBoost classifier per non-druggable partition. Every model sees
# all druggable training rows (label 1) plus one partition (label 0).
xgb_models = []
n_druggable = len(X_train_druggable)
for partition in X_train_non_druggable_partitions:
  X_combined = np.concatenate((X_train_druggable, partition))
  # n_druggable ones followed by len(partition) zeros, matching the row order above.
  y_combined = np.repeat([1.0, 0.0], [n_druggable, len(partition)])
  xgb_models.append(
      xgb.XGBClassifier(objective='binary:logistic', random_state=42).fit(X_combined, y_combined)
  )

In [None]:
# Hard 0/1 predictions of every partition model on the held-out test set.
y_preds = [model.predict(X_test) for model in xgb_models]

# Majority vote: average the votes per sample and round to 0 or 1.
# np.round rounds an exact 0.5 to 0, so a tied vote counts as class 0.
majority_preds = np.round(np.mean(y_preds, axis=0))

In [None]:

from sklearn.metrics import accuracy_score
# Accuracy table: one row per partition model plus one for the majority vote.
# "accuracy_druggable" / "accuracy_non_druggable" are the accuracies computed
# only on the test rows whose true label is 1 / 0 respectively.
druggable_mask = (y_test == 1)
non_druggable_mask = (y_test == 0)

named_predictions = {f"partition_{i}": y_pred for i, y_pred in enumerate(y_preds)}
named_predictions["majority"] = majority_preds

accuracy_metrics = {
    name: {
        "accuracy_total": accuracy_score(y_test, predicted),
        "accuracy_druggable": accuracy_score(y_test[druggable_mask], predicted[druggable_mask]),
        "accuracy_non_druggable": accuracy_score(y_test[non_druggable_mask], predicted[non_druggable_mask]),
    }
    for name, predicted in named_predictions.items()
}

df = pd.DataFrame(accuracy_metrics).transpose()
df


Unnamed: 0,accuracy_total,accuracy_druggable,accuracy_non_druggable
partition_0,0.760833,0.778333,0.743333
partition_1,0.759167,0.768333,0.75
partition_2,0.760833,0.756667,0.765
partition_3,0.7575,0.785,0.73
partition_4,0.765,0.783333,0.746667
partition_5,0.773333,0.781667,0.765
partition_6,0.756667,0.761667,0.751667
partition_7,0.763333,0.75,0.776667
majority,0.776667,0.766667,0.786667


In [None]:
!pip install sklearn-genetic-opt

Collecting sklearn-genetic-opt
  Downloading sklearn_genetic_opt-0.10.1-py3-none-any.whl.metadata (10.0 kB)
Collecting deap>=1.3.3 (from sklearn-genetic-opt)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading sklearn_genetic_opt-0.10.1-py3-none-any.whl (33 kB)
Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deap, sklearn-genetic-opt
Successfully installed deap-1.4.1 sklearn-genetic-opt-0.10.1


In [None]:
# Feature selection on each partition's data using a genetic algorithm

from sklearn_genetic import GAFeatureSelectionCV
from sklearn_genetic.plots import plot_fitness_evolution

# For every non-druggable partition: run a genetic-algorithm feature search
# (GAFeatureSelectionCV) around an XGBoost classifier on "all druggable rows +
# this partition", record the fitted search object and its boolean support
# mask, and predict the held-out test set with it.
preds = []
estimators = []
features = []

# Search settings shared by every partition.
ga_settings = dict(
    cv=3,
    scoring="accuracy",
    population_size=20,
    generations=5,
    n_jobs=-1,
    verbose=True,
    keep_top_k=2,
    elitism=True,
)
druggable_labels = np.ones(len(X_train_druggable))

for i, partition in enumerate(X_train_non_druggable_partitions):
    X_combined = np.concatenate((X_train_druggable, partition))
    y_combined = np.concatenate((druggable_labels, np.zeros(len(partition))))

    evolved_estimator = GAFeatureSelectionCV(
        estimator=xgb.XGBClassifier(objective='binary:logistic', random_state=42),
        **ga_settings,
    )
    evolved_estimator.fit(X_combined, y_combined)
    estimators.append(evolved_estimator)

    # support_ is the boolean mask of selected feature columns.
    selected_features = evolved_estimator.support_
    features.append(selected_features)
    print(f"Selected features for partition {i}: {selected_features}")

    preds.append(evolved_estimator.predict(X_test))

  pid = os.fork()
  pid = os.fork()


gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.734583	0.0128794  	0.758088   	0.714951   
1  	40    	0.739608	0.00988459 	0.758088   	0.722794   
2  	40    	0.744975	0.0077093  	0.758088   	0.732598   
3  	40    	0.744596	0.010914   	0.758088   	0.715441   
4  	40    	0.747439	0.00954136 	0.769118   	0.734559   
5  	40    	0.745392	0.0095499  	0.758088   	0.723284   
Selected features for partition 0: [ True False  True False False  True  True False  True False False False
  True False  True False  True False  True  True False  True  True  True
 False  True False False  True  True False False False False  True  True
 False False False False False  True False False False  True False False
  True False  True False False  True False  True  True  True False  True
 False  True  True  True False  True False False  True  True False False
 False  True  True  True False  True  True  True  True  True  True  True
 False  True  True  True  True False  True False  True False F



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.728664	0.0117536  	0.743137   	0.706373   
1  	40    	0.73511 	0.0106184  	0.749755   	0.707353   
2  	40    	0.739093	0.00804411 	0.749755   	0.720588   
3  	40    	0.737059	0.00993557 	0.751961   	0.712255   
4  	40    	0.737475	0.0106277  	0.751961   	0.71299    
5  	40    	0.738162	0.00837364 	0.751225   	0.721569   
Selected features for partition 1: [False False False  True  True  True  True False  True False  True  True
  True  True False False False  True False False False False  True  True
 False  True False  True  True  True  True  True False False  True  True
 False False False False False  True  True False  True False  True  True
  True  True  True False  True False  True  True  True  True False  True
  True False  True  True  True  True False False  True False False  True
  True False False False  True  True False False False  True  True False
 False  True False False False False  True False  True  True  



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.732868	0.0134266  	0.751961   	0.696814   
1  	40    	0.748051	0.0089501  	0.763971   	0.738725   
2  	40    	0.745674	0.0149934  	0.763971   	0.709314   
3  	40    	0.745735	0.00831363 	0.763971   	0.728431   
4  	40    	0.744167	0.0116615  	0.763971   	0.721814   
5  	40    	0.748848	0.00936489 	0.763971   	0.731373   
Selected features for partition 2: [False  True  True  True  True  True False False  True False  True  True
  True False  True False  True  True  True  True False False False False
 False False False  True False  True False  True False False False False
  True False  True  True False  True False  True False False  True  True
  True  True False  True  True  True False False  True False False  True
 False  True  True  True False False  True  True False  True  True  True
  True  True  True  True False False  True  True False  True False False
  True False  True  True  True  True  True  True False  True F



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.728836	0.0107143  	0.744118   	0.706863   
1  	40    	0.73462 	0.0100346  	0.744118   	0.698284   
2  	40    	0.734975	0.0102156  	0.744118   	0.708088   
3  	40    	0.737525	0.00693541 	0.744118   	0.716176   
4  	40    	0.736752	0.0089846  	0.744118   	0.709804   
5  	40    	0.73511 	0.00743535 	0.743873   	0.714706   
Selected features for partition 3: [ True  True  True  True  True  True  True False False False False  True
 False  True  True  True  True False False False False False  True False
 False  True False  True False False  True  True  True  True False False
 False  True False  True  True  True False  True  True  True  True False
  True False False  True False False  True False  True  True False  True
 False  True False  True False  True  True False  True False False False
  True  True False False  True False False  True  True False  True False
 False  True False False False False  True False False False  



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.725233	0.0113004  	0.747549   	0.707108   
1  	40    	0.736299	0.00910001 	0.75098    	0.714706   
2  	40    	0.739498	0.0081191  	0.75098    	0.72598    
3  	40    	0.737904	0.0100968  	0.75098    	0.714216   
4  	40    	0.74087 	0.0069341  	0.75098    	0.723039   
5  	40    	0.740858	0.00681976 	0.75098    	0.727941   
Selected features for partition 4: [False  True False False False False False False False  True  True  True
  True False  True  True  True False  True False  True  True False False
 False  True  True  True False False False  True  True False  True  True
 False False  True False  True False False False  True  True False  True
 False  True  True  True False  True False False False False  True  True
 False  True  True  True  True  True  True False  True  True False False
  True  True False False False  True  True  True  True  True False False
  True  True  True  True False  True  True False False  True  



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.731405	0.0124282  	0.751901   	0.706793   
1  	40    	0.73621 	0.00885641 	0.751901   	0.720765   
2  	40    	0.740586	0.00593588 	0.752884   	0.72836    
3  	40    	0.742609	0.0101751  	0.752884   	0.723465   
4  	40    	0.745685	0.00731215 	0.752884   	0.723707   
5  	40    	0.743295	0.0101576  	0.752884   	0.723216   
Selected features for partition 5: [ True  True False False False  True  True  True  True False False False
  True  True  True False False  True False False False  True False False
  True False False  True False False False  True  True  True  True  True
 False  True  True False  True  True  True  True  True  True False  True
  True False False  True False False  True False  True  True False  True
 False  True  True  True  True  True False False  True  True  True False
  True  True  True False False  True  True  True False False  True  True
  True False False False  True False  True False  True  True F



gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.725095	0.0132112  	0.746264   	0.704099   
1  	40    	0.738076	0.00828238 	0.750187   	0.718072   
2  	40    	0.743862	0.00603229 	0.750187   	0.729839   
3  	40    	0.74483 	0.00487978 	0.750187   	0.733517   
4  	40    	0.743298	0.0085899  	0.750187   	0.719296   
5  	40    	0.745407	0.00492246 	0.750187   	0.729594   
Selected features for partition 6: [False  True False  True  True False  True  True False False  True False
  True False  True  True False  True  True False False False  True False
  True False False  True False  True False  True  True False  True  True
 False False False False  True  True  True  True  True  True  True False
 False  True False False  True  True False  True  True False  True  True
  True  True False  True False  True  True False False False  True False
  True  True False False False False False False False False  True False
 False  True False  True False False  True False False False F



gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	0.72389	0.0126181  	0.744302   	0.704337   
1  	40    	0.734604	0.0063405  	0.743076   	0.720031   
2  	40    	0.736724	0.00520662 	0.748221   	0.729101   
3  	40    	0.736896	0.00763358 	0.748712   	0.718804   
4  	40    	0.737864	0.00700554 	0.748469   	0.721989   
5  	40    	0.736566	0.00658275 	0.748469   	0.721007   
Selected features for partition 7: [ True  True  True False False  True False False False False False  True
 False False False  True  True  True False  True False  True False False
  True False False False False  True  True False  True  True False False
  True False False  True  True  True  True  True  True  True False False
  True False  True False  True False False False False  True False  True
 False  True False False False False False  True  True  True  True False
  True  True  True  True False False  True False  True  True  True False
 False False False  True  True  True False False False False Fal



In [None]:
import xgboost as xgb
# Hard-coded boolean feature masks, one per non-druggable partition
# (estimator_<i>_features belongs to partition i). A True entry keeps the
# feature column at that position; the masks are applied positionally to the
# columns of the training / test matrices further down.
# NOTE(review): the leading entries match the "Selected features for partition
# i" lines printed by the genetic-algorithm cell, so these look like a pasted
# copy of that run's support_ masks - confirm. They are only meaningful while
# the feature column order is exactly the one used in that run.
estimator_0_features = np.array([True, False, True, False, False, True, True, False, True, False, False, False,
True, False, True, False, True, False, True, True, False, True, True, True,
False, True, False, False, True, True, False, False, False, False, True, True,
False, False, False, False, False, True, False, False, False, True, False, False,
True, False, True, False, False, True, False, True, True, True, False, True,
False, True, True, True, False, True, False, False, True, True, False, False,
False, True, True, True, False, True, True, True, True, True, True, True,
False, True, True, True, True, False, True, False, True, False, False, True,
False, False, True, False, True, True, True, True, False, True, True, True,
False, False, True, False, False, True, True, False, True, True, True, False,
True, False, False, True, False, False, False, False, False, True, True, True,
False, True, True, False, False, False, True, True, False, False, False, True,
True, True, True, True, False, True, True, False, True, False, True, False,
False, True, True, False, False, True, False, False, True, False, False, False,
False, False, True, False, True, True, False, True, True, False, False, True,
False, True, False])
estimator_1_features = np.array([False, False, False, True, True, True, True, False, True, False, True, True,
True, True, False, False, False, True, False, False, False, False, True, True,
False, True, False, True, True, True, True, True, False, False, True, True,
False, False, False, False, False, True, True, False, True, False, True, True,
True, True, True, False, True, False, True, True, True, True, False, True,
True, False, True, True, True, True, False, False, True, False, False, True,
True, False, False, False, True, True, False, False, False, True, True, False,
False, True, False, False, False, False, True, False, True, True, True, True,
True, False, False, True, False, True, False, True, True, True, False, True,
True, True, False, False, True, False, True, False, True, False, True, True,
True, True, True, True, False, False, True, False, False, False, True, True,
True, True, True, True, True, True, False, False, True, False, True, True,
True, True, False, True, False, False, True, True, False, True, False, True,
True, True, True, False, False, False, True, True, False, True, False, True,
False, True, False, False, True, False, False, True, True, False, False, False,
False, True, True])
estimator_2_features = np.array([False, True, True, True, True, True, False, False, True, False, True, True,
 True, False, True, False, True, True, True, True, False, False, False, False,
 False, False, False, True, False, True, False, True, False, False, False, False,
 True, False, True, True, False, True, False, True, False, False, True, True,
 True, True, False, True, True, True, False, False, True, False, False, True,
 False, True, True, True, False, False, True, True, False, True, True, True,
 True, True, True, True, False, False, True, True, False, True, False, False,
 True, False, True, True, True, True, True, True, False, True, False, True,
 False, False, True, False, False, True, True, True, True, False, False, True,
 True, False, False, False, True, False, False, True, False, False, True, False,
 False, True, False, False, False, True, True, False, False, False, True, True,
 True, False, True, False, False, False, True, True, False, False, False, True,
 True, False, False, True, False, True, False, False, True, False, True, True,
 True, False, True, False, True, False, False, False, True, False, True, False,
 True, True, True, False, False, True, True, False, False, False, True, False,
 True, False, False])
estimator_3_features = np.array([True, True, True, True, True, True, True, False, False, False, False, True, False, True, True, True, True, False, False, False, False, False, True, False, False, True, False, True, False, False, True, True, True, True, False, False, False, True, False, True, True, True, False, True, True, True, True, False, True, False, False, True, False, False, True, False, True, True, False, True, False, True, False, True, False, True, True, False, True, False, False, False, True, True, False, False, True, False, False, True, True, False, True, False, False, True, False, False, False, False, True, False, False, False, True, False, True, False, False, False, False, False, True, False, True, True, True, False, True, True, False, True, True, True, False, True, True, True, False, True, False, False, True, True, False, False, True, False, False, True, False, False, True, False, False, False, True, False, False, True, False, False, False, True, True, False, False, True, True, True, True, False, False, True, False, False, False, False, False, True, False, True, False, False, False, True, True, False, True, True, False, True, True, False, True, False, True, False, False, False, False, False, True])



estimator_4_features = np.array([False, True, False, False, False, False, False, False, False, True, True, True,
 True, False, True, True, True, False, True, False, True, True, False, False,
 False, True, True, True, False, False, False, True, True, False, True, True,
 False, False, True, False, True, False, False, False, True, True, False, True,
 False, True, True, True, False, True, False, False, False, False, True, True,
 False, True, True, True, True, True, True, False, True, True, False, False,
 True, True, False, False, False, True, True, True, True, True, False, False,
 True, True, True, True, False, True, True, False, False, True, True, True,
 False, False, False, True, True, False, True, False, False, True, False, True,
 False, False, False, False, True, False, True, True, True, False, True, False,
 True, True, True, False, True, False, False, False, False, False, False, False,
 False, False, False, False, True, True, True, True, True, False, False, True,
 True, True, False, True, False, True, True, True, False, True, False, False,
 True, False, False, True, False, False, False, True, True, True, False, False,
 True, True, True, False, True, False, False, False, False, True, False, True,
 True, False, True])

estimator_5_features = np.array([True, True, False, False, False, True, True, True, True, False, False, False, True, True, True, False, False, True, False, False, False, True, False, False, True, False, False, True, False, False, False, True, True, True, True, True, False, True, True, False, True, True, True, True, True, True, False, True, True, False, False, True, False, False, True, False, True, True, False, True, False, True, True, True, True, True, False, False, True, True, True, False, True, True, True, False, False, True, True, True, False, False, True, True, True, False, False, False, True, False, True, False, True, True, False, True, True, True, False, False, True, False, True, True, True, False, False, False, False, False, False, False, True, True, True, False, True, True, True, False, True, False, True, True, False, False, True, True, True, True, False, True, True, True, True, True, False, True, True, True, True, False, True, True, True, False, True, False, False, True, True, False, False, True, False, False, False, True, True, False, True, True, False, True, True, False, False, False, True, False, False, False, True, False, False, True, False, True, True, False, True, True, True])

estimator_6_features = np.array([False, True, False, True, True, False, True, True, False, False, True, False, True, False, True, True, False, True, True, False, False, False, True, False, True, False, False, True, False, True, False, True, True, False, True, True, False, False, False, False, True, True, True, True, True, True, True, False, False, True, False, False, True, True, False, True, True, False, True, True, True, True, False, True, False, True, True, False, False, False, True, False, True, True, False, False, False, False, False, False, False, False, True, False, False, True, False, True, False, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, True, True, True, False, False, True, True, False, False, False, True, False, True, True, True, False, True, True, False, True, True, False, True, False, True, False, False, False, True, True, False, False, False, False, False, False, True, True, True, False, True, False, False, False, True, False, False, False, False, True, False, False, False, True, False, False, True, False, True, True, True, False, True, False, True, False, True, False, True, False, False, True, False, False, False, False, True, True]
)

estimator_7_features = np.array([True, True, True, False, False, True, False, False, False, False, False, True, False, False, False, True, True, True, False, True, False, True, False, False, True, False, False, False, False, True, True, False, True, True, False, False, True, False, False, True, True, True, True, True, True, True, False, False, True, False, True, False, True, False, False, False, False, True, False, True, False, True, False, False, False, False, False, True, True, True, True, False, True, True, True, True, False, False, True, False, True, True, True, False, False, False, True, True, True, False, False, False, False, False, True, True, False, True, True, False, False, True, False, False, True, True, False, True, True, True, False, True, True, False, True, False, True, False, True, False, False, True, True, False, True, False, True, True, False, False, True, True, True, False, False, True, True, False, False, False, True, True, True, False, True, False, True, True, False, True, False, False, False, True, False, False, False, False, True, False, True, False, False, False, True, True, False, False, True, False, False, True, False, False, False, True, True, True, False, False, True])

# okk[i] is the feature mask used with the model for partition i.
okk = [estimator_0_features,estimator_1_features,estimator_2_features,estimator_3_features,estimator_4_features, estimator_5_features, estimator_6_features, estimator_7_features]

# Retrain one XGBoost model per partition using only the columns selected by
# that partition's mask (okk[i]), then vote on the held-out test set.
# NOTE(review): the [:-1] slice leaves out the last partition, so the last
# mask in okk is never used and the vote is taken over one model fewer than
# there are partitions - confirm this is intentional.
models_list = []
n_druggable = len(X_train_druggable)
for i, partition in enumerate(X_train_non_druggable_partitions[:-1]):
    feature_mask = okk[i]
    X_combined = np.concatenate((X_train_druggable, partition))
    y_combined = np.concatenate((np.ones(n_druggable), np.zeros(len(partition))))

    clf = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

    # Printed as a quick check that the mask length matches the feature count.
    print(len(feature_mask))
    clf.fit(X_combined[:, feature_mask], y_combined)
    models_list.append(clf)

# Each model predicts on the test columns selected by its own mask.
y_preds = [
    model.predict(np.asarray(X_test)[:, okk[i]])
    for i, model in enumerate(models_list)
]

# Majority vote: mean of the 0/1 votes per sample, rounded to 0 or 1.
majority_preds = np.round(np.mean(y_preds, axis=0))

from sklearn.metrics import accuracy_score
# Accuracy table for the mask-restricted models: one row per partition model
# plus one for the majority vote. The per-class columns are the accuracies on
# the test rows whose true label is 1 (druggable) or 0 (non-druggable).
druggable_mask = (y_test == 1)
non_druggable_mask = (y_test == 0)

named_predictions = {f"partition_{i}": y_pred for i, y_pred in enumerate(y_preds)}
named_predictions["majority"] = majority_preds

accuracy_metrics = {
    name: {
        "accuracy_total": accuracy_score(y_test, predicted),
        "accuracy_druggable": accuracy_score(y_test[druggable_mask], predicted[druggable_mask]),
        "accuracy_non_druggable": accuracy_score(y_test[non_druggable_mask], predicted[non_druggable_mask]),
    }
    for name, predicted in named_predictions.items()
}

df = pd.DataFrame(accuracy_metrics).transpose()
df

183
183
183
183
183
183
183


Unnamed: 0,accuracy_total,accuracy_druggable,accuracy_non_druggable
partition_0,0.740833,0.751667,0.73
partition_1,0.745,0.745,0.745
partition_2,0.75,0.775,0.725
partition_3,0.735,0.745,0.725
partition_4,0.7675,0.776667,0.758333
partition_5,0.755833,0.766667,0.745
partition_6,0.736667,0.736667,0.736667
majority,0.7725,0.79,0.755


In [None]:
# Rows where exactly one of the two label flags is set (new_column == 1):
# the proteins that were left out of the training / test data.
non_approved_druggable = new_data.loc[new_data["new_column"] == 1]
non_approved_druggable.shape

(688, 186)

In [None]:
# Feature matrix and labels of the left-out proteins as NumPy arrays.
X_tests, y_tests = get_data(
    non_approved_druggable[features_list],
    non_approved_druggable["is_approved_druggable"],
)
X_tests.shape, y_tests.shape

((688, 183), (688,))

In [None]:
# Row labels of the left-out proteins, kept so that the predictions written
# out later can be labelled per protein.
protein_names = non_approved_druggable.index
len(protein_names)

688

In [None]:

# Score the left-out proteins with every mask-restricted model: collect both
# the hard 0/1 prediction and the predicted probability of class 1.
y_preds = []
y_probs = []
for i, model in enumerate(models_list):
  X_selected = np.asarray(X_tests)[:, okk[i]]  # columns chosen for this model
  y_preds.append(model.predict(X_selected))
  y_probs.append(model.predict_proba(X_selected)[:, 1])

# Ensemble outputs: rounded mean of the hard votes, and mean class-1 probability.
majority_preds = np.round(np.mean(y_preds, axis=0))
majority_probs = np.mean(y_probs, axis=0)

# Column layout of the result table: protein label, one probability column
# per model (numbered from 1), then the two ensemble columns.
data = {"Protein": protein_names}
data.update({f"Probability_Partition_{i+1}": probs for i, probs in enumerate(y_probs)})
data["Mean_Probability"] = majority_probs
data["Majority_Prediction"] = majority_preds

In [None]:
# Write the per-protein prediction table to Drive as CSV (row index included).
output_csv_path = "/content/drive/MyDrive/BDDF_Research/Genetic_Metrics/XGB_Genetic_DI.csv"
df = pd.DataFrame(data)
df.to_csv(output_csv_path)