<a href="https://colab.research.google.com/github/Sirczechs-Kaustubh/Blood_brain_barrier_ML/blob/main/blood_brain_barrierML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rdkit-pypi

In [None]:
from rdkit import Chem

In [None]:
# Parse the paracetamol SMILES string into an RDKit molecule object.
paracetamol_smiles = 'CC(=O)NC1=CC=C(C=C1)O'
m = Chem.MolFromSmiles(paracetamol_smiles)

In [None]:
from rdkit.Chem import Draw
# Render the molecule `m` to an image object.
img = Draw.MolToImage(m)
# Bare last expression so the notebook displays the image.
img

In [None]:
# Atom count of `m` as built from the SMILES string.
m.GetNumAtoms()


In [None]:
# Build m2 from m with Chem.AddHs; `m` itself is not reassigned here.
m2 = Chem.AddHs(m)
# Atom count of m2, for comparison with the count reported for `m`.
m2.GetNumAtoms()

In [None]:
# Render m2 (the AddHs result); this rebinds `img`, replacing the earlier image of `m`.
img = Draw.MolToImage(m2)
img

In [None]:
from rdkit.Chem import Descriptors
# Evaluate RDKit's MolWt descriptor on `m`; the value is shown as the cell output.
Descriptors.MolWt(m)

In [None]:
from rdkit import DataStructs
from rdkit.Chem import AllChem

# Build the three molecules to compare from their SMILES strings.
mol1, mol2, mol3 = (
    Chem.MolFromSmiles(smi)
    for smi in (
        'CC(=O)NC1=CC=C(C=C1)O',         # Paracetamol
        'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
        'CN1C2=C(C(=O)N(C1=O)C)NC=N2',   # Theophylline
    )
)

# One Morgan fingerprint (radius 4) per molecule, in the same order.
fp1, fp2, fp3 = (
    AllChem.GetMorganFingerprint(mol, 4) for mol in (mol1, mol2, mol3)
)

# Report the pairwise Tanimoto similarity of the three fingerprints.
print(f"Para-Caff-{DataStructs.TanimotoSimilarity(fp1, fp2)}\nPara-Theo-{DataStructs.TanimotoSimilarity(fp1, fp3)}\nCaff-Theo-{DataStructs.TanimotoSimilarity(fp2, fp3)}")

In [None]:
# Print the default string form of the mol2 object (the molecule built from the caffeine SMILES).
print(mol2)

In [None]:
from rdkit.Chem.Draw import SimilarityMaps

# Attach Gasteiger partial charges to the atoms of mol2 (stored per atom as
# the '_GasteigerCharge' property).
AllChem.ComputeGasteigerCharges(mol2)

# Collect one charge per atom, in atom-index order, to use as map weights.
contribs = [atom.GetDoubleProp('_GasteigerCharge') for atom in mol2.GetAtoms()]

# Draw the weights over the molecule as a contour map.
fig = SimilarityMaps.GetSimilarityMapFromWeights(
    mol2,
    contribs,
    colorMap='Blues',
    contourLines=10,
)

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
#import rdkit.Chem
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors

In [None]:
# Load the dataset spreadsheet into a DataFrame.
# NOTE(review): the path is absolute under /content (the Colab working area), so
# the file has to be uploaded there before this cell runs — confirm for other environments.
data_drug = pd.read_excel('/content/BBB_datasets.xlsx')

In [None]:
# Preview the first rows of the loaded dataset.
data_drug.head()

In [None]:
# SMILES column of the dataset.
# NOTE(review): `smiles` is not referenced again in the visible cells; the
# descriptor loop reads data_drug['SMILES'] directly.
smiles = data_drug['SMILES']
# Accumulator for one descriptor vector per molecule, filled by the loop cell below.
mol_descriptors = []

In [None]:
# Display the SMILES column for a quick visual check.
data_drug['SMILES']

In [None]:
# Compute one RDKit descriptor vector per row of data_drug['SMILES'].

# The calculator depends only on the descriptor names, so build it once rather
# than once per molecule.
descriptor_names = [x[0] for x in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# Start from an empty list so that re-running this cell does not append a
# second copy of every vector.
mol_descriptors = []
failed_smiles = []

for i in data_drug['SMILES']:
    # Chem.MolFromSmiles returns None for a string it cannot parse; a
    # non-string cell (e.g. a missing value) is treated as unparsable too
    # instead of being handed to RDKit.
    moler = Chem.MolFromSmiles(i) if isinstance(i, str) else None

    vector = None
    if moler is not None:
        try:
            vector = calc.CalcDescriptors(moler)
        except Exception as err:
            # Report why the calculation failed instead of silently swallowing it.
            print(f"descriptor calculation failed: {err}")

    if vector is None:
        # Report the offending input, then keep a placeholder row of NaNs so
        # that row k of mol_descriptors still corresponds to row k of
        # data_drug. The later dropna() step removes these placeholder rows.
        print(i)
        failed_smiles.append(i)
        vector = (np.nan,) * len(descriptor_names)

    mol_descriptors.append(vector)


In [None]:
# Column labels for the descriptor table: the first element of each
# Descriptors._descList entry is the descriptor name (the same names the
# calculator was built from), giving a 1-D array with one label per column.
cols_mols = np.asarray([entry[0] for entry in Descriptors._descList])

# One row per computed descriptor vector, one column per descriptor name.
desc_df = pd.DataFrame(mol_descriptors, columns=cols_mols)


In [None]:
# Preview the first rows of the descriptor table.
desc_df.head()

In [None]:
# Preview the source dataset again, next to the descriptor table shown above.
data_drug.head()

In [None]:
# Inspect the current column labels of the descriptor table.
desc_df.columns

In [None]:
# Relabel every column with its position rendered as a string ("0", "1", ...).
old_names = desc_df.columns.tolist()

# Pair each existing label with the string form of its index.
new_names = dict(zip(old_names, map(str, range(len(old_names)))))

# Apply the mapping to desc_df itself (in place).
desc_df.rename(columns=new_names, inplace=True)


In [None]:
# Preview the descriptor table after the columns were renamed.
desc_df.head()

In [None]:
# List the distinct values of the Class column before encoding them as labels.
data_drug["Class"].unique()

In [None]:
# Encode the class strings as integers: 'BBB+' -> 1, 'BBB-' -> 0. Any other
# value in the Class column has no entry in the mapping and becomes NaN.
label_codes = {'BBB+': 1, 'BBB-': 0}
desc_df["Label"] = data_drug["Class"].map(label_codes)

In [None]:
# Preview the descriptor table with the new Label column.
desc_df.head()

In [None]:
# Total number of missing cells across the whole table (per-column counts, then summed).
desc_df.isna().sum().sum()

In [None]:
# Discard every row that has at least one missing value; desc_df is rebound
# to the filtered table.
desc_df = desc_df.dropna(axis=0, how='any')

In [None]:
# Re-count missing cells after dropna(); the expected result is 0.
desc_df.isna().sum().sum()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many rows carry each Label value.
ax = desc_df['Label'].value_counts().plot(kind='bar')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# There is a class imbalance:
# 400 positive labels and 200 negative labels.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef


In [None]:
from sklearn.svm import SVC
# Features are every column except the target; the target is the Label column.
X = desc_df.drop(columns='Label')
y = desc_df['Label']

# Define random state for reproducibility
random_state = 410

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# the same in the training and test splits, which matters here because the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state, stratify=y
)

# Randomly oversample the training split only, so the test split keeps the
# original class distribution and shares no duplicated rows with training.
ros = RandomOverSampler(random_state=random_state)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Standardise the features, then fit a support vector classifier.
pipeline = make_pipeline(StandardScaler(), SVC(gamma="auto", random_state=random_state))

# Fit the pipeline to the resampled data
pipeline.fit(X_resampled, y_resampled)

In [None]:
# Predict on the held-out split and score the predictions.
y_pred = pipeline.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order; pass the ground
# truth first so these calls agree with the library signature and with the
# confusion_matrix(y_test, y_pred) call used for the plot.
f1 = f1_score(y_test, y_pred, pos_label=1)
accuracy = accuracy_score(y_test, y_pred)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix of the SVM predictions on the held-out split.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_test))

# Create the 8x8 figure explicitly and hand its axes to the display object.
# Without ax=, disp.plot() opens a figure of its own, so a separately created
# plt.figure(figsize=...) stays blank and the requested size is never applied.
fig, ax = plt.subplots(figsize=(8, 8))
disp.plot(ax=ax)
ax.set_title('Confusion Matrix for SVM Classifier')
plt.show()

In [None]:
# Report the accuracy and F1 score computed for the SVM predictions.
print(f"Accuracy using SVM {accuracy}\nF1 Score of SVM {f1}")
