# Drug–Protein Interaction Pipeline
Includes:
1. SMILES → Morgan Fingerprint & Molecular Graph
2. Protein FASTA → ESM2 Embedding
3. Merge & Normalize Data for ML
4. Output DTI Dataset (.csv/.json)

In [33]:
# Force-reinstall NumPy at the newest available release (no version pin).
# NOTE(review): the recorded output below shows this replaces numpy 1.26.4 with
# 2.2.5 and triggers a pip dependency conflict with numba 0.60.0 (which requires
# numpy<2.1,>=1.22) — confirm the unpinned upgrade is intended.
!pip install --upgrade --force-reinstall numpy

Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have nu

In [34]:
# ⚙️ Install required packages
# Remove the specific numpy version constraint to allow pandas to install a compatible version
# !pip install "numpy<2.0" --upgrade
!pip install rdkit-pypi fair-esm biopython torch pandas numpy

import os, json
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from Bio import SeqIO
import torch
import esm

# The cells below build the drug fingerprints, the protein embeddings, and the combined dataset.



## 🔬 Drug Fingerprints and Graphs

In [35]:
# (drug_id, SMILES) pairs to featurize.
smiles_list = [
    ("DB14975", "CC(C)N1C(=CC=N1)C2=C(C=CC=N2)COC3=CC=CC(=C3C=O)O"),  # Voxelotor
]

# One record per drug: its ID, the input SMILES, and a Morgan fingerprint
# (radius 2, 1024 bits) stored as a plain list so it can be JSON-serialized.
fingerprints = []
for drug_id, smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports an unparsable SMILES by returning None rather
        # than raising; fail here with the offending ID instead of letting the
        # fingerprint call die with an opaque argument error.
        raise ValueError(f"Could not parse SMILES for {drug_id}: {smiles!r}")
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    fp_array = np.array(fp)
    fingerprints.append(
        {"drug_id": drug_id, "smiles": smiles, "fingerprint": fp_array.tolist()}
    )

## 🧬 Protein Embeddings (ESM2)

In [36]:
# Load the esm2_t6_8M_UR50D checkpoint and its tokenizer alphabet.
model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
# Switch to inference mode so dropout is disabled and embeddings are
# deterministic across runs (as recommended in the ESM README).
model.eval()
batch_converter = alphabet.get_batch_converter()

# (UniProt accession, gene name, amino-acid sequence); each sequence is capped
# at its first 200 residues.
protein_data = [
    ("P68871", "HBB", "MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH"[:200]),

]
# One record per protein: accession, name, and a fixed-length embedding list.
protein_embeddings = []

for uniprot, name, seq in protein_data:
    # Sequences are embedded one at a time, so the batch dimension is always 1.
    batch_labels, batch_strs, batch_tokens = batch_converter([(name, seq)])
    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[6])
    # Per-token representations from layer 6 (presumably the final layer of
    # this "t6" checkpoint — confirm if the model is swapped).
    token_representations = results["representations"][6]
    # Average over residue positions 1..len(seq) only: index 0 holds the
    # beginning-of-sequence token the batch converter prepends.
    emb = token_representations[0, 1:len(seq)+1].mean(0).numpy()
    protein_embeddings.append({"uniprot": uniprot, "name": name, "embedding": emb.tolist()})

## 🧠 Create Combined Dataset

In [37]:
# Build the combined dataset: every drug is paired with every protein, and
# each resulting pair is recorded with label 1.
dataset = [
    {
        "drug_id": drug["drug_id"],
        "drug_smiles": drug["smiles"],
        "drug_fingerprint": drug["fingerprint"],
        "protein_id": protein["uniprot"],
        "protein_embedding": protein["embedding"],
        "label": 1,
    }
    for drug in fingerprints
    for protein in protein_embeddings
]

# Write all pair records to a single JSON file in the working directory.
with open("drug_protein_dataset.json", "w") as fh:
    json.dump(dataset, fh)
print("Saved drug_protein_dataset.json")

Saved drug_protein_dataset.json


In [38]:
# Download the JSON dataset (the ML input file) through the Colab browser session.
# `files` stays a notebook-level name because the CSV cell further down uses it.
from google.colab import files

files.download("drug_protein_dataset.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [39]:
# Export the same dataset in CSV format and download it.

import pandas as pd

# One row per drug–protein pair; the columns are the keys of each record.
df = pd.DataFrame(data=dataset)

# List-valued columns (fingerprint, embedding) are written as their string form.
df.to_csv(path_or_buf="drug_protein_dataset.csv", index=False)
print("Saved drug_protein_dataset.csv")

# `files` is the google.colab helper imported in the JSON download cell.
files.download("drug_protein_dataset.csv")

Saved drug_protein_dataset.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>