### Topological Feature Extraction Using Persistent Homology
##### This notebook uses persistent homology (cubical persistence, vectorized as Betti curves and persistence landscapes) to extract topological features from two-dimensional molecular images of drugs and from contact-map images of proteins.

In [None]:
# Colab only: attach Google Drive to the runtime's filesystem. Later cells
# read from and write to paths underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install -U giotto-tda

Collecting giotto-tda
  Downloading giotto_tda-0.6.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Collecting scikit-learn==1.3.2 (from giotto-tda)
  Downloading scikit_learn-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting giotto-ph>=0.2.1 (from giotto-tda)
  Downloading giotto_ph-0.2.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting pyflagser>=0.4.3 (from giotto-tda)
  Downloading pyflagser-0.4.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)
Collecting igraph>=0.9.8 (from giotto-tda)
  Downloading igraph-0.11.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting numpy>=1.19.1 (from giotto-tda)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00

### Drug Topological Features

In [None]:
import os
from PIL import Image
import numpy as np
import pandas as pd
from pathlib import Path
from gtda.homology import CubicalPersistence
from gtda.diagrams import BettiCurve, PersistenceLandscape
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Drug topological features
#
# For every PNG in `folder_path`, compute cubical persistence on four
# single-channel views of the image (grayscale, red, green, blue), vectorize
# each persistence diagram as a Betti curve plus a persistence landscape, and
# concatenate everything into one feature vector per drug.
#
# Result: `data`, a list of {"drug_name", "structural_embedding"} records.
# ---------------------------------------------------------------------------
base_dir = Path('/content/drive/MyDrive/Top_DTI')
folder_path = base_dir / "images" / "mol_images"

# Persistence transformer and the two diagram vectorizers.
cubical_persistence = CubicalPersistence(n_jobs=-1)
BC = BettiCurve(n_bins=50, n_jobs=-1)
PL = PersistenceLandscape(n_bins=100, n_jobs=-1)

# Fixed channel order used when concatenating the per-channel features.
channel_order = ("gray", "red", "green", "blue")

data = []

# Collect the PNG files up front and sort them: directory listing order is
# arbitrary, so sorting makes the row order of the saved arrays reproducible,
# and the progress bar total now counts only the files actually processed.
image_paths = sorted(
    path for path in folder_path.iterdir() if path.name.endswith(".png")
)

for image_path in tqdm(image_paths, desc="Processing images"):
    # File name without the ".png" extension identifies the drug.
    drug_name = image_path.stem

    # The context manager closes the file handle after each image. Decoding
    # happens inside the block because PIL loads pixel data lazily.
    with Image.open(image_path) as image:
        gray_pixels = np.array(image.convert("L"))
        # Explicit RGB conversion guarantees a (height, width, 3) array even
        # for grayscale, palette or alpha-channel PNGs, where indexing the raw
        # array with [:, :, k] would fail or pick up the wrong plane.
        rgb_pixels = np.array(image.convert("RGB"))

    # Each view gets a leading axis of length 1: a batch holding one image.
    channels = {
        "gray": gray_pixels[None, :, :],
        "red": rgb_pixels[:, :, 0][None, :, :],
        "green": rgb_pixels[:, :, 1][None, :, :],
        "blue": rgb_pixels[:, :, 2][None, :, :],
    }

    all_features = {}

    for channel_name, channel_data in channels.items():
        persistence_diagrams = cubical_persistence.fit_transform(channel_data)

        # NOTE(review): both vectorizers are re-fit on every single diagram.
        # If they derive their sampling grid from the fitted diagrams, curves
        # from different images are sampled at different filtration values --
        # confirm this per-image fitting is intended.

        # BettiCurve
        bc_features_flatten = np.reshape(BC.fit_transform(persistence_diagrams), -1)

        # PersistenceLandscape
        pl_features_flatten = np.reshape(PL.fit_transform(persistence_diagrams), -1)

        all_features[channel_name] = np.concatenate(
            [bc_features_flatten, pl_features_flatten]
        )

    final_features = np.concatenate(
        [all_features[channel_name] for channel_name in channel_order]
    )

    data.append({"drug_name": drug_name, "structural_embedding": final_features})


Processing images:  53%|█████▎    | 2406/4505 [1:11:50<1:04:05,  1.83s/it]

In [None]:
# Turn the per-drug records collected in `data` into a table with one row per
# drug; the bare expression on the last line displays it as the cell output.
embeddings_df = pd.DataFrame(data)
embeddings_df

In [None]:
import os
import numpy as np
import pandas as pd

# Persist the drug features as two .npy files written from the same table,
# so row i of the embedding matrix belongs to entry i of the name array.
outdir = base_dir / "structure" / "s_embeddings"
outdir.mkdir(parents=True, exist_ok=True)

# Column of per-drug vectors -> one array; names are forced to a string dtype.
embedding_rows = embeddings_df['structural_embedding'].tolist()
name_rows = embeddings_df['drug_name'].tolist()
embeddings = np.array(embedding_rows)
drug_names = np.array(name_rows, dtype=str)

for target_name, payload in (
    ("mol_image_embeddings.npy", embeddings),
    ("mol_names.npy", drug_names),
):
    np.save(outdir / target_name, payload)

print("Saved embeddings and drug names successfully.")

### Protein Target Topological Features

In [None]:
import os
from PIL import Image
import numpy as np
import pandas as pd
from gtda.homology import CubicalPersistence
from gtda.diagrams import BettiCurve, PersistenceLandscape
from tqdm import tqdm


# `Path` is used below but is not among this cell's own imports; import it
# here so the cell does not depend on an earlier cell having done so.
from pathlib import Path

# ---------------------------------------------------------------------------
# Protein target topological features
#
# For every contact-map PNG in `folder_path`, compute cubical persistence on
# four single-channel views of the image (grayscale, red, green, blue),
# vectorize each persistence diagram as a Betti curve plus a persistence
# landscape, and concatenate everything into one feature vector per protein.
#
# Result: `contact_data`, a list of
# {"protein_index", "protein_structural_embedding"} records.
# ---------------------------------------------------------------------------
base_dir = Path('/content/drive/MyDrive/Top_DTI')
folder_path = base_dir / "images" / "esm_contacts"

# Persistence transformer and the two diagram vectorizers.
cubical_persistence = CubicalPersistence(n_jobs=-1)
BC = BettiCurve(n_bins=50, n_jobs=-1)
PL = PersistenceLandscape(n_bins=100, n_jobs=-1)

# Fixed channel order used when concatenating the per-channel features.
channel_order = ("gray", "red", "green", "blue")

contact_data = []

# Collect the PNG files up front and sort them: directory listing order is
# arbitrary, so sorting makes the row order of the saved arrays reproducible,
# and the progress bar total now counts only the files actually processed.
image_paths = sorted(
    path for path in folder_path.iterdir() if path.name.endswith(".png")
)

for image_path in tqdm(image_paths, desc="Processing images"):
    # File name without the ".png" extension is stored as the protein index.
    protein_index = image_path.stem

    # The context manager closes the file handle after each image. Decoding
    # happens inside the block because PIL loads pixel data lazily.
    with Image.open(image_path) as image:
        gray_pixels = np.array(image.convert("L"))
        # Explicit RGB conversion guarantees a (height, width, 3) array even
        # for grayscale, palette or alpha-channel PNGs, where indexing the raw
        # array with [:, :, k] would fail or pick up the wrong plane.
        rgb_pixels = np.array(image.convert("RGB"))

    # Each view gets a leading axis of length 1: a batch holding one image.
    channels = {
        "gray": gray_pixels[None, :, :],
        "red": rgb_pixels[:, :, 0][None, :, :],
        "green": rgb_pixels[:, :, 1][None, :, :],
        "blue": rgb_pixels[:, :, 2][None, :, :],
    }

    all_features = {}

    for channel_name, channel_data in channels.items():
        persistence_diagrams = cubical_persistence.fit_transform(channel_data)

        # NOTE(review): both vectorizers are re-fit on every single diagram.
        # If they derive their sampling grid from the fitted diagrams, curves
        # from different images are sampled at different filtration values --
        # confirm this per-image fitting is intended.

        # BettiCurve
        bc_features_flatten = np.reshape(BC.fit_transform(persistence_diagrams), -1)

        # PersistenceLandscape
        pl_features_flatten = np.reshape(PL.fit_transform(persistence_diagrams), -1)

        all_features[channel_name] = np.concatenate(
            [bc_features_flatten, pl_features_flatten]
        )

    final_features = np.concatenate(
        [all_features[channel_name] for channel_name in channel_order]
    )

    contact_data.append({"protein_index": protein_index, "protein_structural_embedding": final_features})


Processing images: 100%|██████████| 2181/2181 [4:26:52<00:00,  7.34s/it]


In [None]:
# Turn the per-protein records collected in `contact_data` into a table with
# one row per protein; the bare expression on the last line displays it as the
# cell output.
contact_embeddings_df = pd.DataFrame(contact_data)
contact_embeddings_df

Unnamed: 0,protein_index,protein_structural_embedding
0,1185,"[58.0, 58.0, 58.0, 58.0, 58.0, 2578.0, 2578.0,..."
1,1186,"[38.0, 38.0, 38.0, 38.0, 38.0, 186.0, 186.0, 1..."
2,1187,"[50.0, 50.0, 1476.0, 1476.0, 674.0, 674.0, 674..."
3,1188,"[24.0, 24.0, 30.0, 22.0, 22.0, 22.0, 14.0, 14...."
4,1189,"[127.0, 127.0, 2758.0, 2758.0, 166.0, 166.0, 1..."
...,...,...
2176,180,"[65.0, 65.0, 1814.0, 1814.0, 285.0, 285.0, 285..."
2177,181,"[92.0, 92.0, 1318.0, 1318.0, 452.0, 452.0, 452..."
2178,182,"[94.0, 94.0, 1015.0, 224.0, 224.0, 167.0, 167...."
2179,183,"[26.0, 26.0, 936.0, 348.0, 348.0, 348.0, 182.0..."


In [None]:
import os
import numpy as np
import pandas as pd

# Persist the protein features as two .npy files written from the same table,
# so row i of the embedding matrix belongs to entry i of the index array.
outdir = base_dir / "structure" / "s_embeddings"
outdir.mkdir(parents=True, exist_ok=True)

# Column of per-protein vectors -> one array; indices are forced to a string dtype.
embedding_rows = contact_embeddings_df['protein_structural_embedding'].tolist()
index_rows = contact_embeddings_df['protein_index'].tolist()
contact_embeddings = np.array(embedding_rows)
protein_names = np.array(index_rows, dtype=str)

for target_name, payload in (
    ("protein_contact_embeddings.npy", contact_embeddings),
    ("protein_index.npy", protein_names),
):
    np.save(outdir / target_name, payload)

print("Saved embeddings and protein names successfully.")

Saved embeddings and protein names successfully.
