### <span style="color:yellow;">Topological Feature Extraction Using Persistent Homology </span> 
##### We utilized persistent homology to extract topological features from two-dimensional molecular images of drugs and protein contact maps. 

### Drug Topological Features

In [2]:
import os
from PIL import Image
import numpy as np
import pandas as pd
from pathlib import Path
from gtda.homology import CubicalPersistence
from gtda.diagrams import BettiCurve, PersistenceLandscape
from tqdm import tqdm

base_dir = Path('Path_to_base_directory')
folder_path = base_dir / "images" / "mol_images"


cubical_persistence = CubicalPersistence(n_jobs=-1)
BC = BettiCurve(n_bins=50, n_jobs=-1)
PL = PersistenceLandscape(n_bins=100, n_jobs=-1)


data = []


for filename in tqdm(os.listdir(folder_path), desc="Processing images"):
    if filename.endswith(".png"):
        image_path = os.path.join(folder_path, filename)

       
        drug_name = os.path.splitext(filename)[0]

        
        image = Image.open(image_path)
        channels = {
            "gray": np.array(image.convert("L"))[None, :, :],
            "red": np.array(image)[:, :, 0][None, :, :],
            "green": np.array(image)[:, :, 1][None, :, :],
            "blue": np.array(image)[:, :, 2][None, :, :]
        }

        
        all_features = {}
        
        for channel_name, channel_data in channels.items():
            persistence_diagrams = cubical_persistence.fit_transform(channel_data)

            # BettiCurve 
            bc_features = BC.fit_transform(persistence_diagrams)
            bc_features_flatten = np.reshape(bc_features, -1)

            # PersistenceLandscape 
            pl_features = PL.fit_transform(persistence_diagrams)
            pl_features_flatten = np.reshape(pl_features, -1)

           
            combined_features = np.concatenate([bc_features_flatten, pl_features_flatten])
            all_features[channel_name] = combined_features

        
        final_features = np.concatenate([
            all_features["gray"],
            all_features["red"],
            all_features["green"],
            all_features["blue"]
        ])

        
        data.append({"drug_name": drug_name, "structural_embedding": final_features})


Processing images: 100%|████████████████████| 4505/4505 [41:51<00:00,  1.79it/s]


In [3]:
embeddings_df = pd.DataFrame(data)
embeddings_df

Unnamed: 0,drug_name,structural_embedding
0,34,"[81.0, 18.0, 17.0, 17.0, 17.0, 18.0, 35.0, 31...."
1,2040,"[59.0, 17.0, 13.0, 10.0, 10.0, 9.0, 9.0, 17.0,..."
2,987,"[57.0, 13.0, 10.0, 10.0, 8.0, 8.0, 9.0, 9.0, 9..."
3,3878,"[30.0, 8.0, 8.0, 7.0, 8.0, 11.0, 21.0, 22.0, 2..."
4,3097,"[46.0, 13.0, 9.0, 7.0, 7.0, 7.0, 13.0, 12.0, 1..."
...,...,...
4500,1458,"[85.0, 17.0, 11.0, 10.0, 10.0, 12.0, 31.0, 35...."
4501,3806,"[93.0, 22.0, 18.0, 17.0, 15.0, 15.0, 35.0, 36...."
4502,3898,"[79.0, 7.0, 6.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0,..."
4503,1500,"[78.0, 7.0, 7.0, 6.0, 6.0, 6.0, 11.0, 11.0, 12..."


In [4]:
import os
import numpy as np
import pandas as pd

outdir = base_dir/ "structure" / "s_embeddings"
outdir.mkdir(parents=True, exist_ok=True)

embeddings = np.array(embeddings_df['structural_embedding'].tolist())  
drug_names = np.array(embeddings_df['drug_name'].tolist(), dtype=str)  

np.save(os.path.join(outdir, "mol_image_embeddings.npy"), embeddings)
np.save(os.path.join(outdir, "mol_names.npy"), drug_names)

print("Saved embeddings and drug names successfully.")

Saved embeddings and drug names successfully.


### Protein Target Topological Features

In [26]:
import os
from PIL import Image
import numpy as np
import pandas as pd
from gtda.homology import CubicalPersistence
from gtda.diagrams import BettiCurve, PersistenceLandscape
from tqdm import tqdm


base_dir = Path('Path_to_your_directory')
folder_path = base_dir / "images" / "esm_contacts"


cubical_persistence = CubicalPersistence(n_jobs=-1)
BC = BettiCurve(n_bins=50, n_jobs=-1)
PL = PersistenceLandscape(n_bins=100, n_jobs=-1)


contact_data = []


for filename in tqdm(os.listdir(folder_path), desc="Processing images"):
    if filename.endswith(".png"):
        image_path = os.path.join(folder_path, filename)

       
        drug_name = os.path.splitext(filename)[0]

        
        image = Image.open(image_path)
        channels = {
            "gray": np.array(image.convert("L"))[None, :, :],
            "red": np.array(image)[:, :, 0][None, :, :],
            "green": np.array(image)[:, :, 1][None, :, :],
            "blue": np.array(image)[:, :, 2][None, :, :]
        }

        
        all_features = {}
        
        for channel_name, channel_data in channels.items():
            persistence_diagrams = cubical_persistence.fit_transform(channel_data)

            # BettiCurve 
            bc_features = BC.fit_transform(persistence_diagrams)
            bc_features_flatten = np.reshape(bc_features, -1)

            # PersistenceLandscape 
            pl_features = PL.fit_transform(persistence_diagrams)
            pl_features_flatten = np.reshape(pl_features, -1)

           
            combined_features = np.concatenate([bc_features_flatten, pl_features_flatten])
            all_features[channel_name] = combined_features

        
        final_features = np.concatenate([
            all_features["gray"],
            all_features["red"],
            all_features["green"],
            all_features["blue"]
        ])

        
        contact_data.append({"protein_index": drug_name, "protein_structural_embedding": final_features})


Processing images: 100%|██████████████████| 2002/2002 [1:13:42<00:00,  2.21s/it]


In [27]:
contact_embeddings_df = pd.DataFrame(contact_data)
contact_embeddings_df

Unnamed: 0,protein_index,protein_structural_embedding
0,34,"[68.0, 68.0, 68.0, 68.0, 68.0, 68.0, 2925.0, 2..."
1,987,"[104.0, 104.0, 1284.0, 1284.0, 680.0, 680.0, 6..."
2,1606,"[78.0, 786.0, 624.0, 624.0, 248.0, 154.0, 108...."
3,1466,"[155.0, 548.0, 310.0, 310.0, 104.0, 54.0, 26.0..."
4,1224,"[70.0, 70.0, 70.0, 588.0, 588.0, 384.0, 384.0,..."
...,...,...
1996,319,"[66.0, 66.0, 1166.0, 450.0, 450.0, 450.0, 204...."
1997,1566,"[225.0, 225.0, 340.0, 206.0, 206.0, 206.0, 54...."
1998,1146,"[110.0, 110.0, 1456.0, 1456.0, 492.0, 492.0, 4..."
1999,1458,"[90.0, 618.0, 286.0, 286.0, 84.0, 62.0, 46.0, ..."


In [28]:
import os
import numpy as np
import pandas as pd

outdir = base_dir/ "structure" / "s_embeddings"
outdir.mkdir(parents=True, exist_ok=True)

contact_embeddings = np.array(contact_embeddings_df['protein_structural_embedding'].tolist())  
protein_names = np.array(contact_embeddings_df['protein_index'].tolist(), dtype=str)  

np.save(os.path.join(outdir, "protein_contact_embeddings.npy"), contact_embeddings)
np.save(os.path.join(outdir, "protein_index.npy"), protein_names)

print("Saved embeddings and protein names successfully.")

Saved embeddings and protein names successfully.
