In [1]:
# Cell 1: Install and Import Required Packages
# Install required packages
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter that runs this kernel.

    Raises ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    # ``sys.executable -m pip`` makes pip run under the same interpreter as this code.
    pip_command = [sys.executable, "-m", "pip", "install"]
    pip_command.append(package)
    subprocess.check_call(pip_command)

import importlib

# Import everything the notebook needs; if anything is missing, install the
# whole set once and retry. tqdm is part of the guarded set so that a missing
# tqdm is installed too instead of failing after the other packages import.
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    import pandas as pd
    import numpy as np
    from tqdm import tqdm
except ImportError:
    print("Installing required packages...")
    for package in ("transformers", "torch", "pandas", "numpy", "tqdm"):
        install_package(package)

    # pip added files after the interpreter started; clear the import system's
    # cached finders so the retry below can see the freshly installed packages.
    importlib.invalidate_caches()

    from transformers import AutoTokenizer, AutoModel
    import torch
    import pandas as pd
    import numpy as np
    from tqdm import tqdm

from datetime import datetime
import warnings

# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

print("All packages imported successfully!")

All packages imported successfully!


In [2]:
# Attach Google Drive to the Colab filesystem; the dataset is read from it later.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Cell 2: Load and Examine the Dataset
import numpy as np
import pandas as pd

# Read the parquet table from the mounted Drive folder
data_path = "/content/drive/MyDrive/scope_onside_common_v3.parquet/scope_onside_common_v3.parquet"
df = pd.read_parquet(data_path)

# Overview: dimensions, column names, and a preview of the first rows
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst few rows:")
print(df.head())

# Report on the 'smiles' column when it exists; otherwise list what is available
if 'smiles' not in df.columns:
    print("\nAvailable columns:", df.columns.tolist())
    print("Please check which column contains the SMILES data")
else:
    print(f"\nSMILES column found! Number of SMILES: {len(df['smiles'])}")
    print("Sample SMILES:")
    print(df['smiles'].iloc[:10].tolist())

    # Count missing SMILES entries
    null_smiles = df['smiles'].isna().sum()
    print(f"\nNull SMILES: {null_smiles}")

    # One row per distinct SMILES string (first occurrence wins)
    unique_smiles_df = df.drop_duplicates(subset=['smiles'])[['drug_chembl_id', 'smiles']]
    print(f"Unique SMILES: {len(unique_smiles_df)}")

Dataset shape: (34741, 7)
Columns: ['drug_chembl_id', 'target_uniprot_id', 'label', 'smiles', 'sequence', 'molfile_3d', 'rxcui']

First few rows:
  drug_chembl_id target_uniprot_id  label  \
0     CHEMBL1000            O15245      0   
1     CHEMBL1000            P08183      1   
2     CHEMBL1000            P35367      1   
3     CHEMBL1000            Q02763      0   
4     CHEMBL1000            Q12809      0   

                                        smiles  \
0  O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1   
1  O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1   
2  O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1   
3  O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1   
4  O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1   

                                            sequence  \
0  MPTVDDILEQVGESGWFQKQAFLILCLLSAAFAPICVGIVFLGFTP...   
1  MDLEGDRNGGAKKKNFFKLNNKSEKDKKEKKPTVSVFSMFRYSNWL...   
2  MSLPNSSCLLEDKMCEGNKTTMASPQLMPLVVVLSTICLVTVGLNL...   
3  MDSLASLVLCGVSLLLSGTVEGAMDLILINSLPLVSDAETSLTCIA...   
4 

In [4]:
# Cell 3: Initialize ChemBERTa Model and Tokenizer
from transformers import AutoTokenizer, AutoModel
import torch


def _load_chemberta(name, device):
    """Load the tokenizer and encoder for checkpoint ``name``.

    Args:
        name: Hugging Face model identifier to load.
        device: Device the model is moved to.

    Returns:
        A ``(tokenizer, model)`` pair; the model has been moved to ``device``
        and switched to evaluation mode.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    loaded_model = AutoModel.from_pretrained(name).to(device)
    loaded_model.eval()
    return loaded_tokenizer, loaded_model


# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize ChemBERTa tokenizer and model
model_name = "DeepChem/ChemBERTa-77M-MLM"
print(f"Loading ChemBERTa model: {model_name}")

# Only the load itself is guarded: a failure in the diagnostic prints below
# must not silently swap in a different checkpoint.
try:
    tokenizer, model = _load_chemberta(model_name, device)
    print("✅ ChemBERTa model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("Trying alternative model...")

    # Fallback to smaller model if needed. model_name is updated so that
    # anything recording it later reflects the checkpoint actually in use.
    model_name = "DeepChem/ChemBERTa-10M-MLM"
    tokenizer, model = _load_chemberta(model_name, device)
    print(f"✅ Loaded alternative model: {model_name}")

# Report the same statistics whichever checkpoint ended up loaded.
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Max sequence length: {tokenizer.model_max_length}")

Using device: cuda
Loading ChemBERTa model: DeepChem/ChemBERTa-77M-MLM


tokenizer_config.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/13.7M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ ChemBERTa model loaded successfully!
Model parameters: 3,427,440
Max sequence length: 512


In [5]:
# Cell 4: Extract Unique SMILES for Encoding
# Extract unique SMILES strings and their IDs
print("Extracting unique SMILES strings...")

# Get unique SMILES and their corresponding drug_chembl_ids.
# Missing SMILES are dropped first: a NaN entry would break both the length
# statistics below and the tokenizer later on.
# NOTE: drop_duplicates keeps the first row per SMILES, so when several
# drug_chembl_ids share one SMILES string only the first ID is retained.
unique_smiles_df = (
    df[['drug_chembl_id', 'smiles']]
    .dropna(subset=['smiles'])
    .drop_duplicates(subset=['smiles'])
)
print(f"Total unique SMILES: {len(unique_smiles_df)}")

# Extract parallel lists for encoding (same order, same length)
smiles_list = unique_smiles_df['smiles'].tolist()
drug_ids = unique_smiles_df['drug_chembl_id'].tolist()

print(f"SMILES to encode: {len(smiles_list)}")
print("Sample SMILES:")
for i, smi in enumerate(smiles_list[:5]):
    print(f"  {i}: {smi}")

# Check SMILES length distribution (character counts, not token counts)
smiles_lengths = [len(smi) for smi in smiles_list]
print(f"\nSMILES length statistics:")
if smiles_lengths:
    print(f"Mean length: {np.mean(smiles_lengths):.1f}")
    print(f"Max length: {max(smiles_lengths)}")
    print(f"Min length: {min(smiles_lengths)}")
    print(f"Std length: {np.std(smiles_lengths):.1f}")
else:
    # max()/min() raise on an empty sequence, so report the situation instead.
    print("No SMILES available - nothing to summarise.")

Extracting unique SMILES strings...
Total unique SMILES: 1028
SMILES to encode: 1028
Sample SMILES:
  0: O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1
  1: CC(C)(C)NC[C@H](O)c1ccc(O)c(CO)c1
  2: CN(C)CCOC(C)(c1ccccc1)c1ccccn1
  3: CCC(=O)N(c1ccccc1)C1(C(=O)OC)CCN(CCC(=O)OC)CC1
  4: N[C@@H](Cc1ccc(O)c(O)c1)C(=O)O

SMILES length statistics:
Mean length: 57.8
Max length: 404
Min length: 1
Std length: 39.7


In [6]:
# Cell 5: Define ChemBERTa Encoding Function
def _masked_mean_pool(token_embeddings, attention_mask):
    """Average token embeddings over the positions where ``attention_mask`` is 1."""
    mask = attention_mask.unsqueeze(-1).float()
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp so a row with no unmasked tokens divides by a tiny number, not zero.
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return summed / counts


@torch.no_grad()
def encode_smiles_chemberta(smiles_list, tokenizer, model, device, batch_size=32, max_length=512):
    """
    Encode SMILES strings using ChemBERTa model.

    Each SMILES is tokenized, passed through the model, and reduced to one
    vector by mean pooling the last hidden states over non-padding tokens.
    If a whole batch fails, its SMILES are retried one at a time so that a
    single bad entry does not discard its neighbours.

    Args:
        smiles_list: Sequence of SMILES strings
        tokenizer: ChemBERTa tokenizer
        model: ChemBERTa model
        device: Device to run on
        batch_size: Batch size for processing
        max_length: Maximum sequence length (longer inputs are truncated)

    Returns:
        embeddings: 2-D numpy array with one row per successfully encoded
            SMILES; has zero rows when nothing could be encoded
        valid_indices: indices into ``smiles_list`` of the encoded SMILES,
            in the same order as the rows of ``embeddings``
    """
    model.eval()
    smiles_list = list(smiles_list)
    chunks = []
    valid_indices = []

    def _encode(batch):
        # Tokenize, run the model, and pool one list of SMILES strings.
        encoded = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=True
        )
        encoded = {k: v.to(device) for k, v in encoded.items()}
        outputs = model(**encoded)
        pooled = _masked_mean_pool(outputs.last_hidden_state, encoded['attention_mask'])
        return pooled.cpu().numpy()

    print(f"Processing {len(smiles_list)} SMILES in batches of {batch_size}...")

    for start in tqdm(range(0, len(smiles_list), batch_size), desc="Encoding SMILES"):
        batch_smiles = smiles_list[start:start + batch_size]

        try:
            chunks.append(_encode(batch_smiles))
            valid_indices.extend(range(start, start + len(batch_smiles)))
        except Exception as e:
            print(f"Error processing batch {start//batch_size + 1}: {e}")
            print("  Retrying this batch one SMILES at a time...")
            for offset, smi in enumerate(batch_smiles):
                try:
                    chunks.append(_encode([smi]))
                    valid_indices.append(start + offset)
                except Exception as item_error:
                    # Best effort: skip only the entry that cannot be encoded.
                    print(f"  Skipping SMILES at index {start + offset}: {item_error}")

    if not chunks:
        # Keep the result 2-D so callers can still read ``embeddings.shape[1]``.
        # The width comes from the model config when it exposes one, else 0.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return np.empty((0, int(hidden_size)), dtype=np.float32), valid_indices

    return np.concatenate(chunks, axis=0), valid_indices

print("ChemBERTa encoding function defined successfully!")

ChemBERTa encoding function defined successfully!


In [7]:
# Cell 6: Encode SMILES using ChemBERTa
from datetime import datetime

print(f"Starting ChemBERTa encoding at {datetime.now().strftime('%H:%M:%S')}")
print(f"Encoding {len(smiles_list)} unique SMILES strings...")

# Encoding parameters
batch_size = 16  # Adjust based on your GPU memory (16-32 for most GPUs)
max_length = 512  # ChemBERTa's maximum sequence length

# Encode all SMILES
embeddings, valid_indices = encode_smiles_chemberta(
    smiles_list=smiles_list,
    tokenizer=tokenizer,
    model=model,
    device=device,
    batch_size=batch_size,
    max_length=max_length
)

print(f"Encoding completed at {datetime.now().strftime('%H:%M:%S')}")
print(f"Successfully encoded: {len(embeddings)} out of {len(smiles_list)} SMILES")
print(f"Embeddings shape: {embeddings.shape}")
# Indexing row 0 would raise if every batch failed, so only show a sample
# when at least one embedding exists.
if len(embeddings) > 0:
    print(f"Sample embedding (first 10 dimensions): {embeddings[0][:10]}")
else:
    print("No SMILES could be encoded - check the errors reported above.")

# The rows of `embeddings` are matched to SMILES and drug IDs purely by
# position, so the two must stay the same length.
assert len(embeddings) == len(valid_indices), "embedding rows and valid_indices are out of sync"

# Filter out failed encodings
successful_smiles = [smiles_list[i] for i in valid_indices]
successful_drug_ids = [drug_ids[i] for i in valid_indices]

# Avoid dividing by zero when there was nothing to encode.
if smiles_list:
    print(f"Success rate: {len(embeddings)/len(smiles_list)*100:.1f}%")
else:
    print("Success rate: n/a (no SMILES to encode)")

Starting ChemBERTa encoding at 21:14:26
Encoding 1028 unique SMILES strings...
Processing 1028 SMILES in batches of 16...


Encoding SMILES: 100%|██████████| 65/65 [00:01<00:00, 57.91it/s]

Encoding completed at 21:14:27
Successfully encoded: 1028 out of 1028 SMILES
Embeddings shape: (1028, 384)
Sample embedding (first 10 dimensions): [ 0.00867808 -0.08597195 -0.026606    0.11053231  0.07518024  0.03893149
  0.11562435  0.13709083  0.06989599  0.07896728]
Success rate: 100.0%





In [8]:
# Cell 7: Save ChemBERTa Embeddings to Parquet
# NOTE: this is a relative path, so the file lands in the kernel's current
# working directory rather than next to the input dataset.
output_path = "smiles_embeddings_chemberta.parquet"

# Width of each embedding vector; 0 when the array is not 2-D (nothing encoded).
embedding_dim = embeddings.shape[1] if embeddings.ndim == 2 else 0

# Create DataFrame with embeddings (one row per successfully encoded SMILES)
embeddings_df = pd.DataFrame({
    'drug_chembl_id': successful_drug_ids,
    'smiles': successful_smiles,
    'embedding': [emb.tolist() for emb in embeddings],  # Convert numpy arrays to lists
    'embedding_dim': [embedding_dim] * len(embeddings),
    'model_name': [model_name] * len(embeddings)
})

# Save to parquet
embeddings_df.to_parquet(output_path, index=False)

print(f"✅ ChemBERTa embeddings saved to: {output_path}")
print(f"Total embeddings saved: {len(embeddings_df)}")
print(f"Embedding dimension: {embedding_dim}")
print(f"Model used: {model_name}")

print("\nSaved DataFrame info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
embeddings_df.info()
print("\nSample saved data:")
print(embeddings_df.head(3))

# Memory cleanup: release cached GPU memory. `embeddings` is intentionally
# kept so this cell can be re-run without repeating the encoding step.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("\n🧹 Memory cleaned up!")

✅ ChemBERTa embeddings saved to: smiles_embeddings_chemberta.parquet
Total embeddings saved: 1028
Embedding dimension: 384
Model used: DeepChem/ChemBERTa-77M-MLM

Saved DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   drug_chembl_id  1028 non-null   object
 1   smiles          1028 non-null   object
 2   embedding       1028 non-null   object
 3   embedding_dim   1028 non-null   int64 
 4   model_name      1028 non-null   object
dtypes: int64(1), object(4)
memory usage: 40.3+ KB
None

Sample saved data:
  drug_chembl_id                                       smiles  \
0     CHEMBL1000  O=C(O)COCCN1CCN(C(c2ccccc2)c2ccc(Cl)cc2)CC1   
1     CHEMBL1002            CC(C)(C)NC[C@H](O)c1ccc(O)c(CO)c1   
2     CHEMBL1004               CN(C)CCOC(C)(c1ccccc1)c1ccccn1   

                                           embedding  embedding_