In [None]:
# Install required packages for MS-to-Structure pipeline
!pip install torch torch_geometric rdkit-pypi selfies datasets optuna nltk python-Levenshtein tqdm scikit-learn matplotlib xgboost faiss-cpu sentence-transformers

# MS-to-Structure Deep Learning Pipeline (Jupyter Version)

This notebook implements a robust mass spectrometry-to-structure (MS-to-structure) deep learning pipeline, adapted for interactive use. It includes data preprocessing, molecular string handling with SELFIES, model definition, training, and evaluation.

In [None]:
# Import libraries and set up logging for Jupyter compatibility
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch_geometric.nn import MessagePassing, global_mean_pool
from torch_geometric.data import Data, Batch
from datasets import load_dataset
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors, rdFMCS, EnumerateStereoisomers
from rdkit import DataStructs
from rdkit.Chem import rdFingerprintGenerator
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import selfies as sf
import optuna
from nltk.translate.bleu_score import sentence_bleu
from Levenshtein import distance
import logging
import traceback
import math
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import faiss
from sentence_transformers import SentenceTransformer

# Setup logging for Jupyter (prints to stdout)
import sys

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    # basicConfig attaches a stderr handler unless a stream is given; pass
    # stdout explicitly so the behaviour matches the intent stated above.
    stream=sys.stdout,
)


In [None]:
# Set random seed for reproducibility and define global variables
# Seeds NumPy's global RNG and PyTorch's default RNG with the same value.
np.random.seed(42)
torch.manual_seed(42)

# Module-level special-token strings.
# NOTE(review): Config declares its own token constants and the tokenizer cells
# read them from `config`; confirm whether these module-level names are still used.
PAD_TOKEN = "<PAD>"
SOS_TOKEN = "<SOS>"
EOS_TOKEN = "<EOS>"
MASK_TOKEN = "[MASK]"

# GPU optimization for RTX 3080 Ti
import os
# '0' requests the default (non-blocking) kernel-launch mode — TODO confirm this
# needs to be set explicitly at all.
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'
if torch.cuda.is_available():
    # Seed every visible CUDA device.
    torch.cuda.manual_seed_all(42)
    # NOTE(review): cuDNN autotuning can select different kernels between runs,
    # which may work against the reproducibility goal above — confirm the trade-off.
    torch.backends.cudnn.benchmark = True
    torch.cuda.empty_cache()
    # Cap this process at 95% of the device memory.
    torch.cuda.set_per_process_memory_fraction(0.95)

# Global device used by the training functions defined later in the notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name()}')


In [None]:
# Production Configuration
class Config:
    """Single place for the tunable settings used throughout the notebook.

    All values are class attributes, so they can be read either from the class
    or from an instance (``config = Config()``).
    """

    # --- Data ---
    DATASET_PATH = '/kaggle/input/tandem'  # Change to your dataset path
    TRAIN_SPLIT = 0.9       # fraction of rows assigned to the training frame
    RANDOM_SEED = 42

    # --- Spectrum binning / augmentation ---
    N_BINS = 1000
    MAX_MZ = 1000
    NOISE_LEVEL = 0.05
    MAX_ISOMERS = 8         # default cap used by augment_smiles

    # --- Model ---
    D_MODEL = 512
    NHEAD = 8
    NUM_LAYERS = 6

    # --- Training ---
    BATCH_SIZE = 64
    SSL_EPOCHS = 3
    SUPERVISED_EPOCHS = 30
    LEARNING_RATE = 1e-4
    PATIENCE = 5
    N_FOLDS = 5

    # --- Token definitions ---
    PAD_TOKEN = '<PAD>'
    # Was '< SOS >' (with stray spaces); aligned with the module-level
    # SOS_TOKEN = "<SOS>" so both spellings of the start token agree.
    SOS_TOKEN = '<SOS>'
    EOS_TOKEN = '<EOS>'
    MASK_TOKEN = '[MASK]'

config = Config()

# Load dataset with configurable path
try:
    dataset = load_dataset(config.DATASET_PATH, split='train')
    df = pd.DataFrame(dataset)
except Exception as e:
    # Report the problem and point at the setting to change, then re-raise so
    # the notebook does not continue without data.
    print(f'Error loading dataset: {e}')
    print('Please update config.DATASET_PATH')
    raise
else:
    print(f'Loaded dataset with {len(df)} samples')

# Split dataset based on configuration: the first TRAIN_SPLIT fraction of rows
# (in stored order) is the training frame, the remainder the external test frame.
split_idx = int(len(df) * config.TRAIN_SPLIT)
df_massspecgym = df.iloc[:split_idx]
df_external = df.iloc[split_idx:]
print(f"MassSpecGym size: {len(df_massspecgym)} External test size: {len(df_external)}")

# Inspect dataset
preview_columns = ['identifier', 'mzs', 'intensities', 'smiles', 'adduct', 'precursor_mz']
print(f"Dataset Columns: {df_massspecgym.columns.tolist()}")
print("\nFirst few rows of MassSpecGym dataset:")
print(df_massspecgym[preview_columns].head())
print(f"\nUnique adduct values: {df_massspecgym['adduct'].unique()}")


In [None]:
# Canonicalize SMILES, augment, and bin spectra
def canonicalize_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or None if unusable.

    None is returned both when RDKit cannot build a molecule from the input and
    when parsing raises; in the latter case the error and traceback are logged.
    """
    try:
        parsed = Chem.MolFromSmiles(smiles, sanitize=True)
        return Chem.MolToSmiles(parsed, canonical=True) if parsed else None
    except Exception as e:
        logging.error(f"canonicalize_smiles failed for {smiles}: {e}\n{traceback.format_exc()}")
        return None

def augment_smiles(smiles, max_isomers=None):
    """Expand one SMILES into SMILES strings of its enumerated stereoisomers.

    Args:
        smiles: input SMILES string.
        max_isomers: cap on enumerated isomers; any falsy value (None, 0)
            falls back to ``config.MAX_ISOMERS``.

    Returns:
        A list with one SMILES per enumerated stereoisomer (written with
        ``doRandom=True``), or ``[smiles]`` when the input cannot be parsed or
        an exception occurs (the exception is logged).
    """
    isomer_cap = max_isomers or config.MAX_ISOMERS
    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return [smiles]
        options = EnumerateStereoisomers.EnumerateStereoisomersOptions()
        options.maxIsomers = isomer_cap
        variants = []
        for isomer in EnumerateStereoisomers.EnumerateStereoisomers(mol, options=options):
            variants.append(Chem.MolToSmiles(isomer, canonical=True, doRandom=True))
        return variants
    except Exception as e:
        logging.error(f"augment_smiles failed for {smiles}: {e}\n{traceback.format_exc()}")
        return [smiles]

def bin_spectrum_to_graph(mzs, intensities, ion_mode, precursor_mz, adduct, n_bins=1000, max_mz=1000, noise_level=0.05):
    """Bin a peak list into a fixed-length spectrum and wrap it as a chain graph.

    Args:
        mzs: iterable of peak m/z values (anything ``float()`` accepts).
        intensities: iterable of peak intensities, paired with ``mzs``.
        ion_mode: numeric ion-mode flag, stored on the graph as a 1-element tensor.
        precursor_mz: precursor m/z, stored on the graph as a 1-element tensor.
        adduct: adduct label; mapped through the notebook-level ``adduct_to_idx``
            dict (unknown labels map to 0).
        n_bins: number of equal-width bins covering ``[0, max_mz)``.
        max_mz: upper m/z limit; peaks outside ``[0, max_mz)`` are ignored.
        noise_level: standard deviation of the Gaussian noise added to the
            normalized spectrum.

    Returns:
        ``(spectrum, graph)`` where ``spectrum`` is a NumPy array of length
        ``n_bins`` with values in [0, 1], and ``graph`` is a ``Data`` object whose
        nodes are the bins (feature = bin value) and whose edges link
        neighbouring bins in both directions. On any unexpected error an
        all-zero spectrum and an edge-less graph are returned and the error is
        logged.

    NOTE(review): noise is drawn on every call, so frames intended for
    evaluation are perturbed too unless ``noise_level=0`` is passed — confirm
    this is intended.
    """
    try:
        spectrum = np.zeros(n_bins)
        for mz, intensity in zip(mzs, intensities):
            try:
                mz = float(mz)
                intensity = float(intensity)
            except (ValueError, TypeError) as e:
                logging.warning(f"bin_spectrum_to_graph: Skipping value error: {e}")
                continue
            # Peaks below 0 used to produce a negative index that silently
            # wrapped around to the end of the array (and -inf raised an
            # OverflowError that zeroed the whole spectrum); skip them, along
            # with non-finite intensities.
            if not (0 <= mz < max_mz) or not np.isfinite(intensity):
                continue
            bin_idx = min(int((mz / max_mz) * n_bins), n_bins - 1)
            spectrum[bin_idx] += intensity
        if spectrum.max() > 0:
            spectrum = spectrum / spectrum.max()
        # Clip the *noisy spectrum* to [0, 1]. Previously only the noise term
        # was clipped, which made the noise strictly non-negative and let the
        # result exceed 1.
        noise = np.random.normal(0, noise_level, spectrum.shape)
        spectrum = (spectrum + noise).clip(0, 1)
        x = torch.tensor(spectrum, dtype=torch.float).unsqueeze(-1)
        # Chain graph: bin i <-> bin i+1, edges interleaved as
        # (i, i+1), (i+1, i). Always has shape (2, 2 * (n_bins - 1)).
        left = torch.arange(n_bins - 1, dtype=torch.long)
        right = left + 1
        forward_edges = torch.stack([left, right])
        backward_edges = torch.stack([right, left])
        edge_index = torch.stack([forward_edges, backward_edges], dim=2).reshape(2, -1)
        ion_mode = torch.tensor([ion_mode], dtype=torch.float)
        precursor_mz = torch.tensor([precursor_mz], dtype=torch.float)
        adduct_idx = adduct_to_idx.get(adduct, 0)
        return spectrum, Data(x=x, edge_index=edge_index, ion_mode=ion_mode, precursor_mz=precursor_mz, adduct_idx=adduct_idx)
    except Exception as e:
        logging.error(f"bin_spectrum_to_graph failed: {e}\n{traceback.format_exc()}")
        return np.zeros(n_bins), Data(x=torch.zeros(n_bins, 1), edge_index=torch.zeros(2, 0, dtype=torch.long), ion_mode=torch.zeros(1), precursor_mz=torch.zeros(1), adduct_idx=0)


In [None]:
# Apply canonicalization, augmentation, and binning to the dataframe
# Preprocess ion mode, precursor m/z, and adducts
def _ion_mode_from_adduct(adduct):
    """Return 1 for a negatively charged adduct label, otherwise 0.

    The charge is read from the trailing sign of the label ('[M+Cl]-' is
    negative even though it contains '+'). Labels without a trailing sign fall
    back to the earlier "contains '+' / contains '-'" heuristic.
    """
    label = str(adduct).strip()
    if label.endswith('-'):
        return 1
    if label.endswith('+'):
        return 0
    return 0 if '+' in label else 1 if '-' in label else 0


def _bin_row(row):
    """Bin one dataframe row into (binned spectrum, graph) using the Config settings."""
    return pd.Series(bin_spectrum_to_graph(
        row['mzs'], row['intensities'], row['ion_mode'], row['precursor_mz'], row['adduct'],
        n_bins=config.N_BINS, max_mz=config.MAX_MZ, noise_level=config.NOISE_LEVEL,
    ))


# .assign() builds new frames, so nothing is written into the iloc slices of
# `df` (which triggered SettingWithCopyWarning).
df_massspecgym = (
    df_massspecgym
    .assign(smiles=df_massspecgym['smiles'].apply(canonicalize_smiles))
    .dropna(subset=['smiles'])
)
df_external = (
    df_external
    .assign(smiles=df_external['smiles'].apply(canonicalize_smiles))
    .dropna(subset=['smiles'])
    .reset_index(drop=True)
)

# Augment the training SMILES in place of the original column. The previous
# explode + rename left TWO columns called 'smiles'; the index is reset
# because explode repeats index labels, which breaks the aligned column
# assignments below.
df_massspecgym = (
    df_massspecgym
    .assign(smiles=df_massspecgym['smiles'].apply(augment_smiles))
    .explode('smiles')
    .dropna(subset=['smiles'])
    .reset_index(drop=True)
)

df_massspecgym['ion_mode'] = df_massspecgym['adduct'].apply(_ion_mode_from_adduct)
df_external['ion_mode'] = df_external['adduct'].apply(_ion_mode_from_adduct)

# Quantile edges are learned on the training frame and reused for the external
# frame so that a given bin index means the same m/z range in both.
df_massspecgym['precursor_bin'], precursor_edges = pd.qcut(
    df_massspecgym['precursor_mz'], q=100, labels=False, retbins=True, duplicates='drop'
)
df_external['precursor_bin'] = pd.cut(
    df_external['precursor_mz'].clip(precursor_edges[0], precursor_edges[-1]),
    bins=precursor_edges, labels=False, include_lowest=True
)

adduct_types = df_massspecgym['adduct'].unique()
adduct_to_idx = {adduct: i for i, adduct in enumerate(adduct_types)}
# Adducts absent from the training frame map to index 0, matching the
# `adduct_to_idx.get(adduct, 0)` fallback inside bin_spectrum_to_graph.
df_massspecgym['adduct_idx'] = df_massspecgym['adduct'].map(adduct_to_idx).fillna(0).astype(int)
df_external['adduct_idx'] = df_external['adduct'].map(adduct_to_idx).fillna(0).astype(int)

df_massspecgym[['binned', 'graph_data']] = df_massspecgym.apply(_bin_row, axis=1)
df_external[['binned', 'graph_data']] = df_external.apply(_bin_row, axis=1)


In [None]:
# Extract features for XGBoost from tabular data
def extract_tabular_features(df):
    """Build one 8-value feature row per dataframe row.

    Per row, in order: mean, standard deviation and maximum of ``binned``; the
    number of bins above 0.1; ``precursor_mz``; ``ion_mode``; ``adduct_idx``;
    and the number of entries in ``mzs``.

    Returns:
        ``np.array`` built from the list of feature rows.
    """
    rows = [
        [
            np.mean(record['binned']),
            np.std(record['binned']),
            np.max(record['binned']),
            np.sum(record['binned'] > 0.1),
            record['precursor_mz'],
            record['ion_mode'],
            record['adduct_idx'],
            len(record['mzs']),
        ]
        for _, record in df.iterrows()
    ]
    return np.array(rows)

X_train = extract_tabular_features(df_massspecgym)
X_test = extract_tabular_features(df_external)

le = LabelEncoder()
y_train = le.fit_transform(df_massspecgym['smiles'])

# LabelEncoder.transform raises ValueError for labels it was not fitted on, and
# a classifier trained on y_train cannot predict such classes anyway. Keep only
# external rows whose SMILES occur among the training labels, filtering X_test
# with the same mask so features and labels stay aligned.
seen_mask = df_external['smiles'].isin(le.classes_).to_numpy()
n_unseen = int((~seen_mask).sum())
if n_unseen:
    print(f'Skipping {n_unseen} external samples whose SMILES do not occur in the training labels')
X_test = X_test[seen_mask]
y_test = le.transform(df_external.loc[seen_mask, 'smiles'])

print(f'Training features shape: {X_train.shape}')
print(f'Test features shape: {X_test.shape}')
print(f'Number of unique SMILES: {len(le.classes_)}')

In [None]:
# Train XGBoost model
# Gradient-boosted classifier over the eight tabular spectrum features; the
# targets in y_train are label-encoded SMILES strings, so each distinct SMILES
# is treated as its own class.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42
)

print('Training XGBoost model...')
xgb_model.fit(X_train, y_train)

# Predicted class indices for the external set (same encoding as y_test).
y_pred = xgb_model.predict(X_test)

# Exact-match accuracy between encoded true and predicted labels.
accuracy = accuracy_score(y_test, y_pred)
print(f'XGBoost Accuracy: {accuracy:.4f}')

# Names listed in the column order produced by extract_tabular_features.
feature_names = ['mean_intensity', 'std_intensity', 'max_intensity', 'peak_count', 'precursor_mz', 'ion_mode', 'adduct_idx', 'spectrum_length']
importance = xgb_model.feature_importances_
for name, imp in zip(feature_names, importance):
    print(f'{name}: {imp:.4f}')

In [None]:
# Display results
# Decode up to five (true, predicted) label pairs back to SMILES and compare them.
print('\nSample predictions:')
for i, (true_label, pred_label) in enumerate(zip(y_test[:5], y_pred[:5])):
    true_smiles = le.inverse_transform([true_label])[0]
    pred_smiles = le.inverse_transform([pred_label])[0]
    is_match = true_smiles == pred_smiles
    print(f'True: {true_smiles}')
    print(f'Pred: {pred_smiles}')
    print(f'Match: {is_match}\n')

In [None]:
# Enhanced RAG System for Molecular Data
class MolecularRAG:
    """Retrieval helper over a dataframe of molecules.

    Two views of the same rows are indexed:

    * a *semantic* index: sentence embeddings of a generated text description
      of each row, searched by cosine similarity (FAISS inner product over
      L2-normalized vectors);
    * a *structure* index: Morgan fingerprints (radius 2, 2048 bits), searched
      by Tanimoto similarity.

    The dataframe must provide the columns ``smiles``, ``adduct``,
    ``precursor_mz`` and ``ion_mode``.
    """

    _FP_SIZE = 2048  # Morgan fingerprint length used for every bit vector

    def __init__(self, df):
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.df = df.copy()
        self.morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=self._FP_SIZE)
        self.build_molecular_descriptions()
        self.build_index()
        self.build_fingerprint_index()

    def get_molecular_properties(self, smiles):
        """Return a dict of RDKit descriptors (mw, logp, hbd, hba, rings, aromatic).

        All values are 0 when the SMILES cannot be parsed or a descriptor fails.
        """
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol:
                return {
                    'mw': Descriptors.MolWt(mol),
                    'logp': Descriptors.MolLogP(mol),
                    'hbd': Descriptors.NumHDonors(mol),
                    'hba': Descriptors.NumHAcceptors(mol),
                    'rings': Descriptors.RingCount(mol),
                    'aromatic': Descriptors.NumAromaticRings(mol),
                }
        except Exception as e:
            # Best effort: keep going with zeroed properties, but do not hide
            # KeyboardInterrupt/SystemExit the way a bare `except:` did.
            logging.debug(f"get_molecular_properties failed for {smiles}: {e}")
        return {'mw': 0, 'logp': 0, 'hbd': 0, 'hba': 0, 'rings': 0, 'aromatic': 0}

    def build_molecular_descriptions(self):
        """Generate one English description per row and store them in ``self.descriptions``."""
        descriptions = []
        for _, row in self.df.iterrows():
            props = self.get_molecular_properties(row['smiles'])
            desc = f"Molecule with SMILES {row['smiles']}. "
            desc += f"Molecular weight: {props['mw']:.1f} Da. "
            desc += f"LogP: {props['logp']:.2f}. "
            desc += f"H-bond donors: {props['hbd']}, acceptors: {props['hba']}. "
            desc += f"Contains {props['rings']} rings, {props['aromatic']} aromatic. "
            desc += f"Adduct: {row['adduct']}, precursor m/z: {row['precursor_mz']:.2f}. "
            desc += f"Ion mode: {'positive' if row['ion_mode'] == 0 else 'negative'}."
            descriptions.append(desc)
        self.descriptions = descriptions

    def build_index(self):
        """Embed the descriptions and build the cosine-similarity FAISS index."""
        print('Building semantic index...')
        embeddings = self.encoder.encode(self.descriptions, show_progress_bar=True)
        # FAISS works on contiguous float32 arrays; convert before normalizing
        # in place so normalization and indexing see the same buffer.
        self.embeddings = np.ascontiguousarray(embeddings, dtype='float32')
        faiss.normalize_L2(self.embeddings)
        self.semantic_index = faiss.IndexFlatIP(self.embeddings.shape[1])
        self.semantic_index.add(self.embeddings)

    def _fingerprint_array(self, mol):
        """Return the Morgan bit vector of ``mol`` as float32 (all zeros if ``mol`` is None)."""
        fp_array = np.zeros(self._FP_SIZE, dtype=np.float32)
        if mol is not None:
            DataStructs.ConvertToNumpyArray(self.morgan_gen.GetFingerprint(mol), fp_array)
        return fp_array

    def _tanimoto_to_index(self, query_array):
        """Tanimoto similarity between one bit vector and every indexed fingerprint.

        For bit vectors a and b: |a & b| / (|a| + |b| - |a & b|); defined as 0
        when both vectors are empty.
        """
        common = (self.fingerprints @ query_array).astype(np.float64)
        union = self.fp_counts + float(query_array.sum()) - common
        return np.divide(common, union, out=np.zeros_like(common), where=union > 0)

    def build_fingerprint_index(self):
        """Compute fingerprints for every row (zeros for unparsable SMILES)."""
        print('Building fingerprint index...')
        fingerprints = [self._fingerprint_array(Chem.MolFromSmiles(smiles)) for smiles in self.df['smiles']]
        self.fingerprints = np.array(fingerprints, dtype=np.float32).reshape(-1, self._FP_SIZE)
        # On-bit count per fingerprint, needed for the Tanimoto denominator.
        self.fp_counts = self.fingerprints.sum(axis=1).astype(np.float64)
        self.fp_index = faiss.IndexFlatIP(self._FP_SIZE)
        self.fp_index.add(self.fingerprints)

    def semantic_search(self, query, k=5):
        """Return up to ``k`` rows whose description is most similar to ``query``.

        Each hit is a dict with ``smiles``, ``score`` (cosine similarity),
        ``adduct``, ``precursor_mz`` and ``description``.
        """
        query_emb = np.ascontiguousarray(self.encoder.encode([query]), dtype='float32')
        faiss.normalize_L2(query_emb)
        k = min(k, self.semantic_index.ntotal)
        if k <= 0:
            return []
        scores, indices = self.semantic_index.search(query_emb, k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < 0:
                # FAISS pads missing neighbours with -1, which `iloc` would
                # silently interpret as "last row".
                continue
            row = self.df.iloc[idx]
            results.append({
                'smiles': row['smiles'],
                'score': score,
                'adduct': row['adduct'],
                'precursor_mz': row['precursor_mz'],
                'description': self.descriptions[idx]
            })
        return results

    def structure_search(self, query_smiles, k=5):
        """Return up to ``k`` rows most similar to ``query_smiles`` by Tanimoto.

        Each hit is a dict with ``smiles``, ``tanimoto`` (in [0, 1]), ``adduct``
        and ``precursor_mz``. An unparsable query yields an empty list.

        The score used to be the raw inner product of the bit vectors (the
        number of shared on-bits), which is not a Tanimoto coefficient and,
        being unbounded, dominated the weighted sum in ``hybrid_search``.
        """
        mol = Chem.MolFromSmiles(query_smiles)
        if not mol:
            return []
        tanimoto = self._tanimoto_to_index(self._fingerprint_array(mol))
        k = min(k, len(tanimoto))
        if k <= 0:
            return []
        top_indices = np.argsort(-tanimoto, kind='stable')[:k]
        results = []
        for idx in top_indices:
            row = self.df.iloc[idx]
            results.append({
                'smiles': row['smiles'],
                'tanimoto': float(tanimoto[idx]),
                'adduct': row['adduct'],
                'precursor_mz': row['precursor_mz']
            })
        return results

    def hybrid_search(self, query, query_smiles=None, k=5, alpha=0.7):
        """Combine semantic and structure search.

        With ``query_smiles`` the result is a list of dicts with ``smiles``,
        ``hybrid_score`` (``alpha * semantic + (1 - alpha) * structure``),
        ``semantic_score`` and ``structure_score``, best first. Without it the
        plain ``semantic_search`` hits are returned.
        """
        semantic_results = self.semantic_search(query, k*2)
        if not query_smiles:
            return semantic_results[:k]
        structure_results = self.structure_search(query_smiles, k*2)
        # Combine scores per SMILES string. Hits arrive best-first, so the
        # first occurrence of a duplicated SMILES carries its best score;
        # later duplicates must not overwrite it.
        combined = {}
        for r in semantic_results:
            combined.setdefault(r['smiles'], {'semantic': r['score'], 'structure': 0, 'data': r})
        for r in structure_results:
            entry = combined.setdefault(r['smiles'], {'semantic': 0, 'structure': 0, 'data': r})
            entry['structure'] = max(entry['structure'], r['tanimoto'])
        # Hybrid scoring
        for entry in combined.values():
            entry['hybrid_score'] = alpha * entry['semantic'] + (1-alpha) * entry['structure']
        ranked = sorted(combined.items(), key=lambda item: item[1]['hybrid_score'], reverse=True)
        return [
            {'smiles': smiles, 'hybrid_score': data['hybrid_score'], 'semantic_score': data['semantic'], 'structure_score': data['structure']}
            for smiles, data in ranked[:k]
        ]

# Constructing MolecularRAG runs its three build steps (descriptions, semantic
# index, fingerprint index) over the training frame.
print('Initializing enhanced RAG system...')
rag_system = MolecularRAG(df_massspecgym)
print('RAG system ready!')

In [None]:
# Semantic Search Examples
# Run a few free-text queries and print the three best-matching molecules each.
queries = [
    'aromatic compound with hydroxyl group',
    'small molecule with high logP',
    'compound with multiple rings and nitrogen',
]

for query in queries:
    print(f'\nQuery: {query}')
    results = rag_system.semantic_search(query, k=3)
    for rank, hit in enumerate(results, start=1):
        print(f'{rank}. SMILES: {hit["smiles"]} (Score: {hit["score"]:.4f})')
        print(f'   Adduct: {hit["adduct"]}, m/z: {hit["precursor_mz"]:.2f}')

In [None]:
# Structure-based Search
# Look up the five indexed molecules closest to the query structure.
query_smiles = 'c1ccccc1O'  # phenol
print(f'Structure search for: {query_smiles}')
results = rag_system.structure_search(query_smiles, k=5)
for rank, hit in enumerate(results, start=1):
    print(f'{rank}. SMILES: {hit["smiles"]} (Tanimoto: {hit["tanimoto"]:.4f})')
    print(f'   Adduct: {hit["adduct"]}, m/z: {hit["precursor_mz"]:.2f}')

In [None]:
# Hybrid Search (Semantic + Structure)
# Combine a text query with a structure query and print the blended ranking.
text_query = 'benzene derivative with oxygen'
structure_query = 'c1ccccc1O'
print(f'Hybrid search - Text: "{text_query}", Structure: {structure_query}')
results = rag_system.hybrid_search(text_query, structure_query, k=5)
for rank, hit in enumerate(results, start=1):
    print(f'{rank}. SMILES: {hit["smiles"]}')
    print(f'   Hybrid: {hit["hybrid_score"]:.4f}, Semantic: {hit["semantic_score"]:.4f}, Structure: {hit["structure_score"]:.4f}')

In [None]:
# RAG System Analysis
print('RAG System Statistics:')
print(f'Total molecules indexed: {len(rag_system.df)}')
print(f'Embedding dimension: {rag_system.embeddings.shape[1]}')
print(f'Fingerprint dimension: {rag_system.fingerprints.shape[1]}')

# Sample molecular properties distribution (first 100 rows only)
mw_values = [rag_system.get_molecular_properties(smiles)['mw'] for smiles in rag_system.df['smiles'].head(100)]
print(f'Sample MW range: {min(mw_values):.1f} - {max(mw_values):.1f} Da')

# Test query performance
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump.
import time
start = time.perf_counter()
_ = rag_system.semantic_search('test query', k=10)
semantic_time = time.perf_counter() - start

start = time.perf_counter()
_ = rag_system.structure_search('CCO', k=10)
structure_time = time.perf_counter() - start

print(f'Semantic search time: {semantic_time:.4f}s')
print(f'Structure search time: {structure_time:.4f}s')

In [None]:
# SELFIES tokenization and vocabulary setup
all_smiles = df_massspecgym['smiles'].tolist()
all_selfies = [sf.encoder(s) for s in all_smiles]

# sf.split_selfies returns an iterator, so it has no len(); materialize each
# token list once and use it for both the alphabet and the longest length.
selfies_alphabet = set()
longest_selfies = 0
for s in all_selfies:
    symbols = list(sf.split_selfies(s))
    selfies_alphabet.update(symbols)
    longest_selfies = max(longest_selfies, len(symbols))

# Special tokens come first, so PAD is index 0 (the training losses ignore index 0).
selfies_tokens = [config.PAD_TOKEN, config.SOS_TOKEN, config.EOS_TOKEN, config.MASK_TOKEN] + sorted(selfies_alphabet)
token_to_idx = {tok: i for i, tok in enumerate(selfies_tokens)}
idx_to_token = {i: tok for tok, i in token_to_idx.items()}
vocab_size = len(token_to_idx)
PRETRAIN_MAX_LEN = 100
# +2 leaves room for the SOS and EOS tokens around the longest molecule.
SUPERVISED_MAX_LEN = longest_selfies + 2
print(f"SELFIES vocabulary size: {vocab_size}, Supervised MAX_LEN: {SUPERVISED_MAX_LEN}, Pretrain MAX_LEN: {PRETRAIN_MAX_LEN}")

def encode_selfies(selfies, max_len=PRETRAIN_MAX_LEN):
    """Convert a SELFIES string into a fixed-length list of token ids.

    The sequence is ``SOS + symbols + EOS``, with the symbols truncated so the
    total never exceeds ``max_len``, then right-padded with the PAD id.
    Symbols missing from ``token_to_idx`` are encoded as PAD.

    Args:
        selfies: SELFIES string to encode.
        max_len: length of the returned list.

    Returns:
        A list of ``max_len`` integer token ids.
    """
    pad_idx = token_to_idx[config.PAD_TOKEN]
    # split_selfies yields symbols lazily and cannot be sliced directly.
    symbols = list(sf.split_selfies(selfies))[:max(max_len - 2, 0)]
    tokens = [config.SOS_TOKEN] + symbols + [config.EOS_TOKEN]
    token_ids = [token_to_idx.get(tok, pad_idx) for tok in tokens]
    if len(token_ids) > max_len:
        token_ids = token_ids[:max_len]
    else:
        token_ids += [pad_idx] * (max_len - len(token_ids))
    return token_ids

def decode_selfies(token_ids):
    """Convert a sequence of token ids back into a SMILES string.

    Decoding stops at the first EOS id, so anything emitted after the end
    marker is ignored; PAD and SOS tokens are dropped. Ids may be plain ints or
    0-d tensor/NumPy scalars. Unknown ids are treated as PAD.

    Returns:
        The decoded SMILES, or ``""`` when the SELFIES decoder raises.
    """
    eos_idx = token_to_idx[config.EOS_TOKEN]
    skipped = {config.PAD_TOKEN, config.SOS_TOKEN, config.EOS_TOKEN}
    tokens = []
    for idx in token_ids:
        idx = int(idx)  # tensor elements do not hash like ints in dict lookups
        if idx == eos_idx:
            break
        tok = idx_to_token.get(idx, config.PAD_TOKEN)
        if tok not in skipped:
            tokens.append(tok)
    selfies_str = ''.join(tokens)
    try:
        return sf.decoder(selfies_str)
    except Exception:
        return ""


In [None]:
# Precompute Morgan fingerprints for all unique SMILES
# Maps each parsable SMILES from either frame to its Morgan fingerprint
# (radius 2, 2048 bits); unparsable strings are left out of the dict.
all_smiles = list(set(df_massspecgym['smiles'].tolist() + df_external['smiles'].tolist()))
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
all_fingerprints = {}
for smiles in all_smiles:
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        continue
    all_fingerprints[smiles] = morgan_gen.GetFingerprint(mol)


In [None]:
# Dataset class for MS/MS data
class MSMSDataset(Dataset):
    """Torch dataset over a preprocessed spectra dataframe.

    The dataframe must provide ``binned``, ``graph_data``, ``ion_mode``,
    ``precursor_bin``, ``adduct_idx`` and ``smiles``.

    Args:
        dataframe: preprocessed frame.
        max_len: token length used for the SSL (masked) encodings. The
            supervised path always pads to the notebook-level
            ``SUPERVISED_MAX_LEN`` and ignores this argument.
        is_ssl: when True each item also carries a masked copy of the target.

    Items (supervised): ``(spectrum, graph, target_ids, ion_mode,
    precursor_bin, adduct_idx, raw_smiles)``. In SSL mode ``masked_ids`` is
    inserted right after ``target_ids``.
    """

    def __init__(self, dataframe, max_len=PRETRAIN_MAX_LEN, is_ssl=False):
        self.spectra = np.stack(dataframe['binned'].values)
        self.graph_data = dataframe['graph_data'].values
        self.ion_modes = dataframe['ion_mode'].values
        self.precursor_bins = dataframe['precursor_bin'].values
        self.adduct_indices = dataframe['adduct_idx'].values
        self.raw_smiles = dataframe['smiles'].values
        self.is_ssl = is_ssl
        if is_ssl:
            self.smiles = []
            self.masked_smiles = []
            for s in self.raw_smiles:
                selfies = sf.encoder(s)
                masked_s, orig_s = self.mask_selfies(selfies)
                self.smiles.append(encode_selfies(orig_s, max_len))
                self.masked_smiles.append(encode_selfies(masked_s, max_len))
        else:
            self.smiles = [encode_selfies(sf.encoder(s), max_len=SUPERVISED_MAX_LEN) for s in self.raw_smiles]

    def mask_selfies(self, selfies, mask_ratio=0.10):
        """Replace a random ``mask_ratio`` share of SELFIES symbols with the MASK token.

        Returns:
            ``(masked, original)`` SELFIES strings, both truncated to
            ``PRETRAIN_MAX_LEN - 2`` symbols. On error the input is returned
            unchanged for both and the error is logged.
        """
        try:
            # split_selfies yields an iterator; slicing it directly raised a
            # TypeError that the handler below swallowed, so nothing was ever
            # masked. Materialize the symbols first.
            tokens = list(sf.split_selfies(selfies))[:PRETRAIN_MAX_LEN-2]
            masked_tokens = list(tokens)
            n_mask = int(mask_ratio * len(tokens))
            if n_mask > 0:
                mask_indices = np.random.choice(len(tokens), n_mask, replace=False)
                for idx in mask_indices:
                    masked_tokens[idx] = config.MASK_TOKEN
            return ''.join(masked_tokens), ''.join(tokens)
        except Exception as e:
            logging.error(f"mask_selfies failed for {selfies}: {e}\n{traceback.format_exc()}")
            return selfies, selfies

    def __len__(self):
        return len(self.spectra)

    def __getitem__(self, idx):
        spectrum = torch.tensor(self.spectra[idx], dtype=torch.float)
        target = torch.tensor(self.smiles[idx], dtype=torch.long)
        metadata = (
            torch.tensor(self.ion_modes[idx], dtype=torch.long),
            torch.tensor(self.precursor_bins[idx], dtype=torch.long),
            torch.tensor(self.adduct_indices[idx], dtype=torch.long),
            self.raw_smiles[idx],
        )
        if self.is_ssl:
            masked = torch.tensor(self.masked_smiles[idx], dtype=torch.long)
            return (spectrum, self.graph_data[idx], target, masked) + metadata
        return (spectrum, self.graph_data[idx], target) + metadata


In [None]:
# Positional encoding and model encoder/decoder classes
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch-first sequence.

    Even feature columns hold ``sin(pos * w_i)``, odd columns ``cos(pos * w_i)``
    with ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: feature size of the input (odd sizes are supported).
        max_len: longest sequence length that can be encoded; ``forward``
            fails with a shape error for longer inputs.
    """

    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns;
        # trim the frequencies so the assignment shapes match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Buffer: moves with the module across devices but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1), :]

# Neural Network Models
class SpectrumTransformerEncoder(nn.Module):
    """Transformer encoder that treats every spectrum bin as one sequence position.

    Each scalar bin value is projected to ``d_model`` features, position
    encoded, passed through dropout and then through ``num_layers``
    batch-first encoder layers.
    """

    def __init__(self, d_model=512, nhead=8, num_layers=6, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.input_proj = nn.Linear(in_features=1, out_features=d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.dropout = nn.Dropout(p=dropout)
        layer = nn.TransformerEncoderLayer(d_model, nhead, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(layer, num_layers)

    def forward(self, x):
        """Encode a batch of binned spectra.

        assumes x is (batch, n_bins) — TODO confirm; a trailing feature axis of
        size 1 is added before the projection.
        """
        projected = self.input_proj(x.unsqueeze(-1))
        encoded = self.dropout(self.pos_encoding(projected))
        return self.transformer(encoded)

class SpectrumGNNEncoder(MessagePassing):
    """One round of mean-aggregated message passing followed by graph pooling.

    Node features (one scalar per node) are lifted to ``d_model``, every edge
    carries the MLP-transformed source-node features, and the propagated node
    states are mean-pooled per graph.
    """

    def __init__(self, d_model=512):
        super().__init__(aggr='mean')
        self.d_model = d_model
        self.lin = nn.Linear(1, d_model)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model),
        )

    def forward(self, x, edge_index, batch):
        """Return one pooled vector per graph referenced by ``batch``."""
        node_features = self.lin(x)
        propagated = self.propagate(edge_index, x=node_features)
        return global_mean_pool(propagated, batch)

    def message(self, x_j):
        """Message sent along each edge: the MLP applied to the neighbour features ``x_j``."""
        return self.mlp(x_j)

class SmilesTransformerDecoder(nn.Module):
    """Transformer decoder that maps token ids plus encoder memory to vocabulary logits."""

    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        layer = nn.TransformerDecoderLayer(d_model, nhead, batch_first=True)
        self.transformer = nn.TransformerDecoder(layer, num_layers)
        self.output_proj = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, memory, tgt_mask=None):
        """Return per-position logits over the vocabulary.

        Args:
            tgt: token-id tensor fed to the embedding layer.
            memory: encoder output attended to by the decoder.
            tgt_mask: optional attention mask forwarded to the decoder stack.
        """
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        embedded = self.embedding(tgt) * math.sqrt(self.d_model)
        hidden = self.transformer(self.pos_encoding(embedded), memory, tgt_mask=tgt_mask)
        return self.output_proj(hidden)

class MSMS2SmilesHybrid(nn.Module):
    """Spectrum-to-sequence model that fuses a transformer view and a graph view.

    The transformer encoding is mean-pooled over positions, concatenated with
    the pooled GNN encoding, projected back to ``d_model`` and handed to the
    decoder as a single-step memory. Extra keyword arguments are accepted and
    ignored.
    """

    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6, **kwargs):
        super().__init__()
        self.transformer_encoder = SpectrumTransformerEncoder(d_model, nhead, num_layers)
        self.gnn_encoder = SpectrumGNNEncoder(d_model)
        self.decoder = SmilesTransformerDecoder(vocab_size, d_model, nhead, num_layers)
        self.fusion = nn.Linear(2 * d_model, d_model)

    def forward(self, spectrum, graph_data, tgt, tgt_mask=None):
        """Return decoder logits for ``tgt`` conditioned on the fused spectrum encoding."""
        pooled_sequence = self.transformer_encoder(spectrum).mean(1)
        pooled_graph = self.gnn_encoder(graph_data.x, graph_data.edge_index, graph_data.batch)
        fused = self.fusion(torch.cat([pooled_sequence, pooled_graph], dim=1))
        # The decoder expects a sequence axis; the fused vector is a memory of length 1.
        return self.decoder(tgt, fused.unsqueeze(1), tgt_mask)


In [None]:
# Training and evaluation functions
def ssl_pretrain(model, dataloader, epochs=3, lr=1e-4):
    """Self-supervised pretraining: reconstruct the target from its masked copy.

    Each batch must unpack to eight items in the order produced by
    ``MSMSDataset(is_ssl=True)``: spectrum, graph list, target ids, masked ids
    and four metadata fields. The decoder receives the *masked* sequence
    (shifted right) under a causal mask and is trained to predict the original
    tokens; PAD positions are excluded from the loss.

    Previously the masked sequence was unpacked but never used, and the decoder
    saw the unmasked target without a causal mask, so every position could
    attend to the token it was asked to predict.

    Args:
        model: model called as ``model(spectrum, graph_batch, tokens, tgt_mask=...)``.
        dataloader: iterable of batches as described above.
        epochs: number of passes over ``dataloader``.
        lr: AdamW learning rate.
    """
    from torch.cuda.amp import autocast, GradScaler
    # Mixed precision only makes sense on CUDA; on CPU both helpers are disabled.
    use_amp = device.type == 'cuda'
    pad_idx = token_to_idx[config.PAD_TOKEN]
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scaler = GradScaler(enabled=use_amp)
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        n_batches = 0
        for batch in tqdm(dataloader, desc=f'SSL Epoch {epoch+1}'):
            spectrum, graph_data, target, masked, _, _, _, _ = batch
            spectrum = spectrum.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            masked = masked.to(device, non_blocking=True)
            graph_batch = Batch.from_data_list(graph_data).to(device)

            decoder_input = masked[:, :-1]
            seq_len = decoder_input.size(1)
            # True above the diagonal = "may not attend to later positions".
            tgt_mask = torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1).bool()

            optimizer.zero_grad()
            with autocast(enabled=use_amp):
                output = model(spectrum, graph_batch, decoder_input, tgt_mask=tgt_mask)
                loss = F.cross_entropy(output.reshape(-1, output.size(-1)), target[:, 1:].reshape(-1), ignore_index=pad_idx)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
            n_batches += 1
        # max(..., 1) keeps an empty dataloader from raising ZeroDivisionError.
        print(f'SSL Epoch {epoch+1} Loss: {total_loss/max(n_batches, 1):.4f}')

def supervised_train(model, train_loader, val_loader, epochs=30, lr=1e-4, patience=5):
    """Teacher-forced training with early stopping on validation loss.

    Each batch must unpack to seven items in the order produced by
    ``MSMSDataset(is_ssl=False)``. Training and validation both use the same
    causal mask (validation previously ran without it, so its loss was
    computed with access to future tokens and was not comparable to training).
    When training ends, the weights of the best validation epoch are loaded
    back into ``model``.

    Args:
        model: model called as ``model(spectrum, graph_batch, tokens, tgt_mask=...)``.
        train_loader: training batches.
        val_loader: validation batches.
        epochs: maximum number of epochs (also the cosine schedule period).
        lr: AdamW learning rate.
        patience: epochs without validation improvement before stopping.

    Returns:
        The best (lowest) validation loss observed; ``inf`` if none improved.
    """
    from torch.cuda.amp import autocast, GradScaler
    # Mixed precision only makes sense on CUDA; on CPU both helpers are disabled.
    use_amp = device.type == 'cuda'
    pad_idx = token_to_idx[config.PAD_TOKEN]
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    scaler = GradScaler(enabled=use_amp)
    best_val_loss = float('inf')
    best_state = None
    patience_counter = 0

    def _batch_loss(batch):
        """Cross-entropy of next-token prediction for one batch (PAD ignored)."""
        spectrum, graph_data, target, _, _, _, _ = batch
        spectrum = spectrum.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)
        graph_batch = Batch.from_data_list(graph_data).to(device)
        decoder_input = target[:, :-1]
        seq_len = decoder_input.size(1)
        # True above the diagonal = "may not attend to later positions".
        tgt_mask = torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1).bool()
        with autocast(enabled=use_amp):
            output = model(spectrum, graph_batch, decoder_input, tgt_mask=tgt_mask)
            return F.cross_entropy(output.reshape(-1, output.size(-1)), target[:, 1:].reshape(-1), ignore_index=pad_idx)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        n_train = 0
        for batch in tqdm(train_loader, desc=f'Train Epoch {epoch+1}'):
            optimizer.zero_grad()
            loss = _batch_loss(batch)
            scaler.scale(loss).backward()
            # Gradients must be unscaled before clipping by norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            train_loss += loss.item()
            n_train += 1

        # Validation
        model.eval()
        val_loss = 0.0
        n_val = 0
        with torch.no_grad():
            for batch in val_loader:
                val_loss += _batch_loss(batch).item()
                n_val += 1

        # Without validation batches no epoch can count as an improvement.
        val_loss = val_loss / n_val if n_val else float('inf')
        scheduler.step()
        print(f'Epoch {epoch+1}: Train Loss: {train_loss/max(n_train, 1):.4f}, Val Loss: {val_loss:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # Snapshot on CPU so the copy does not take up accelerator memory.
            best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_loss

def beam_search(model, spectrum, graph_data, ion_mode, precursor_bin, adduct_idx, true_smiles, beam_width=5, max_len=100, device='cpu'):
    """Decode candidate SMILES for one spectrum with beam search.

    Args:
        model: model called as ``model(spectrum, graph_batch, tokens, tgt_mask=...)``.
        spectrum: binned spectrum tensor for a single sample (batch axis is added here).
        graph_data: graph object for the same sample.
        ion_mode, precursor_bin, adduct_idx, true_smiles: accepted for call
            compatibility; not used by the search itself.
        beam_width: number of hypotheses kept after every step.
        max_len: maximum number of decoding steps.
        device: device the inputs are moved to.

    Returns:
        Up to ``beam_width`` ``(smiles, score)`` tuples, best first, where
        ``score`` is the summed token log-probability. Hypotheses that do not
        decode to a non-empty SMILES are dropped.
    """
    sos_idx = token_to_idx[config.SOS_TOKEN]
    eos_idx = token_to_idx[config.EOS_TOKEN]
    model.eval()
    with torch.no_grad():
        spectrum = spectrum.unsqueeze(0).to(device)
        graph_batch = Batch.from_data_list([graph_data]).to(device)

        # Start with SOS token
        sequences = [[sos_idx]]
        scores = [0.0]

        for _ in range(max_len):
            # Every hypothesis has ended: further steps would change nothing.
            if all(seq[-1] == eos_idx for seq in sequences):
                break
            candidates = []
            for seq, score in zip(sequences, scores):
                if seq[-1] == eos_idx:
                    candidates.append((seq, score))
                    continue

                tgt = torch.tensor([seq], dtype=torch.long, device=device)
                # Same causal mask as in training, so intermediate positions
                # are computed the way the decoder learned them.
                tgt_mask = torch.triu(torch.ones(len(seq), len(seq), device=device), diagonal=1).bool()
                output = model(spectrum, graph_batch, tgt, tgt_mask=tgt_mask)
                # log_softmax avoids log(0) = -inf from softmax followed by log.
                log_probs = F.log_softmax(output[0, -1].float(), dim=-1)

                # topk raises if asked for more entries than the vocabulary holds.
                n_expand = min(beam_width, log_probs.size(-1))
                top_log_probs, top_indices = torch.topk(log_probs, n_expand)
                for log_prob, token_id in zip(top_log_probs.tolist(), top_indices.tolist()):
                    candidates.append((seq + [token_id], score + log_prob))

            candidates.sort(key=lambda x: x[1], reverse=True)
            sequences = [seq for seq, _ in candidates[:beam_width]]
            scores = [score for _, score in candidates[:beam_width]]

        results = []
        for seq, score in zip(sequences, scores):
            smiles = decode_selfies(seq)
            if smiles:
                results.append((smiles, score))
        return results[:beam_width]

# Missing evaluation functions
def mw_difference(smiles1, smiles2):
    """Absolute difference in molecular weight between two SMILES.

    Returns ``inf`` when either SMILES cannot be parsed or the computation fails.
    """
    try:
        mol1, mol2 = Chem.MolFromSmiles(smiles1), Chem.MolFromSmiles(smiles2)
        if mol1 and mol2:
            return abs(Descriptors.MolWt(mol1) - Descriptors.MolWt(mol2))
    except Exception as e:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts a
        # long evaluation loop.
        logging.debug(f"mw_difference failed for {smiles1!r}, {smiles2!r}: {e}")
    return float('inf')

def logp_difference(smiles1, smiles2):
    """Absolute difference in RDKit logP between two SMILES.

    Returns ``inf`` when either SMILES cannot be parsed or the computation fails.
    """
    try:
        mol1, mol2 = Chem.MolFromSmiles(smiles1), Chem.MolFromSmiles(smiles2)
        if mol1 and mol2:
            return abs(Descriptors.MolLogP(mol1) - Descriptors.MolLogP(mol2))
    except Exception as e:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts a
        # long evaluation loop.
        logging.debug(f"logp_difference failed for {smiles1!r}, {smiles2!r}: {e}")
    return float('inf')

def substructure_match(smiles1, smiles2, substructures=None):
    """Placeholder metric: ignores its arguments and always returns 0.5."""
    placeholder_score = 0.5  # Placeholder
    return placeholder_score

def error_analysis(pred_list, true_list, adduct_list, fingerprints):
    """Stub: performs no analysis, only prints a completion message."""
    message = 'Error analysis completed'
    print(message)

def plot_attention_weights(weights, title='Attention'):
    """Placeholder attention plot: prints the title, ignores ``weights``."""
    banner = f'Attention visualization: {title}'
    print(banner)

def plot_gnn_edge_weights(weights, edges, title='GNN'):
    """Placeholder GNN plot: prints the title, ignores ``weights``/``edges``."""
    banner = f'GNN visualization: {title}'
    print(banner)

def calculate_bleu(predicted_smiles, true_smiles):
    """Character-level BLEU score of a predicted SMILES against the truth.

    Both strings are split into single characters and scored with
    ``sentence_bleu`` using equal weights for 1- to 4-grams.

    Args:
        predicted_smiles: Hypothesis SMILES string.
        true_smiles: Reference SMILES string.

    Returns:
        The value returned by ``sentence_bleu``, or ``0.0`` if tokenising or
        scoring raises (for example when an input is ``None``).
    """
    try:
        pred_tokens = list(predicted_smiles)
        true_tokens = list(true_smiles)
        return sentence_bleu([true_tokens], pred_tokens, weights=(0.25, 0.25, 0.25, 0.25))
    except Exception:
        # Scoring failures count as zero overlap; KeyboardInterrupt and
        # SystemExit are deliberately not caught.
        return 0.0

def tanimoto_similarity(smiles1, smiles2, fingerprint_dict):
    """Tanimoto similarity of two SMILES using precomputed fingerprints.

    Args:
        smiles1: Key of the first fingerprint in ``fingerprint_dict``.
        smiles2: Key of the second fingerprint in ``fingerprint_dict``.
        fingerprint_dict: Mapping from SMILES string to fingerprint object.

    Returns:
        ``DataStructs.TanimotoSimilarity`` of the two stored fingerprints, or
        ``0.0`` when either SMILES is not a key of ``fingerprint_dict``.
    """
    if smiles1 not in fingerprint_dict or smiles2 not in fingerprint_dict:
        # No fingerprint is computed on the fly: unknown strings score zero.
        return 0.0
    first_fp = fingerprint_dict[smiles1]
    second_fp = fingerprint_dict[smiles2]
    return DataStructs.TanimotoSimilarity(first_fp, second_fp)

def validity_rate(smiles_list):
    """Percentage of entries in ``smiles_list`` that RDKit can parse.

    Returns:
        A value in ``[0, 100]``; ``0`` for an empty list.
    """
    parseable = 0
    for candidate in smiles_list:
        if Chem.MolFromSmiles(candidate) is not None:
            parseable += 1
    if not smiles_list:
        return 0
    return (parseable / len(smiles_list)) * 100

def objective(trial, train_data, val_data):
    """Demo Optuna objective: sample a learning rate and return it.

    ``train_data`` and ``val_data`` are accepted but not used. Because the
    returned value is the sampled learning rate itself, a study that
    minimises this objective simply favours the smallest sampled rate.
    """
    lower_bound, upper_bound = 1e-5, 1e-3
    # Simplified for demo: no model is trained or evaluated here.
    return trial.suggest_float('lr', lower_bound, upper_bound, log=True)

# Additional metrics and visualization
def dice_similarity(smiles1, smiles2):
    """Dice similarity between the RDKit fingerprints of two SMILES.

    Args:
        smiles1: First SMILES string.
        smiles2: Second SMILES string.

    Returns:
        ``DataStructs.DiceSimilarity`` of the two ``Chem.RDKFingerprint``
        fingerprints, or ``0.0`` when either string fails to parse or an
        error occurs.
    """
    try:
        mol1, mol2 = Chem.MolFromSmiles(smiles1), Chem.MolFromSmiles(smiles2)
        if mol1 and mol2:
            fp1 = Chem.RDKFingerprint(mol1)
            fp2 = Chem.RDKFingerprint(mol2)
            return DataStructs.DiceSimilarity(fp1, fp2)
    except Exception:
        # Best-effort metric: failures score zero. KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        pass
    return 0.0

def mcs_similarity(smiles1, smiles2):
    """Maximum-common-substructure similarity between two SMILES.

    The score is the number of atoms in the MCS found by ``rdFMCS.FindMCS``
    divided by the atom count of the larger molecule.

    Args:
        smiles1: First SMILES string.
        smiles2: Second SMILES string.

    Returns:
        The atom-count ratio described above, or ``0.0`` when either string
        fails to parse, both molecules have no atoms, or an error occurs.
    """
    try:
        mol1, mol2 = Chem.MolFromSmiles(smiles1), Chem.MolFromSmiles(smiles2)
        if mol1 and mol2:
            largest = max(mol1.GetNumAtoms(), mol2.GetNumAtoms())
            if largest == 0:
                # Explicit guard instead of relying on a swallowed
                # ZeroDivisionError for atom-less molecules.
                return 0.0
            mcs = rdFMCS.FindMCS([mol1, mol2])
            return mcs.numAtoms / largest
    except Exception:
        # Best-effort metric: failures score zero. KeyboardInterrupt is not
        # caught, so a slow MCS search can be interrupted by the user.
        pass
    return 0.0

def prediction_diversity(smiles_list):
    """Fraction of distinct strings among the predictions.

    Returns:
        ``len(set(smiles_list)) / len(smiles_list)``, or ``0`` when the list
        is empty.
    """
    distinct = set(smiles_list)
    if smiles_list:
        return len(distinct) / len(smiles_list)
    return 0

def plot_molecular_comparison(true_smiles, pred_smiles, title='Comparison'):
    """Draw the true and predicted molecules side by side.

    Nothing is drawn when either SMILES fails to parse. Any exception raised
    while parsing or drawing is reported with a printed message instead of
    being propagated.
    """
    try:
        reference_mol = Chem.MolFromSmiles(true_smiles)
        candidate_mol = Chem.MolFromSmiles(pred_smiles)
        if not (reference_mol and candidate_mol):
            return
        grid = Draw.MolsToGridImage(
            [reference_mol, candidate_mol],
            molsPerRow=2,
            subImgSize=(300, 300),
            legends=['True', 'Predicted'],
        )
        plt.figure(figsize=(10, 5))
        plt.imshow(np.array(grid))
        plt.axis('off')
        plt.title(title)
        plt.show()
    except Exception as err:
        print(f'Visualization error: {err}')

# Model checkpointing
def save_checkpoint(model, optimizer, epoch, loss, filepath):
    """Write a training checkpoint to ``filepath`` with ``torch.save``.

    The payload holds the epoch, the model and optimizer state dicts, the
    loss, and the notebook-level ``vocab_size``, ``token_to_idx`` and
    ``idx_to_token`` globals.
    """
    payload = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        # Vocabulary is stored alongside the weights; these are read from
        # module-level globals, not from the arguments.
        'vocab_size': vocab_size,
        'token_to_idx': token_to_idx,
        'idx_to_token': idx_to_token,
    }
    torch.save(payload, filepath)

def load_checkpoint(filepath, model, optimizer=None):
    """Restore model (and optionally optimizer) state from a checkpoint.

    Args:
        filepath: Path of a file written with ``torch.save``; it must
            contain a ``'model_state_dict'`` entry.
        model: Module whose ``load_state_dict`` receives the stored weights.
        optimizer: Optional optimizer to restore. It is left untouched (and a
            warning is logged) when the checkpoint has no
            ``'optimizer_state_dict'`` entry.

    Returns:
        Tuple ``(epoch, loss)``. Either element is ``None`` when the
        checkpoint does not store it, as is the case for the per-fold files
        written by the cross-validation loop, which only contain the model
        weights and the vocabulary.
    """
    # `device` is the notebook-level global selected during setup.
    checkpoint = torch.load(filepath, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None:
        optimizer_state = checkpoint.get('optimizer_state_dict')
        if optimizer_state is not None:
            optimizer.load_state_dict(optimizer_state)
        else:
            logging.warning(
                "Checkpoint %s has no optimizer state; optimizer left unchanged", filepath
            )
    # .get() keeps weight-only checkpoints loadable instead of raising KeyError.
    return checkpoint.get('epoch'), checkpoint.get('loss')

# Data validation functions
def validate_spectrum_quality(mzs, intensities, min_peaks=5, max_mz_range=2000):
    """Check that a spectrum has enough peaks and stays within the m/z range.

    Args:
        mzs: Sequence of peak m/z values.
        intensities: Peak intensities; accepted for interface symmetry but
            not inspected.
        min_peaks: Minimum number of peaks required.
        max_mz_range: Largest m/z value allowed.

    Returns:
        ``False`` when there are fewer than ``min_peaks`` peaks or the
        largest m/z exceeds ``max_mz_range``; ``True`` otherwise.
    """
    peak_count = len(mzs)
    if peak_count < min_peaks:
        return False
    if peak_count == 0:
        # Only reachable when min_peaks <= 0: an empty spectrum has no peak
        # that could exceed the range, and max() of an empty sequence would
        # raise ValueError.
        return True
    if max(mzs) > max_mz_range:
        return False
    return True

def validate_molecular_properties(smiles, min_mw=50, max_mw=1000):
    """Check that a SMILES parses and its molecular weight is in range.

    Args:
        smiles: SMILES string to validate.
        min_mw: Smallest accepted molecular weight (inclusive).
        max_mw: Largest accepted molecular weight (inclusive).

    Returns:
        ``True`` when RDKit parses ``smiles`` and
        ``min_mw <= MolWt <= max_mw``; ``False`` when parsing fails, the
        weight is out of range, or an error occurs.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return False
        mw = Descriptors.MolWt(mol)
        return min_mw <= mw <= max_mw
    except Exception:
        # Treat unexpected RDKit errors as "invalid", but let
        # KeyboardInterrupt/SystemExit through.
        return False

def remove_duplicates(df, subset=['smiles', 'precursor_mz']):
    """Return ``df`` without rows that repeat the ``subset`` columns.

    The first occurrence of each duplicated combination is kept; the default
    list is only read, never modified.
    """
    deduplicated = df.drop_duplicates(subset=subset, keep='first')
    return deduplicated

# Memory management
def clear_memory():
    """Run the garbage collector and, if CUDA is available, empty its cache."""
    import gc
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()


In [None]:
# Integration: XGBoost + RAG + Deep Learning
class HybridPredictor:
    """Combine deep-learning, XGBoost and retrieval (RAG) structure predictions.

    Each predictor is queried independently; a failure in one of them is
    reported with a printed message and does not stop the others.
    """

    def __init__(self, dl_model, xgb_model, rag_system, label_encoder):
        """Store the three predictors and the XGBoost label encoder.

        Args:
            dl_model: Model passed to ``beam_search``.
            xgb_model: Object exposing ``predict``.
            rag_system: Object exposing ``semantic_search(query, k=...)``.
            label_encoder: Object exposing ``inverse_transform``; maps
                XGBoost class predictions back to SMILES strings.
        """
        self.dl_model = dl_model
        self.xgb_model = xgb_model
        self.rag_system = rag_system
        self.label_encoder = label_encoder

    def predict_ensemble(self, spectrum, graph_data, features, query_text=None, weights=(0.5, 0.3, 0.2)):
        """Collect the top prediction of every available method.

        Args:
            spectrum: Spectrum input forwarded to ``beam_search``.
            graph_data: Graph input forwarded to ``beam_search``.
            features: Feature vector for the XGBoost model.
            query_text: Text query for the RAG system; RAG is skipped when
                this is empty or ``None``.
            weights: Weights for (DL, XGBoost, RAG), in that order. They are
                rescaled to sum to 1 when their sum differs from 1 by more
                than 0.01.

        Returns:
            List of ``(method, smiles, weight)`` tuples with ``method`` in
            ``'DL'``, ``'XGB'``, ``'RAG'``; methods that failed or returned
            nothing are omitted, so the list may be empty.

        Raises:
            ZeroDivisionError: If ``weights`` sums to zero.
        """
        # Validate weights
        total_weight = sum(weights)
        if abs(total_weight - 1.0) > 0.01:
            weights = [w / total_weight for w in weights]

        predictions = []

        # Deep learning prediction with error handling
        try:
            # NOTE(review): ion mode, precursor bin and adduct index are
            # hard-coded to 0 and the reference SMILES to '' — confirm this
            # is intended for ensemble inference.
            dl_results = beam_search(self.dl_model, spectrum, graph_data, 0, 0, 0, '', beam_width=5, device=device)
            if dl_results and dl_results[0][0]:
                predictions.append(('DL', dl_results[0][0], weights[0]))
        except Exception as e:
            print(f'DL prediction failed: {e}')

        # XGBoost prediction with error handling
        try:
            xgb_pred = self.xgb_model.predict([features])[0]
            xgb_smiles = self.label_encoder.inverse_transform([xgb_pred])[0]
            predictions.append(('XGB', xgb_smiles, weights[1]))
        except Exception as e:
            print(f'XGBoost prediction failed: {e}')

        # RAG prediction with error handling
        if query_text:
            try:
                rag_results = self.rag_system.semantic_search(query_text, k=1)
                if rag_results and len(rag_results) > 0:
                    predictions.append(('RAG', rag_results[0]['smiles'], weights[2]))
            except Exception as e:
                print(f'RAG prediction failed: {e}')

        return predictions

    def evaluate_ensemble(self, test_data, n_samples=10):
        """Score each method and the weighted ensemble on the first rows.

        Args:
            test_data: Table supporting ``len()`` and ``.iloc[i]``; rows must
                provide ``smiles``, ``binned``, ``graph_data``,
                ``precursor_mz``, ``ion_mode``, ``adduct_idx`` and ``mzs``.
            n_samples: Maximum number of leading rows to evaluate.

        Returns:
            Dict with keys ``'dl'``, ``'xgb'``, ``'rag'``, ``'ensemble'``
            mapping to the mean Tanimoto similarity over the samples for
            which that entry produced a score (``0`` when there were none).
        """
        results = {'dl': [], 'xgb': [], 'rag': [], 'ensemble': []}

        for i in range(min(n_samples, len(test_data))):
            row = test_data.iloc[i]
            true_smiles = row['smiles']

            # Extract features
            spectrum = row['binned']
            graph_data = row['graph_data']
            features = [np.mean(spectrum), np.std(spectrum), np.max(spectrum),
                        np.sum(spectrum > 0.1), row['precursor_mz'], row['ion_mode'],
                        row['adduct_idx'], len(row['mzs'])]

            # Get ensemble predictions
            preds = self.predict_ensemble(spectrum, graph_data, features, f"molecule with MW {row['precursor_mz']:.1f}")
            if not preds:
                # Every predictor failed for this sample. Skip it, as the
                # per-method lists do, instead of dividing 0 by 0 below.
                continue

            # Evaluate each method once and reuse the score for the ensemble
            weighted_sum = 0.0
            weight_total = 0.0
            for method, pred_smiles, weight in preds:
                similarity = tanimoto_similarity(pred_smiles, true_smiles, all_fingerprints)
                results[method.lower()].append(similarity)
                weighted_sum += similarity * weight
                weight_total += weight

            # Weighted ensemble score, renormalised over the methods that answered
            if weight_total > 0:
                results['ensemble'].append(weighted_sum / weight_total)

        return {k: np.mean(v) if v else 0 for k, v in results.items()}

print('Integration system ready')


In [None]:
# Cross-validation, training, and evaluation loop
# Runs 5-fold cross-validation over df_massspecgym: per fold it builds the
# datasets/loaders, runs an Optuna study for the learning rate, pretrains with
# SSL, trains supervised, and saves the resulting weights to disk.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# One entry per fold: the value returned by supervised_train().
fold_results = []

# Built once, outside the fold loop; external_dataset is read again by the
# external-evaluation cell further down.
external_dataset = MSMSDataset(df_external, max_len=SUPERVISED_MAX_LEN, is_ssl=False)
external_loader = DataLoader(external_dataset, batch_size=32, num_workers=2)

for fold, (train_idx, val_idx) in enumerate(kf.split(df_massspecgym)):
    print(f"\nFold {fold+1}/5")
    train_data = df_massspecgym.iloc[train_idx]
    val_data = df_massspecgym.iloc[val_idx]
    # SSL pretraining uses a fixed-seed 30% sample of this fold's training rows.
    ssl_data = train_data.sample(frac=0.3, random_state=42)

    train_dataset = MSMSDataset(train_data, max_len=SUPERVISED_MAX_LEN, is_ssl=False)
    val_dataset = MSMSDataset(val_data, max_len=SUPERVISED_MAX_LEN, is_ssl=False)
    ssl_dataset = MSMSDataset(ssl_data, max_len=PRETRAIN_MAX_LEN, is_ssl=True)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=8, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=64, num_workers=8, pin_memory=True)
    ssl_loader = DataLoader(ssl_dataset, batch_size=128, shuffle=True, num_workers=8, pin_memory=True)

    # Hyperparameter tuning
    # NOTE(review): objective() as defined in this notebook returns the sampled
    # learning rate itself, so this 'minimize' study selects the smallest rate
    # sampled in 10 trials rather than the rate with the lowest validation loss.
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, train_data, val_data), n_trials=10)
    best_lr = study.best_params['lr']
    print(f"Best learning rate for fold {fold+1}: {best_lr:.6f}")

    # Initialize and train model
    # A fresh model is created per fold; the same learning rate is used for
    # SSL pretraining and supervised training.
    model = MSMS2SmilesHybrid(vocab_size=vocab_size, d_model=config.D_MODEL, nhead=config.NHEAD, num_layers=config.NUM_LAYERS).to(device)
    print(f"Starting SSL pretraining for fold {fold+1}...")
    ssl_pretrain(model, ssl_loader, epochs=3, lr=best_lr)
    print(f"Starting supervised training for fold {fold+1}...")
    best_val_loss = supervised_train(model, train_loader, val_loader, epochs=30, lr=best_lr, patience=5)
    fold_results.append(best_val_loss)
    # Saves the model state as it is after supervised_train() returns, plus the
    # vocabulary mappings. No 'epoch', 'loss' or optimizer state is stored.
    # NOTE(review): whether these are the best-epoch weights depends on
    # supervised_train() restoring them — confirm.
    torch.save({
        'model_state_dict': model.state_dict(),
        'token_to_idx': token_to_idx,
        'idx_to_token': idx_to_token
    }, f'best_msms_hybrid_fold_{fold+1}.pt')
    
    # Download model weights
    # `display` is not imported here; presumably the IPython-provided builtin
    # available inside a notebook kernel.
    from IPython.display import FileLink
    display(FileLink(f'best_msms_hybrid_fold_{fold+1}.pt'))

print(f"Cross-validation results: {fold_results}")
print(f"Average validation loss: {np.mean(fold_results):.4f}")

# Training time estimation for MassSpecGym
# Back-of-the-envelope figure printed after training has already run; it is
# computed from dataset size only and does not use measured timings.
import time
total_samples = len(df_massspecgym)
# Despite the name, this is the number of full batches of 64 in the whole dataset.
samples_per_epoch = total_samples // 64  # batch size
# Assumes every fold runs all 3 SSL and 30 supervised epochs (patience=5 is
# passed to supervised_train above, so fewer may actually run — TODO confirm).
total_epochs = 5 * (3 + 30)  # 5 folds * (SSL + supervised)
estimated_hours = (samples_per_epoch * total_epochs * 0.5) / 3600  # ~0.5s per batch
print(f'\nEstimated training time: {estimated_hours:.1f} hours ({estimated_hours/24:.1f} days)')
print('With RTX 3080 Ti optimizations, expect 8-12 hours total.')


In [None]:
# Load the best trained model
# Rebuilds the architecture with the same hyperparameters used for training and
# restores the weights saved for fold 1.
# NOTE(review): the path is hard-coded to fold 1; it is not chosen from
# fold_results, so it is "best" only if fold 1 had the lowest loss — confirm.
model = MSMS2SmilesHybrid(vocab_size=vocab_size, d_model=config.D_MODEL, nhead=config.NHEAD, num_layers=config.NUM_LAYERS).to(device)
checkpoint = torch.load('best_msms_hybrid_fold_1.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
print('Model loaded successfully')


In [None]:
# External dataset evaluation and visualization
# Runs beam search on the first few external samples, records similarity and
# property metrics for the top-3 predictions of each sample, draws the best
# prediction next to the true structure, and prints a summary.
model.eval()
external_metrics = {'tanimoto': [], 'dice': [], 'mcs': [], 'mw_diff': [], 'logp_diff': [], 'substructure': []}
pred_smiles_list = []
true_smiles_list = []
adducts_list = []
num_samples = min(5, len(external_dataset))

for sample_idx in range(num_samples):
    # Fetch the item once instead of indexing the dataset for every field.
    sample = external_dataset[sample_idx]
    sample_spectrum = sample[0]
    sample_graph = sample[1]
    sample_ion_mode = sample[3]
    sample_precursor_bin = sample[4]
    sample_adduct_idx = sample[5]
    true_smiles = sample[6]

    predicted_results = beam_search(model, sample_spectrum, sample_graph, sample_ion_mode, sample_precursor_bin, sample_adduct_idx, true_smiles, beam_width=10, max_len=SUPERVISED_MAX_LEN, device=device)
    pred_smiles_list.extend([smiles for smiles, _ in predicted_results])
    true_smiles_list.extend([true_smiles] * len(predicted_results))
    adducts_list.extend([df_external.iloc[sample_idx]['adduct']] * len(predicted_results))

    # Position of this sample's first entry in the metric lists, so the plot
    # title below reports this sample's top prediction rather than sample 0's.
    first_metric_idx = len(external_metrics['tanimoto'])

    print(f"\nExternal Sample {sample_idx} - True SMILES: {true_smiles}")
    print("Top Predicted SMILES:")
    if not predicted_results:
        print("No valid predictions were produced for this sample.")
    for smiles, confidence in predicted_results[:3]:
        external_metrics['tanimoto'].append(tanimoto_similarity(smiles, true_smiles, all_fingerprints))
        external_metrics['dice'].append(dice_similarity(smiles, true_smiles))
        external_metrics['mcs'].append(mcs_similarity(smiles, true_smiles))
        external_metrics['mw_diff'].append(mw_difference(smiles, true_smiles))
        external_metrics['logp_diff'].append(logp_difference(smiles, true_smiles))
        external_metrics['substructure'].append(substructure_match(smiles, true_smiles, model.gnn_encoder.substructures))
        print(f"SMILES: {smiles}, Confidence: {confidence:.4f}, Tanimoto: {external_metrics['tanimoto'][-1]:.4f}, Dice: {external_metrics['dice'][-1]:.4f}, MCS: {external_metrics['mcs'][-1]:.4f}")
        if len(smiles) > 100 and smiles.count('C') > len(smiles) * 0.8:
            print("Warning: Predicted SMILES is a long carbon chain, indicating potential model underfitting.")
        if smiles != "Invalid SMILES":
            mol = Chem.MolFromSmiles(smiles, sanitize=True)
            if mol:
                print(f"Molecular Weight: {Descriptors.MolWt(mol):.2f}, LogP: {Descriptors.MolLogP(mol):.2f}")

    # Visualize molecules
    # beam_search may return an empty list (no decodable candidate); guard the
    # [0][0] access so one failed sample does not abort the whole evaluation.
    if predicted_results and predicted_results[0][0] != "Invalid SMILES":
        pred_mol = Chem.MolFromSmiles(predicted_results[0][0], sanitize=True)
        true_mol = Chem.MolFromSmiles(true_smiles, sanitize=True)
        if pred_mol and true_mol:
            img = Draw.MolsToGridImage([true_mol, pred_mol], molsPerRow=2, subImgSize=(300, 300), legends=['True', 'Predicted'])
            img_array = np.array(img.convert('RGB'))
            plt.figure(figsize=(10, 5))
            plt.imshow(img_array)
            plt.axis('off')
            plt.title(f"External Sample {sample_idx} - Tanimoto: {external_metrics['tanimoto'][first_metric_idx]:.4f}")
            plt.show()

    # Visualize attention and GNN weights for first sample
    if sample_idx == 0:
        with torch.no_grad():
            spectrum = sample_spectrum.unsqueeze(0).to(device)
            graph_data = Batch.from_data_list([sample_graph]).to(device)
            ion_mode_idx = torch.tensor([sample_ion_mode], dtype=torch.long).to(device)
            precursor_idx = torch.tensor([sample_precursor_bin], dtype=torch.long).to(device)
            adduct_idx = torch.tensor([sample_adduct_idx], dtype=torch.long).to(device)
            _, attn_weights = model.transformer_encoder(spectrum, ion_mode_idx, precursor_idx, adduct_idx)
            _, _, edge_weights = model.gnn_encoder(graph_data, ion_mode_idx, precursor_idx, adduct_idx)
            plot_attention_weights(attn_weights, title="External Fold Transformer Attention Weights")
            plot_gnn_edge_weights(edge_weights, sample_graph.edge_index, title="External Fold GNN Edge Importance")

# Final Evaluation
# The MW/LogP averages exclude the float('inf') sentinels returned for
# unparseable predictions; the other averages include every recorded value.
print(f"External Validity Rate: {validity_rate(pred_smiles_list):.2f}%")
print(f"External Prediction Diversity: {prediction_diversity(pred_smiles_list):.4f}")
print("External Metrics Summary:")
print(f"Avg Tanimoto: {np.mean(external_metrics['tanimoto']):.4f}")
print(f"Avg Dice: {np.mean(external_metrics['dice']):.4f}")
print(f"Avg MCS: {np.mean(external_metrics['mcs']):.4f}")
print(f"Avg MW Difference: {np.mean([x for x in external_metrics['mw_diff'] if x != float('inf')]):.2f}")
print(f"Avg LogP Difference: {np.mean([x for x in external_metrics['logp_diff'] if x != float('inf')]):.2f}")
print(f"Avg Substructure Match: {np.mean(external_metrics['substructure']):.4f}")
error_analysis(pred_smiles_list, true_smiles_list, adducts_list, all_fingerprints)
