In [98]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score


In [99]:
'''
loading adr data
'''

adrData = pd.read_parquet("../Data/1. Adr_embeddings/TFIDF_ADR_vectors/train/tfidf_long.parquet")
testAdrData = pd.read_parquet("../Data/1. Adr_embeddings/TFIDF_ADR_vectors/test/tfidf_long.parquet")


In [100]:
class ADRVectorizer:
    '''
    This right here my friend is a vectorizer.
    U give him all your Unique id list and he will create a vector.
    Consistancy is the name of the game.
    Heck you can even check the meddra ids from the vector back.
    Emojis for radito... ðŸ˜ºðŸ™Š
    '''
    def __init__(self, uniqueAdrs):
        self.uniqueAdrs = sorted(uniqueAdrs)
        self.adrToIdx = {adr: idx for idx, adr in enumerate(self.uniqueAdrs)}
        self.idxToAdr = {idx: adr for idx, adr in enumerate(self.uniqueAdrs)}
        self.numAdrs = len(self.uniqueAdrs)
    
    def getVector(self, drugAdrs):
        vector = np.zeros(self.numAdrs, dtype=np.float32)
        for adr in drugAdrs:
            if adr in self.adrToIdx:
                vector[self.adrToIdx[adr]] = 1.0

        return vector
    
    def getAdrsFromVector(self, vector):
        presentIndices = np.where(vector > 0)[0]
        adrs = [self.idxToAdr[idx] for idx in presentIndices if idx in self.idxToAdr]
        
        return adrs
    
  

In [101]:
'''
creating the vectors for the drugs..
emojis section for radito.. ðŸ™€ðŸ™‰ðŸ˜ºðŸ™Š
'''
uniqueAdrs = adrData['meddra_id'].unique()
adrVectorizer = ADRVectorizer(uniqueAdrs= uniqueAdrs)


In [102]:
'''
syke, last cell was not creating the vector
more emojis.. ðŸ™€ðŸ™‰ðŸ˜ºðŸ™Š
'''
def adrVectorPd(adrData, vectorizer):
    adrVectorsData = []

    for rxcui, group in adrData.groupby('rxcui'):
        drugAdrs = group['meddra_id'].tolist()

        adrVector = vectorizer.getVector( drugAdrs)
        adrVectorsData.append({
        'rxcui': rxcui,
        'adr'  : adrVector
    })


    adrVectorized = pd.DataFrame(adrVectorsData)
    return adrVectorized

adrVectorized = adrVectorPd(adrData, adrVectorizer)
testAdrVectorized = adrVectorPd(testAdrData, adrVectorizer)

print(adrVectorized.head(2))
print(testAdrVectorized.head(2))


     rxcui                                                adr
0  1000082  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    10109  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
     rxcui                                                adr
0  1037042  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    10432  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...


In [103]:
'''
Loading in dti data
'''
dtiPd = pd.read_parquet('../Data/scope_onside_common_v3.parquet')
dtiPd.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34741 entries, 0 to 34740
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   drug_chembl_id     34741 non-null  object
 1   target_uniprot_id  34741 non-null  object
 2   label              34741 non-null  int64 
 3   smiles             34741 non-null  object
 4   sequence           34741 non-null  object
 5   molfile_3d         34741 non-null  object
 6   rxcui              34741 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.9+ MB


In [104]:
'''
loading drug and protein embedding dataset
'''

proteinEmbed = pd.read_parquet('../Data/3. Protein_enbeddings/GVP-GNN_protein_embeddings.parquet')
drugEmbed = pd.read_parquet('../Data/2. Drug_embeddings/EGNN_drug_embeddings_v2.parquet')


In [105]:
print(proteinEmbed.info())
print(drugEmbed.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2382 entries, 0 to 2381
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   uniprot_id       2382 non-null   object 
 1   length           2382 non-null   int64  
 2   mean_pLDDT       2382 non-null   float64
 3   embedding_dim    2382 non-null   int64  
 4   encoder_version  2382 non-null   object 
 5   pdb_md5          2382 non-null   object 
 6   embedding        2382 non-null   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 130.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   drug_chembl_id  1028 non-null   object
 1   rxcui           1028 non-null   object
 2   embedding       1028 non-null   object
dtypes: object(3)
memory usage: 24.2+ KB
None


In [106]:
'''
Class to load dataset for some basic checking
and to get access to dimension check fuctions
and retrieving dimension values for easy access
Insert emoji for radito..
'''
class Dataset:  
    def __init__(self, data=None):

        self.data = data
        self.proteinDim = None
        self.drugDim = None
        
        if data is not None:
            self._calculateDimensions()

    def _calculateDimensions(self):
        if len(self.data) > 0:
            sample_prot_embed = self.data.iloc[0]['embedding_prot']
            if hasattr(sample_prot_embed, 'shape'):
                self.proteinDim = sample_prot_embed.shape[0]
            else:
                self.proteinDim = len(sample_prot_embed)
            
            sample_drug_embed = self.data.iloc[0]['embedding_drug']
            if hasattr(sample_drug_embed, 'shape'):
                self.drugDim = sample_drug_embed.shape[0]
            else:
                self.drugDim = len(sample_drug_embed)

    def getProteinDimension(self):
        return self.proteinDim
    
    def getDrugDimension(self):
        return self.drugDim
    
    def getData(self):
        return self.data
    
    def checkDimension(self, fieldName):
        if fieldName not in self.data.columns:
            raise ValueError(f"Field '{fieldName}' not found in dataset. Available fields: {list(self.data.columns)}")
        
        # Get the first entry to determine expected dimension
        firstEntry = self.data[fieldName].iloc[0]
        
        if hasattr(firstEntry, 'shape'):
            expectedDim = firstEntry.shape[0] if len(firstEntry.shape) > 0 else 1

        else:
            expectedDim = len(firstEntry)
        
        # Check consistency across all entries
        inconsistentIndices = []
        for idx, value in enumerate(self.data[fieldName]):
            if hasattr(value, 'shape'):
                currentDim = value.shape[0] if len(value.shape) > 0 else 1

            else:
                currentDim = len(value)
            
            if currentDim != expectedDim:
                inconsistentIndices.append(idx)
        
        if inconsistentIndices:
            message = f"Dimension inconsistency in '{fieldName}': Expected {expectedDim}, but found different dimensions at indices {inconsistentIndices}"
            return False, expectedDim, message
        else:
            message = f"All entries in '{fieldName}' have consistent dimension: {expectedDim}"
            return True, expectedDim, message
    
    def getFeaturesAndLabels(self):
        proteinEmbeddings = np.stack(self.data['embedding_prot'].values)
        drugEmbeddings = np.stack(self.data['embedding_drug'].values)
        dtiLabels = self.data['label'].values
        adrVectors = np.stack(self.data['adr'].values)
        
        return proteinEmbeddings, drugEmbeddings, dtiLabels, adrVectors
    
    def info(self):
        if self.data is None:
            print("No data loaded.")
            return
        
        print(f"Dataset shape: {self.data.shape}")
        print(f"Protein embedding dimension: {self.getProteinDimension()}")
        print(f"Drug embedding dimension: {self.getDrugDimension()}")
        print(f"Number of samples: {len(self.data)}")
        print(f"Number of unique drugs: {self.data['drug_chembl_id'].nunique()}")
        print(f"Number of unique proteins: {self.data['target_uniprot_id'].nunique()}")

In [107]:
'''
This funciton will make sure to merge all the datas
to one single pd which we will feed the dataset class.

Note!
Do whatever in the function or the params but make sure
your return pd contains the following field
'drug_chembl_id', 'target_uniprot_id', 'label', 
'smiles', 'sequence', 'embedding_prot', 'embedding_drug', 'adr'

!important Insert emoji for radito..
'''

def prepareDataset(adrVectorized, dtiPd, proteinEmbed, drugEmbed):
    """
    Pre-process and merge all dataframes to create the final dataset
    """
    # Merge drug embeddings with DTI data
    drugData = pd.merge(dtiPd, drugEmbed, 
                        on=['drug_chembl_id', 'rxcui'], 
                        how='inner')
    
    # Merge with protein embeddings
    proteinDrugData = pd.merge(drugData, proteinEmbed,
                                left_on='target_uniprot_id',
                                right_on='uniprot_id',
                                how='inner')
    
    # Merge with ADR vectors
    finalData = pd.merge(proteinDrugData, adrVectorized,
                         on='rxcui',
                         how='inner')
    
    # Select only the required columns and rename embeddings
    finalData = finalData[['drug_chembl_id', 'target_uniprot_id', 'label', 
                            'smiles', 'sequence', 'embedding_x', 'embedding_y', 'adr']]
    
    finalData = finalData.rename(columns={
        'embedding_x': 'embedding_prot',
        'embedding_y': 'embedding_drug'
    })
    
    return finalData

In [108]:
processedData = prepareDataset(adrVectorized, dtiPd, proteinEmbed, drugEmbed)
dataset = Dataset(processedData)

dataset.info()

Dataset shape: (22228, 8)
Protein embedding dimension: 256
Drug embedding dimension: 1024
Number of samples: 22228
Number of unique drugs: 721
Number of unique proteins: 2063


In [109]:
print(dataset.checkDimension('embedding_prot'))
print(dataset.checkDimension('embedding_drug'))

(True, 256, "All entries in 'embedding_prot' have consistent dimension: 256")
(True, 1024, "All entries in 'embedding_drug' have consistent dimension: 1024")


In [110]:
testProcessedData = prepareDataset(testAdrVectorized, dtiPd, proteinEmbed, drugEmbed)

test_dataset = Dataset(testProcessedData)
test_dataset.info()

Dataset shape: (7595, 8)
Protein embedding dimension: 256
Drug embedding dimension: 1024
Number of samples: 7595
Number of unique drugs: 156
Number of unique proteins: 1209


In [111]:
print(test_dataset.checkDimension('embedding_prot'))
print(test_dataset.checkDimension('embedding_drug'))

(True, 256, "All entries in 'embedding_prot' have consistent dimension: 256")
(True, 1024, "All entries in 'embedding_drug' have consistent dimension: 1024")


In [112]:
class MultiModalDTIADRPrediction(nn.Module):
    def __init__(self, protDim, drugDim, adrDim):
        super().__init__()
        
        inputDim = protDim + drugDim
        hiddenDim = inputDim // 2
        subHiddenDim = hiddenDim // 2

        self.drugProtFuse = nn.Sequential(
            nn.Linear(inputDim, hiddenDim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hiddenDim, hiddenDim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hiddenDim, hiddenDim),
            nn.ReLU()
        )

        self.dtiHead = nn.Sequential(
            nn.Linear(hiddenDim, subHiddenDim),
            nn.ReLU(),
            nn.Linear(subHiddenDim, subHiddenDim//2),
            nn.ReLU(),
            nn.Linear(subHiddenDim//2, 1),
            nn.Sigmoid()
        )

        self.adrHead = nn.Sequential(
            nn.Linear(hiddenDim, hiddenDim*2),
            nn.ReLU(),
            nn.Linear(hiddenDim*2, hiddenDim*3),
            nn.ReLU(),
            nn.Linear(hiddenDim*3, adrDim),
            nn.Sigmoid()
        )
    
    def forward (self, proteinEmbed, drugEmbed):
        combined = torch.cat([proteinEmbed, drugEmbed], dim=1)

        fused = self.drugProtFuse(combined)

        dtiPred = self.dtiHead(fused)
        adrPred = self.adrHead(fused)

        return dtiPred, adrPred

In [113]:
proteinDim = dataset.getProteinDimension()
drugDim = dataset.getDrugDimension()
adrDim = adrVectorizer.numAdrs

print(
    "Protein dimension: ", proteinDim, 
    "\nDrug dimension   : ", drugDim,
    "\nADR dimension    : ", adrDim
)

model = MultiModalDTIADRPrediction(
    protDim=proteinDim,
    drugDim=drugDim,
    adrDim= adrDim
)

Protein dimension:  256 
Drug dimension   :  1024 
ADR dimension    :  4048


In [120]:
from tqdm import tqdm
import os

In [121]:
def testModel(model, testDataset, batch_size=32):
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    protEmbed, drugEmbed, dtiLabels, adrVectors = testDataset.getFeaturesAndLabels()
    
    protEmbed = torch.FloatTensor(protEmbed)
    drugEmbed = torch.FloatTensor(drugEmbed)
    dtiLabels = torch.FloatTensor(dtiLabels)
    adrVectors = torch.FloatTensor(adrVectors)
    
    dataset = TensorDataset(protEmbed, drugEmbed, dtiLabels, adrVectors)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    all_dti_preds = []
    all_adr_preds = []
    all_dti_labels = []
    all_adr_labels = []
    
    with torch.no_grad():
        for batch_prot, batch_drug, batch_dti, batch_adr in tqdm(dataloader, desc='Testing'):
            batch_prot = batch_prot.to(device)
            batch_drug = batch_drug.to(device)
            
            dtiPred, adrPred = model(batch_prot, batch_drug)
            
            all_dti_preds.append(dtiPred.cpu())
            all_adr_preds.append(adrPred.cpu())
            all_dti_labels.append(batch_dti.cpu())
            all_adr_labels.append(batch_adr.cpu())
    
    dti_preds = torch.cat(all_dti_preds)
    adr_preds = torch.cat(all_adr_preds)
    dti_labels = torch.cat(all_dti_labels)
    adr_labels = torch.cat(all_adr_labels)
    
    # DTI Scores
    dti_binary = (dti_preds > 0.5).float()
    dti_accuracy = (dti_binary.squeeze() == dti_labels).float().mean()
    dti_f1 = f1_score(dti_labels, dti_binary)
    dti_auc = roc_auc_score(dti_labels, dti_preds)
    
    # ADR Scores
    adr_binary = (adr_preds > 0.5).float()
    adr_accuracy = (adr_binary == adr_labels).float().mean()
    adr_f1 = f1_score(adr_labels.flatten(), adr_binary.flatten(), average='macro')
    adr_auc = roc_auc_score(adr_labels.flatten(), adr_preds.flatten())
    
    print("=" * 50)
    print("MODEL EVALUATION SCORES")
    print("=" * 50)
    print("\nDTI PREDICTION SCORES:")
    print(f"Accuracy: {dti_accuracy:.4f}")
    print(f"F1-Score: {dti_f1:.4f}")
    print(f"ROC-AUC:  {dti_auc:.4f}")
    
    print("\nADR PREDICTION SCORES:")
    print(f"Accuracy: {adr_accuracy:.4f}")
    print(f"F1-Score: {adr_f1:.4f}")
    print(f"ROC-AUC:  {adr_auc:.4f}")
    print("=" * 50)
    
    return {
        'dti_accuracy': dti_accuracy,
        'dti_f1': dti_f1,
        'dti_auc': dti_auc,
        'adr_accuracy': adr_accuracy,
        'adr_f1': adr_f1,
        'adr_auc': adr_auc,
        'dti_predictions': dti_preds,
        'adr_predictions': adr_preds,
        'dti_labels': dti_labels,
        'adr_labels': adr_labels
    }

In [122]:


def trainModel(model, trainDataset, testDataset, epochs, batch_size=32, lr=0.0001, 
               save_dir='models', continue_training=False, checkpoint_path=None):
    os.makedirs(save_dir, exist_ok=True)
    
    protEmbed, drugEmbed, dtiLabels, adrVectors = trainDataset.getFeaturesAndLabels()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5, verbose=True)
    dti_criterion = nn.BCELoss()
    adr_criterion = nn.BCELoss()
    
    start_epoch = 0
    best_loss = float('inf')
    
    # Continue from checkpoint if requested
    if continue_training and checkpoint_path and os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        start_epoch = checkpoint['epoch']
        best_loss = checkpoint['best_loss']
        print(f"Resumed training from epoch {start_epoch}, best loss: {best_loss:.4f}")
    
    protEmbed = torch.FloatTensor(protEmbed)
    drugEmbed = torch.FloatTensor(drugEmbed)
    dtiLabels = torch.FloatTensor(dtiLabels)
    adrVectors = torch.FloatTensor(adrVectors)
    
    dataset = TensorDataset(protEmbed, drugEmbed, dtiLabels, adrVectors)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    for epoch in range(start_epoch, epochs):
        model.train()
        epoch_loss = 0
        
        pbar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{epochs}')
        for batch_prot, batch_drug, batch_dti, batch_adr in pbar:
            batch_prot = batch_prot.to(device)
            batch_drug = batch_drug.to(device)
            batch_dti = batch_dti.to(device)
            batch_adr = batch_adr.to(device)
            
            optimizer.zero_grad()
            
            dtiPred, adrPred = model(batch_prot, batch_drug)
            
            dtiLoss = dti_criterion(dtiPred.squeeze(), batch_dti)
            adrLoss = adr_criterion(adrPred, batch_adr)
            totalLoss = dtiLoss + adrLoss
            
            totalLoss.backward()
            optimizer.step()
            
            epoch_loss += totalLoss.item()
            pbar.set_postfix({'Loss': f'{totalLoss.item():.4f}'})
        
        avg_loss = epoch_loss / len(dataloader)
        scheduler.step(avg_loss)
        
        print(f'Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}, LR: {optimizer.param_groups[0]["lr"]:.2e}')
        
        # Save best model
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), f'{save_dir}/best_model.pth')
            print(f'New best model saved with loss: {best_loss:.4f}')
        
        # Save checkpoint every 10 epochs
        if (epoch + 1) % 10 == 0:
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_loss': best_loss,
                'loss': avg_loss
            }
            torch.save(checkpoint, f'{save_dir}/checkpoint_epoch_{epoch+1}.pth')
            torch.save(model.state_dict(), f'{save_dir}/model_epoch_{epoch+1}.pth')
            print(f'Checkpoint saved at epoch {epoch+1}')
            
            # Evaluate on test dataset
            test_results = testModel(model, testDataset, batch_size=batch_size)
            print(f'Test Scores at Epoch {epoch+1}:')
            print(f'  DTI - Acc: {test_results["dti_accuracy"]:.4f}, F1: {test_results["dti_f1"]:.4f}, AUC: {test_results["dti_auc"]:.4f}')
            print(f'  ADR - Acc: {test_results["adr_accuracy"]:.4f}, F1: {test_results["adr_f1"]:.4f}, AUC: {test_results["adr_auc"]:.4f}')
    
    return avg_loss, best_loss

In [None]:
avg_loss, best_loss = trainModel(
    model, 
    dataset, 
    test_dataset, 
    save_dir='models_1',
    continue_training=False, 
    epochs=100 
)