In [None]:
#Set up the working directory and import necessary libraries
import os
import re
import math
import torch
import umap
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem
import torch.nn as nn
import seaborn as sns
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch_geometric.loader import NeighborLoader
import matplotlib.pyplot as plt
from rdkit.Chem import AllChem
from tape import ProteinBertModel, TAPETokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel, T5Tokenizer, T5Model, T5EncoderModel,AutoModelForSeq2SeqLM

# Define the working directory
# NOTE: absolute, machine-specific path; every CSV read or written below is joined onto it.
working_dir = r"/home/Desktop/re/DB2"

# Check if the directory exists
if not os.path.exists(working_dir):
    raise FileNotFoundError(f"Directory {working_dir} does not exist")

# Load the SMILES and proteins data into pandas dataframes
# Later cells read the 'Ligand SMILES' / 'DrugBank ID' columns of smiles_df and
# the 'Sequence' / 'UniProt ID' columns of proteins_df.
smiles_df = pd.read_csv(os.path.join(working_dir, 'unique_smiles.csv'))
proteins_df = pd.read_csv(os.path.join(working_dir, 'unique_proteins.csv'))

# Print the first few rows to ensure data is loaded correctly
print("First few rows of smiles.csv:")
print(smiles_df.head())

print("\nFirst few rows of proteins.csv:")
print(proteins_df.head())

In [None]:
#########################################################Step1#########################################################################
# Set up device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Initialize ChemBERTa for drugs
# NOTE: the model is not moved to `device` here, so the embedding function below
# runs it wherever from_pretrained placed it.
tokenizer_chemberta = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model_chemberta = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

# Function to extract ChemBERTa embeddings for a SMILES string
def get_chemberta_embedding(smiles):
    """Return the mean-pooled ChemBERTa embedding of ``smiles`` as a flat NumPy array.

    Relies on the module-level ``tokenizer_chemberta`` and ``model_chemberta``.
    The model's last hidden state is averaged over the token axis (dim=1).
    """
    encoded = tokenizer_chemberta(smiles, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        model_output = model_chemberta(**encoded)
    # Taking the mean of the hidden states as the embedding
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.detach().numpy().flatten()

# Process drugs
# Each entry of drug_embeddings is one row: [DrugBank ID, dim_1, dim_2, ...]
drug_embeddings = []
for _, row in tqdm(smiles_df.iterrows(), total=len(smiles_df), desc="Processing drugs"):
    smiles = row['Ligand SMILES']
    drug_id = row['DrugBank ID']
    
    # Get ChemBERTa embeddings
    chemberta_emb = get_chemberta_embedding(smiles)
    
    # Append the DrugBank ID and the embeddings as a row
    drug_embeddings.append([drug_id] + chemberta_emb.tolist())

# Convert to a DataFrame with proper column names
# The width is taken from the last embedding computed in the loop above, so this
# line needs at least one row in smiles_df (otherwise chemberta_emb is undefined).
embedding_columns = [f"embedding_dim_{i+1}" for i in range(chemberta_emb.shape[0])]
drug_embeddings_df = pd.DataFrame(drug_embeddings, columns=['DrugBank ID'] + embedding_columns)

# Save drug embeddings to CSV
drug_embeddings_df.to_csv(os.path.join(working_dir, 'ChemBERT_drug_embeddings.csv'), index=False)

# Initialize the tokenizer and model for TAPE (Protein embeddings)
tokenizer_tape = TAPETokenizer(vocab="iupac")
model_tape = ProteinBertModel.from_pretrained("bert-base")

# Function to compute Morgan fingerprints (for drugs)
def get_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    """Return the Morgan bit-vector fingerprint of ``smiles`` as a NumPy array.

    Args:
        smiles: SMILES string to parse with RDKit.
        radius: Morgan radius passed to RDKit.
        n_bits: Length of the bit vector.

    Returns:
        An array of length ``n_bits``. When RDKit cannot parse the SMILES the
        string is printed and an all-zero vector is returned instead.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
        return np.array(bit_vector)

    # Unparseable SMILES: report it and fall back to a zero vector
    print(f"Invalid SMILES: {smiles}")
    return np.zeros(n_bits)

# Function to clean and validate protein sequences
def clean_sequence(sequence):
    """Return ``sequence`` upper-cased with every non-standard residue removed.

    Only the 20 standard amino-acid letters are kept; the case of the input
    is ignored when deciding what to keep.
    """
    allowed = frozenset("ACDEFGHIKLMNPQRSTVWY")
    kept = []
    for residue in sequence:
        if residue.upper() in allowed:
            kept.append(residue)
    return ''.join(kept).upper()

# Function to compute TAPE embeddings (for proteins)
def get_tape_embedding(sequence):
    """Return the mean-pooled TAPE embedding of a protein sequence as a NumPy array.

    The sequence is first filtered with ``clean_sequence``. If nothing is left,
    the original sequence is printed and a zero vector of length
    ``model_tape.config.hidden_size`` is returned. Relies on the module-level
    ``tokenizer_tape`` and ``model_tape``.
    """
    residues = clean_sequence(sequence)
    if len(residues) == 0:
        print(f"Invalid sequence: {sequence}")
        return np.zeros(model_tape.config.hidden_size)  # zero vector for invalid sequences

    # Tokenize and wrap in a batch of one
    token_ids = tokenizer_tape.convert_tokens_to_ids(tokenizer_tape.tokenize(residues))
    input_ids = torch.tensor([token_ids]).to(torch.long)

    # Forward pass; the first element of the returned tuple holds the hidden states
    with torch.no_grad():
        hidden_states = model_tape(input_ids)[0]

    # Mean pooling over the token axis gives a fixed-size embedding
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().cpu().numpy()

# Initialize lists to store the results
# drug_morgan_fps / drug_ids are filled in lock-step by the first loop below;
# protein_tape_embeddings / protein_ids by the TAPE loop further down.
drug_morgan_fps = []
drug_ids = []
protein_tape_embeddings = []
protein_ids = []

# Extract Morgan fingerprints for drugs
print("Processing Morgan fingerprints for drugs...")
for _, row in tqdm(smiles_df.iterrows(), total=len(smiles_df)):
    smiles = row['Ligand SMILES']
    drug_id = row['DrugBank ID']
    
    # Compute Morgan fingerprint
    morgan_fp = get_morgan_fingerprint(smiles)
    
    # Store the results
    drug_morgan_fps.append(morgan_fp)
    drug_ids.append(drug_id)

# Convert drug fingerprints to a numpy array and save to a CSV file
# One row per drug, one 'fingerprint_<k>' column per bit (1-based).
drug_morgan_fps = np.array(drug_morgan_fps)
drug_fps_df = pd.DataFrame(drug_morgan_fps, columns=[f'fingerprint_{i+1}' for i in range(drug_morgan_fps.shape[1])])
drug_fps_df.insert(0, 'DrugBank ID', drug_ids)  # Insert DrugBank ID as the first column
drug_fps_df.to_csv(os.path.join(working_dir, 'drug_morgan_fingerprints.csv'), index=False)
print("Morgan fingerprints for drugs saved to 'drug_morgan_fingerprints.csv'.")

# Extract TAPE embeddings for proteins
print("Processing TAPE embeddings for proteins...")
for _, row in tqdm(proteins_df.iterrows(), total=len(proteins_df)):
    sequence = row['Sequence']
    protein_id = row['UniProt ID']
    
    # Compute TAPE embedding
    # Best-effort: a protein whose embedding raises is reported and left out of
    # the output file, so the CSV may contain fewer rows than proteins_df.
    try:
        tape_emb = get_tape_embedding(sequence)
        protein_tape_embeddings.append(tape_emb)
        protein_ids.append(protein_id)
    except Exception as e:
        print(f"Error processing protein {protein_id}: {e}")
        continue  # Skip this protein and move on to the next

# Convert protein embeddings to a numpy array and save to a CSV file
# NOTE: the columns are named 'embedding_dim_<k>', the same names used for the
# ChemBERTa and ProstT5 CSVs written elsewhere in this file.
protein_tape_embeddings = np.array(protein_tape_embeddings)
protein_emb_df = pd.DataFrame(protein_tape_embeddings, columns=[f'embedding_dim_{i+1}' for i in range(protein_tape_embeddings.shape[1])])
protein_emb_df.insert(0, 'UniProt ID', protein_ids)  # Insert UniProt ID as the first column
protein_emb_df.to_csv(os.path.join(working_dir, 'protein_tape_embeddings.csv'), index=False)
print("TAPE embeddings for proteins saved to 'protein_tape_embeddings.csv'.")

# Load the protein data from the CSV file
proteins_df = pd.read_csv(os.path.join(working_dir, 'unique_proteins.csv'))

# Initialize the tokenizer and model for ProstT5 (from Hugging Face)
model_name = "Rostlab/ProstT5"
tokenizer = T5Tokenizer.from_pretrained(model_name, do_lower_case=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Set up the device to use (GPU if available, otherwise CPU)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Use half-precision on GPUs, full-precision on CPU.
# `device` is a torch.device object, so the check must look at its `.type`;
# comparing the object itself with the string 'cpu' is not a reliable test and
# could send the CPU path into half precision.
if device.type == 'cpu':
    model.float()
else:
    model.half()

# Function to clean and validate protein sequences
def clean_sequence(sequence):
    """Strip everything but the 20 standard amino-acid letters and upper-case the result."""
    # Keep only the 20 standard amino-acid letters (case-insensitive match)
    standard_residues = frozenset("ACDEFGHIKLMNPQRSTVWY")
    surviving = filter(lambda residue: residue.upper() in standard_residues, sequence)
    return ''.join(surviving).upper()

# Function to get ProstT5 embeddings for a protein sequence
def get_prostt5_embedding(sequence):
    """Return the mean-pooled ProstT5 encoder embedding of a protein sequence.

    The sequence is filtered with ``clean_sequence`` first. If nothing is left,
    the original sequence is printed and a zero vector of length
    ``model.config.d_model`` is returned. Relies on the module-level
    ``tokenizer``, ``model`` and ``device``.

    Args:
        sequence: Amino-acid sequence (any case; non-standard letters are dropped).

    Returns:
        A 1-D NumPy array: the encoder's last hidden state averaged over tokens.
    """
    # Clean and validate the sequence
    cleaned_sequence = clean_sequence(sequence)

    if not cleaned_sequence:
        print(f"Invalid sequence: {sequence}")
        return np.zeros(model.config.d_model)  # Return a zero vector if the sequence is invalid

    # ProstT5 expects one token per residue, i.e. residues separated by single
    # spaces after the "<AA2fold>" prefix. Passing the raw, unspaced string does
    # not tokenize into per-residue tokens.
    sequence_with_prefix = "<AA2fold> " + " ".join(cleaned_sequence)

    # Tokenize the sequence and convert to tensor
    inputs = tokenizer(sequence_with_prefix, return_tensors="pt", padding=True, truncation=True).to(device)

    # Forward pass through the ProstT5 encoder to get the hidden states
    with torch.no_grad():
        outputs = model.encoder(**inputs)

    # Mean pooling over all tokens. The model may be running in half precision,
    # so pool in float32 to avoid storing a low-precision average.
    hidden_states = outputs.last_hidden_state.float()
    embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy()

    return embedding

# Initialize lists to store protein embeddings and protein IDs
# NOTE: protein_ids is re-bound here, discarding the list filled by the TAPE loop.
protein_embeddings = []
protein_ids = []

# Loop over the protein sequences and extract embeddings
for _, row in tqdm(proteins_df.iterrows(), total=len(proteins_df), desc="Processing proteins"):
    sequence = row['Sequence']
    protein_id = row['UniProt ID']
    
    # Get the ProstT5 embedding for each protein sequence
    # Best-effort: a failing protein is reported and omitted from the output.
    try:
        embedding = get_prostt5_embedding(sequence)
        protein_embeddings.append(embedding)
        protein_ids.append(protein_id)
    except Exception as e:
        print(f"Error processing protein {protein_id}: {e}")
        continue  # Skip this protein and move on to the next

# Convert the list of embeddings into a numpy array and then into a DataFrame
# Indexing shape[1] requires at least one successful embedding.
protein_embeddings = np.array(protein_embeddings)
embedding_columns = [f'embedding_dim_{i+1}' for i in range(protein_embeddings.shape[1])]
protein_emb_df = pd.DataFrame(protein_embeddings, columns=embedding_columns)

# Insert the UniProt IDs as the first column
protein_emb_df.insert(0, 'UniProt ID', protein_ids)

# Save the embeddings to a CSV file
protein_emb_df.to_csv(os.path.join(working_dir, 'protein_prostt5_embeddings.csv'), index=False)

print("ProstT5 embeddings for proteins saved to 'protein_prostt5_embeddings.csv'.")

In [None]:
##########################################Step3#############################################################################################
#70-15-15 split
#Load the embeddings and Kd values from CSV files
chemberta_drug_embeddings = pd.read_csv(os.path.join(working_dir, 'ChemBERT_drug_embeddings.csv'))
morgan_drug_fingerprints = pd.read_csv(os.path.join(working_dir, 'drug_morgan_fingerprints.csv'))
prostt5_protein_embeddings = pd.read_csv(os.path.join(working_dir, 'protein_prostt5_embeddings.csv'))
tape_protein_embeddings = pd.read_csv(os.path.join(working_dir, 'protein_tape_embeddings.csv'))
kd_values = pd.read_csv(os.path.join(working_dir, 'Links_Kd_Scores.csv'))  # with 'DrugBank ID', 'UniProt ID', 'Kd'

#Log-normalize the Kd values
kd_values['Log_Kd'] = kd_values['Kd(nM)'].apply(lambda x: math.log10(x + 1))  # Adding 1 to avoid log(0)

# The ChemBERTa, ProstT5 and TAPE CSVs all name their feature columns
# 'embedding_dim_<k>'. Merging them as-is makes pandas add '_x'/'_y' suffixes to
# the colliding names, after which selecting 'embedding_dim_<k>' from the merged
# frame returns ChemBERTa columns for all three sources. Give every source its
# own prefix before merging so each column list selects its own features.
chemberta_drug_embeddings = chemberta_drug_embeddings.rename(
    columns=lambda col: col if col == 'DrugBank ID' else f'chemberta_{col}')
prostt5_protein_embeddings = prostt5_protein_embeddings.rename(
    columns=lambda col: col if col == 'UniProt ID' else f'prostt5_{col}')
tape_protein_embeddings = tape_protein_embeddings.rename(
    columns=lambda col: col if col == 'UniProt ID' else f'tape_{col}')

#Merge embeddings
# Merge drug embeddings (ChemBERTa + Morgan fingerprints) using 'DrugBank ID'
drug_embeddings = pd.merge(chemberta_drug_embeddings, morgan_drug_fingerprints, on='DrugBank ID')

# Merge protein embeddings (ProstT5 + TAPE embeddings) using 'UniProt ID'
protein_embeddings = pd.merge(prostt5_protein_embeddings, tape_protein_embeddings, on='UniProt ID')

# Merge the drug-protein pairs with Kd values (inner joins: pairs without
# embeddings on either side are dropped)
merged_df = pd.merge(kd_values, drug_embeddings, on='DrugBank ID')
merged_df = pd.merge(merged_df, protein_embeddings, on='UniProt ID')

#Z-score normalization for ChemBERTa, ProstT5, and TAPE embeddings
# Column lists now hold the prefixed (unique) names, e.g. 'tape_embedding_dim_1'.
chemberta_cols = [col for col in chemberta_drug_embeddings.columns if 'embedding_dim_' in col]
prostt5_cols = [col for col in prostt5_protein_embeddings.columns if 'embedding_dim_' in col]
tape_cols = [col for col in tape_protein_embeddings.columns if 'embedding_dim_' in col]
morgan_cols = [col for col in morgan_drug_fingerprints.columns if 'fingerprint_' in col]  # No normalization for Morgan

# Normalizing ChemBERTa, ProstT5, and TAPE embeddings
scaler_chemberta = StandardScaler()
scaler_prostt5 = StandardScaler()
scaler_tape = StandardScaler()

# Normalize embeddings
# NOTE(review): the scalers are fitted on the full dataset before the
# train/val/test split, so test-set statistics influence the training features.
merged_df[chemberta_cols] = scaler_chemberta.fit_transform(merged_df[chemberta_cols])
merged_df[prostt5_cols] = scaler_prostt5.fit_transform(merged_df[prostt5_cols])
merged_df[tape_cols] = scaler_tape.fit_transform(merged_df[tape_cols])

#Attention mechanism to combine embeddings
class AttentionCombiner(nn.Module):
    """Project four feature sets and re-weight them with attention over the sets.

    Each input (ChemBERTa, Morgan, ProstT5, TAPE) goes through its own
    same-size linear layer. A second linear layer maps the concatenated
    projections to one logit per feature set; a softmax over those four logits
    gives per-sample weights, and every projected block is scaled by its weight.

    The previous version produced a single logit and applied softmax over that
    one-element axis, which always yields 1.0, so the "attention" never changed
    the features.

    Args:
        chemberta_dim, morgan_dim, prostt5_dim, tape_dim: Widths of the inputs.

    Forward returns a tensor of shape (batch, sum of the four widths).
    """

    def __init__(self, chemberta_dim, morgan_dim, prostt5_dim, tape_dim):
        super().__init__()
        self.chemberta_fc = nn.Linear(chemberta_dim, chemberta_dim)
        self.morgan_fc = nn.Linear(morgan_dim, morgan_dim)
        self.prostt5_fc = nn.Linear(prostt5_dim, prostt5_dim)
        self.tape_fc = nn.Linear(tape_dim, tape_dim)

        # One attention logit per feature set (4), not a single scalar
        self.attention_layer = nn.Linear(chemberta_dim + morgan_dim + prostt5_dim + tape_dim, 4)

    def forward(self, chemberta, morgan, prostt5, tape):
        # Apply linear layers
        projected = [
            self.chemberta_fc(chemberta),
            self.morgan_fc(morgan),
            self.prostt5_fc(prostt5),
            self.tape_fc(tape),
        ]

        # Concatenate embeddings
        combined = torch.cat(projected, dim=1)

        # Softmax across the four feature sets -> (batch, 4), rows sum to 1
        attention_weights = torch.softmax(self.attention_layer(combined), dim=1)

        # Scale each projected block by its own weight and concatenate again
        weighted = [
            attention_weights[:, idx:idx + 1] * block
            for idx, block in enumerate(projected)
        ]
        return torch.cat(weighted, dim=1)

# Convert embeddings to tensors and apply attention mechanism
chemberta_tensor = torch.tensor(merged_df[chemberta_cols].values, dtype=torch.float32)
morgan_tensor = torch.tensor(merged_df[morgan_cols].values, dtype=torch.float32)
prostt5_tensor = torch.tensor(merged_df[prostt5_cols].values, dtype=torch.float32)
tape_tensor = torch.tensor(merged_df[tape_cols].values, dtype=torch.float32)

# Initialize attention combiner
# NOTE(review): the combiner is used straight after construction; no training
# step, checkpoint load or random seed is visible in this part of the file, so
# its weights are whatever the default initialisation produced on this run.
combiner = AttentionCombiner(
    chemberta_dim=len(chemberta_cols),
    morgan_dim=len(morgan_cols),
    prostt5_dim=len(prostt5_cols),
    tape_dim=len(tape_cols)
)

# Apply attention mechanism to get the combined embedding for each drug-protein pair
with torch.no_grad():
    combined_embeddings = combiner(chemberta_tensor, morgan_tensor, prostt5_tensor, tape_tensor)

# Convert combined embeddings back to DataFrame
# No column names are given, so the feature columns are labelled 0, 1, 2, ...
combined_embeddings_df = pd.DataFrame(combined_embeddings.numpy())

# Add combined embeddings and log-normalized Kd values to final dataset
# Layout: the first three columns are IDs + target, every later column is a feature.
final_dataset = pd.concat([merged_df[['DrugBank ID', 'UniProt ID', 'Log_Kd']], combined_embeddings_df], axis=1)

#Split the dataset into training (70%) and remaining (30%)
train_df, remaining_df = train_test_split(final_dataset, test_size=0.3, random_state=42)

#Split the remaining dataset into validation (15%) and test (15%)
val_df, test_df = train_test_split(remaining_df, test_size=0.5, random_state=42)

# Save the splits to CSV files in the working directory
train_df.to_csv(os.path.join(working_dir, 'train_dataset.csv'), index=False)
val_df.to_csv(os.path.join(working_dir, 'val_dataset.csv'), index=False)
test_df.to_csv(os.path.join(working_dir, 'test_dataset.csv'), index=False)

# Print a message to confirm successful completion
print("Data processing complete! Training, validation, and test datasets saved in the working directory.")

In [None]:
#################################################################Analysis#################################################
# Load the datasets
# Re-read the three splits from disk (rather than reusing in-memory frames) so
# the checks below see exactly what was written to the CSV files.
train_df = pd.read_csv(os.path.join(working_dir, 'train_dataset.csv'))
val_df = pd.read_csv(os.path.join(working_dir, 'val_dataset.csv'))
test_df = pd.read_csv(os.path.join(working_dir, 'test_dataset.csv'))

# Function to assess dataset
def assess_dataset(dataset, name):
    """Print a data-quality report for ``dataset`` and return its column count.

    The report covers: shape, per-column NaN counts, columns holding a single
    distinct value, ``describe()`` statistics, dtypes, and per-column counts of
    infinite values (both +inf and -inf).

    Args:
        dataset: pandas DataFrame to inspect.
        name: Label used in the printed header.

    Returns:
        The number of columns in ``dataset``.
    """
    print(f"\nAssessing {name} dataset:")

    # Check the shape of the dataset
    print(f"Shape: {dataset.shape}")

    # Check for NaN values
    nan_values = dataset.isna().sum()
    if nan_values.any():
        print(f"Missing values (NaNs) detected in the following columns:\n{nan_values[nan_values > 0]}")
    else:
        print("No missing values (NaNs) detected.")

    # Check for columns with constant values (variance = 0)
    constant_columns = dataset.columns[dataset.nunique() == 1]
    if len(constant_columns) > 0:
        print(f"Columns with constant values: {constant_columns.tolist()}")
    else:
        print("No columns with constant values detected.")

    # Summary statistics
    print("\nSummary statistics:")
    print(dataset.describe())

    # Check data types of each column
    print("\nData types of each column:")
    print(dataset.dtypes)

    # Check for any infinite values. Comparing with float('inf') alone would
    # miss negative infinity, so test membership in {+inf, -inf}.
    inf_values = dataset.isin([np.inf, -np.inf]).sum()
    if inf_values.any():
        print(f"Infinite values detected in the following columns:\n{inf_values[inf_values > 0]}")
    else:
        print("No infinite values detected.")

    # The caller compares this count across the train/val/test sets
    return dataset.shape[1]

# Assess each dataset
# Each call prints a report and returns that dataset's column count.
train_columns = assess_dataset(train_df, 'Training')
val_columns = assess_dataset(val_df, 'Validation')
test_columns = assess_dataset(test_df, 'Test')

# Check if the number of columns is consistent across train, val, and test sets
# (only the counts are compared, not the column names)
if train_columns == val_columns == test_columns:
    print("\nAll datasets have the same number of features.")
else:
    print("\nWarning: The number of features (columns) is inconsistent across the datasets.")

In [None]:
##################################################################Step4###############################################################
# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): these '*_reduced_dataset.csv' files are not written anywhere in
# the visible part of this file (the split cell writes 'train_dataset.csv',
# 'val_dataset.csv' and 'test_dataset.csv'); presumably they come from a
# separate reduction step - confirm before running.
train_file = os.path.join(working_dir, 'train_reduced_dataset.csv')
val_file = os.path.join(working_dir, 'val_reduced_dataset.csv')
test_file = os.path.join(working_dir, 'test_reduced_dataset.csv')

# Dataset class
class KDDataset(Dataset):
    """Torch dataset yielding (feature vector, Log_Kd) pairs from a DataFrame.

    Features are every column from position 3 onward; the target is the
    'Log_Kd' column. Both are returned as float32 tensors.
    """

    def __init__(self, data):
        # Columns 0-2 are skipped; everything after them is treated as features
        feature_frame = data.iloc[:, 3:]
        self.drug_protein_embedding = feature_frame.values
        self.log_kd = data['Log_Kd'].values

    def __len__(self):
        return len(self.log_kd)

    def __getitem__(self, idx):
        features = torch.tensor(self.drug_protein_embedding[idx], dtype=torch.float32)
        target = torch.tensor(self.log_kd[idx], dtype=torch.float32)
        return features, target

# Load datasets
def load_data(file_path):
    """Read the CSV at ``file_path`` and wrap its contents in a ``KDDataset``."""
    return KDDataset(pd.read_csv(file_path))

# Load the datasets
train_dataset = load_data(train_file)
val_dataset = load_data(val_file)
test_dataset = load_data(test_file)

# DataLoader
# Only the training loader shuffles; validation and test keep file order.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# FCNN Model
class FCNN(nn.Module):
    """Fully connected regressor: input_dim -> 1024 -> 512 -> 256 -> 1.

    ReLU follows each of the three hidden layers; the output layer is linear.
    Forward returns a tensor of shape (batch, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        # Attribute names fc1..fc4 are the keys of the state_dict saved later
        self.fc1 = nn.Linear(input_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 1)  # Output layer

    def forward(self, x):
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.relu(layer(hidden))
        return self.fc4(hidden)

# Get the input dimension
# Taken from the feature vector of the first training sample.
input_dim = train_dataset[0][0].shape[0]

# Instantiate the model
# NOTE: re-binds the module-level name `model`, which an earlier cell used for
# the ProstT5 network.
model = FCNN(input_dim).to(device)

# Define optimizer and loss function
# train_model() below reads `optimizer` and `criterion` as module-level globals.
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Training loop (without early stopping)
def train_model(model, train_loader, val_loader, num_epochs=50):
    """Train ``model`` for a fixed number of epochs and track the mean batch losses.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``. The
    optimizer must therefore have been built from the parameters of the same
    ``model`` that is passed in. There is no early stopping.

    Args:
        model: Network mapping a (batch, features) tensor to (batch, 1).
        train_loader: Iterable of (features, log_kd) training batches.
        val_loader: Iterable of (features, log_kd) validation batches.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        (model, train_losses, val_losses), where each loss list holds one
        per-epoch average of the batch losses.
    """
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for drug_embedding, log_kd in train_loader:
            drug_embedding, log_kd = drug_embedding.to(device), log_kd.to(device)

            optimizer.zero_grad()
            outputs = model(drug_embedding)
            # squeeze(-1) removes only the trailing unit axis. A bare squeeze()
            # would also drop the batch axis of a size-1 batch and leave the
            # prediction with a different shape than the target.
            loss = criterion(outputs.squeeze(-1), log_kd)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_losses.append(train_loss / len(train_loader))

        # Validation loop
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for drug_embedding, log_kd in val_loader:
                drug_embedding, log_kd = drug_embedding.to(device), log_kd.to(device)
                outputs = model(drug_embedding)
                loss = criterion(outputs.squeeze(-1), log_kd)
                val_loss += loss.item()

        val_losses.append(val_loss / len(val_loader))

        print(f"Epoch [{epoch + 1}/{num_epochs}] - Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

    return model, train_losses, val_losses


#evaluate model
def evaluate_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print/return regression metrics.

    Uses the module-level ``device``. Predictions are flattened to one value
    per sample before any metric is computed, so they line up element-wise
    with the targets.

    Args:
        model: Network mapping a (batch, features) tensor to (batch, 1).
        test_loader: Iterable of (features, log_kd) batches.

    Returns:
        (mse, rmse, mae, r2, mape). ``mape`` is a percentage computed over the
        samples whose target is non-zero, or NaN when every target is zero.
    """
    model.eval()
    test_loss = 0.0
    criterion = nn.MSELoss()

    predictions = []
    targets = []

    with torch.no_grad():
        for drug_embedding, log_kd in test_loader:
            drug_embedding = drug_embedding.to(device)
            log_kd = log_kd.to(device)

            # (batch, 1) -> (batch,); keeps the batch axis even for a size-1 batch
            outputs = model(drug_embedding).squeeze(-1)

            loss = criterion(outputs, log_kd)
            test_loss += loss.item()

            predictions.append(outputs.cpu().numpy())
            targets.append(log_kd.cpu().numpy())

    predictions = np.concatenate(predictions)
    targets = np.concatenate(targets)

    mse = mean_squared_error(targets, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(targets, predictions)
    r2 = r2_score(targets, predictions)

    # Both arrays are 1-D here. With (N, 1) predictions the subtraction would
    # broadcast to an (N, N) matrix and the "MAPE" would average all pairs.
    # Zero targets are excluded to avoid dividing by zero.
    nonzero = targets != 0
    if nonzero.any():
        mape = np.mean(np.abs((targets[nonzero] - predictions[nonzero]) / targets[nonzero])) * 100  # MAPE in percentage
    else:
        mape = float('nan')

    print(f"Test Loss: {test_loss / len(test_loader):.4f}")
    print(f"Test MSE: {mse:.4f}, Test RMSE: {rmse:.4f}, Test MAE: {mae:.4f}, Test R²: {r2:.4f}, Test MAPE: {mape:.2f}%")

    return mse, rmse, mae, r2, mape

# Function to plot training and validation loss
def plot_training_history(train_losses, val_losses):
    """Show a line plot of the per-epoch training and validation losses."""
    plt.figure()
    curves = (
        (train_losses, 'Training Loss'),
        (val_losses, 'Validation Loss'),
    )
    for series, legend_label in curves:
        plt.plot(series, label=legend_label)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss over Epochs')
    plt.legend()
    plt.show()

# Train the model
# Runs with train_model's default of 50 epochs.
trained_model, train_losses, val_losses = train_model(model, train_loader, val_loader)

# Plot training and validation loss
plot_training_history(train_losses, val_losses)

# Evaluate the model on the test set and gather metrics
mse, rmse, mae, r2, mape = evaluate_model(trained_model, test_loader)

# Visualize the additional metrics
def plot_metrics(mse, rmse, mae, r2, mape):
    """Show a bar chart with one bar per test-set metric.

    All five values share one axis, so metrics on very different scales
    (e.g. MAPE in percent next to R²) are drawn against the same range.
    """
    bar_labels = ['MSE', 'RMSE', 'MAE', 'R²', 'MAPE (%)']
    bar_heights = [mse, rmse, mae, r2, mape]

    plt.figure()
    plt.bar(bar_labels, bar_heights)
    plt.xlabel('Metrics')
    plt.ylabel('Value')
    plt.title('Evaluation Metrics on Test Set')
    plt.show()

# Plot evaluation metrics
plot_metrics(mse, rmse, mae, r2, mape)

# Save the model
# Only the weights (state_dict) are stored; loading them back requires
# constructing FCNN with the same input_dim first.
torch.save(trained_model.state_dict(), os.path.join(working_dir, 'fcnn_model.pth'))

In [None]:
# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the working directory and file paths
new_drug_file = os.path.join(working_dir, 'New_drugs.csv')

# Load new drugs
# The loops below read its 'Ligand SMILES' and 'DrugBank ID' columns.
new_drugs_df = pd.read_csv(new_drug_file)

################################### ChemBERTa Embeddings ###################################

# Load ChemBERTa model and tokenizer
# Unlike the first ChemBERTa cell, the model is moved to `device` here.
tokenizer_chemberta = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model_chemberta = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1").to(device)

# Function to extract ChemBERTa embeddings
def get_chemberta_embedding(smiles):
    """Return the mean-pooled ChemBERTa embedding of ``smiles`` as a flat NumPy array.

    Inputs are moved to the module-level ``device`` before the forward pass
    and the pooled result is copied back to the CPU.
    """
    encoded = tokenizer_chemberta(smiles, return_tensors="pt", padding=True, truncation=True)
    encoded = encoded.to(device)
    with torch.no_grad():
        model_output = model_chemberta(**encoded)
    # Average the last hidden state over the token axis
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.cpu().numpy().flatten()

# Initialize lists to store embeddings
# Each entry is one row: [DrugBank ID, dim_1, dim_2, ...]
new_drug_embeddings = []

# Loop through each drug and extract ChemBERTa embeddings
for _, row in new_drugs_df.iterrows():
    smiles = row['Ligand SMILES']
    drug_id = row['DrugBank ID']
    
    # Get ChemBERTa embedding
    chemberta_emb = get_chemberta_embedding(smiles)
    
    # Store embedding with DrugBank ID
    new_drug_embeddings.append([drug_id] + chemberta_emb.tolist())

# Convert embeddings to DataFrame
# Column count comes from the first row (minus the ID), so at least one drug is required.
chemberta_df = pd.DataFrame(new_drug_embeddings, columns=['DrugBank ID'] + [f'embedding_dim_{i+1}' for i in range(len(new_drug_embeddings[0]) - 1)])

# Save ChemBERTa embeddings to CSV
chemberta_output_file = os.path.join(working_dir, 'New_ChemBERT_drug_embeddings.csv')
chemberta_df.to_csv(chemberta_output_file, index=False)

print(f"ChemBERTa embeddings saved to {chemberta_output_file}")

################################### Morgan Fingerprints ###################################

# Function to compute Morgan fingerprints
def get_morgan_fingerprint(smiles, radius=2, n_bits=2048):
    """Return the Morgan bit-vector fingerprint of ``smiles`` as a NumPy array.

    Args:
        smiles: SMILES string to parse with RDKit.
        radius: Morgan radius passed to RDKit.
        n_bits: Length of the bit vector.

    Returns:
        An array of length ``n_bits``; all zeros when the SMILES cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Report the problem instead of silently emitting a zero vector, as the
        # earlier definition of this function in the file already does.
        print(f"Invalid SMILES: {smiles}")
        return np.zeros(n_bits)  # Return a zero vector for invalid SMILES
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(fp)

# Initialize list to store fingerprints
# Each entry is one row: [DrugBank ID, bit_1, bit_2, ...]
new_morgan_fingerprints = []

# Loop through each drug and compute Morgan fingerprints
for _, row in new_drugs_df.iterrows():
    smiles = row['Ligand SMILES']
    drug_id = row['DrugBank ID']
    
    # Get Morgan fingerprint
    morgan_fp = get_morgan_fingerprint(smiles)
    
    # Store fingerprint with DrugBank ID
    new_morgan_fingerprints.append([drug_id] + morgan_fp.tolist())

# Convert fingerprints to DataFrame
# Column count comes from the first row (minus the ID), so at least one drug is required.
morgan_df = pd.DataFrame(new_morgan_fingerprints, columns=['DrugBank ID'] + [f'fingerprint_{i+1}' for i in range(len(new_morgan_fingerprints[0]) - 1)])

# Save Morgan fingerprints to CSV
morgan_output_file = os.path.join(working_dir, 'New_drug_morgan_fingerprints.csv')
morgan_df.to_csv(morgan_output_file, index=False)

print(f"Morgan fingerprints saved to {morgan_output_file}")

import os
import torch
import pandas as pd
from transformers import T5Tokenizer, T5EncoderModel
from tape import ProteinBertModel, TAPETokenizer

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

new_protein_file = os.path.join(working_dir, 'Book.csv')

# Load new proteins
# The loops below read its 'Sequence' and 'UniProt ID' columns.
new_proteins_df = pd.read_csv(new_protein_file)

################################### ProstT5 Embeddings ###################################

# Load ProstT5 tokenizer and model
# Encoder-only variant; the earlier ProstT5 cell loaded the full seq2seq model
# and called its encoder.
tokenizer_prostt5 = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)
model_prostt5 = T5EncoderModel.from_pretrained('Rostlab/ProstT5').to(device)

# Function to preprocess and tokenize protein sequences for ProstT5
def preprocess_and_tokenize_prostt5(sequences):
    """Tokenize protein sequences for ProstT5 and move the batch to ``device``.

    Every sequence is rewritten as "<AA2fold> " followed by its characters
    separated by single spaces, then the batch is padded to its longest member.
    The sequences are used as given (no cleaning or case change happens here).
    """
    prepared = ["<AA2fold> " + " ".join(list(seq)) for seq in sequences]
    encoded = tokenizer_prostt5.batch_encode_plus(
        prepared,
        add_special_tokens=True,
        padding="longest",
        return_tensors='pt',
    )
    return encoded.to(device)

# Function to extract ProstT5 embeddings
def extract_prostt5_embeddings(sequences):
    """Return one mean-pooled ProstT5 embedding (NumPy array) per input sequence.

    For each sequence the pooled positions are 1 .. (number of attended
    tokens - 1): position 0 and the padded positions are left out.
    NOTE(review): the last attended position is included in the mean; it
    presumably is the end-of-sequence token - confirm whether that is intended.
    """
    batch = preprocess_and_tokenize_prostt5(sequences)
    with torch.no_grad():
        model_output = model_prostt5(input_ids=batch.input_ids, attention_mask=batch.attention_mask)

    pooled_embeddings = []
    for mask_row, token_states in zip(batch.attention_mask, model_output.last_hidden_state):
        seq_len = (mask_row == 1).sum() - 1  # Exclude padding
        residue_states = token_states[1:seq_len + 1]
        pooled_embeddings.append(residue_states.mean(dim=0).cpu().numpy())  # Mean pooling
    return pooled_embeddings

# Extract ProstT5 embeddings for all proteins
# Sequences are embedded one at a time (batch of one per call).
new_protein_embeddings_prostt5 = []
for _, row in new_proteins_df.iterrows():
    sequence = row['Sequence']
    protein_id = row['UniProt ID']
    
    # Extract ProstT5 embedding
    prostt5_emb = extract_prostt5_embeddings([sequence])[0]
    
    # Store embedding with UniProt ID
    new_protein_embeddings_prostt5.append([protein_id] + prostt5_emb.tolist())

# Convert ProstT5 embeddings to DataFrame
# Column count comes from the first row (minus the ID), so at least one protein is required.
prostt5_df = pd.DataFrame(new_protein_embeddings_prostt5, columns=['UniProt ID'] + [f'embedding_dim_{i+1}' for i in range(len(new_protein_embeddings_prostt5[0]) - 1)])

# Save ProstT5 embeddings to CSV
prostt5_output_file = os.path.join(working_dir, 'New_protein_prostt5_embeddings.csv')
prostt5_df.to_csv(prostt5_output_file, index=False)

print(f"ProstT5 embeddings saved to {prostt5_output_file}")

################################### TAPE Embeddings ###################################

# Load TAPE tokenizer and model
# The model is not moved to `device`; extract_tape_embeddings below feeds it
# tensors that are likewise never moved.
tokenizer_tape = TAPETokenizer(vocab="iupac")
model_tape = ProteinBertModel.from_pretrained("bert-base")

# Function to extract TAPE embeddings
def extract_tape_embeddings(sequence):
    """Return the mean-pooled TAPE embedding of ``sequence`` as a NumPy array.

    The sequence is tokenized as given (no cleaning) and run through the
    module-level ``model_tape`` as a batch of one.
    """
    token_ids = tokenizer_tape.convert_tokens_to_ids(tokenizer_tape.tokenize(sequence))
    input_ids = torch.tensor([token_ids]).to(torch.long)
    with torch.no_grad():
        model_output = model_tape(input_ids)
    # First element of the output holds the hidden states; average over tokens
    pooled = model_output[0].mean(dim=1)
    return pooled.squeeze().cpu().numpy()

# Extract TAPE embeddings for all proteins
# Each entry is one row: [UniProt ID, dim_1, dim_2, ...]
new_protein_embeddings_tape = []
for _, row in new_proteins_df.iterrows():
    sequence = row['Sequence']
    protein_id = row['UniProt ID']
    
    # Extract TAPE embedding
    tape_emb = extract_tape_embeddings(sequence)
    
    # Store embedding with UniProt ID
    new_protein_embeddings_tape.append([protein_id] + tape_emb.tolist())

# Convert TAPE embeddings to DataFrame
# Column count comes from the first row (minus the ID), so at least one protein is required.
tape_df = pd.DataFrame(new_protein_embeddings_tape, columns=['UniProt ID'] + [f'embedding_dim_{i+1}' for i in range(len(new_protein_embeddings_tape[0]) - 1)])

# Save TAPE embeddings to CSV
tape_output_file = os.path.join(working_dir, 'New_protein_tape_embeddings.csv')
tape_df.to_csv(tape_output_file, index=False)

print(f"TAPE embeddings saved to {tape_output_file}")

import pandas as pd
import os
from itertools import product

# Load the drugs and proteins data into pandas dataframes
# NOTE: re-binds proteins_df, which earlier cells used for 'unique_proteins.csv'.
drugs_df = pd.read_csv(os.path.join(working_dir, 'New_drugs.csv'))  # New set of drugs
proteins_df = pd.read_csv(os.path.join(working_dir, 'Book.csv'))  # New set of proteins

# Generate all possible combinations of drug and protein pairs using Cartesian product
# The full list is materialised in memory: len(drugs_df) * len(proteins_df) tuples.
all_pairs = list(product(drugs_df['DrugBank ID'], proteins_df['UniProt ID']))

# Create a dataframe to store these pairs
pairs_df = pd.DataFrame(all_pairs, columns=['DrugBank ID', 'UniProt ID'])

# Save the pairs to a CSV file
pairs_file = os.path.join(working_dir, 'all_drug_protein_pairs.csv')
pairs_df.to_csv(pairs_file, index=False)

print(f"All possible drug-protein pairs saved to: {pairs_file}")


# Define AttentionCombiner class
class AttentionCombiner(nn.Module):
    """Combine four embeddings into one vector using per-source attention weights.

    Each input (ChemBERTa, Morgan, ProstT5, TAPE) is passed through its own
    square linear layer. The projected vectors are concatenated, and an
    attention layer produces one score per embedding source. The scores are
    softmax-normalised across the four sources, and each source's slice of the
    concatenated vector is scaled by its weight.

    Args:
        chemberta_dim: Width of the ChemBERTa embedding.
        morgan_dim: Width of the Morgan fingerprint vector.
        prostt5_dim: Width of the ProstT5 embedding.
        tape_dim: Width of the TAPE embedding.
    """

    def __init__(self, chemberta_dim, morgan_dim, prostt5_dim, tape_dim):
        super().__init__()
        self.chemberta_fc = nn.Linear(chemberta_dim, chemberta_dim)
        self.morgan_fc = nn.Linear(morgan_dim, morgan_dim)
        self.prostt5_fc = nn.Linear(prostt5_dim, prostt5_dim)
        self.tape_fc = nn.Linear(tape_dim, tape_dim)

        # One attention score per embedding source. A single output unit would
        # make softmax(dim=1) normalise over a size-1 axis, which is always 1.0
        # and turns the attention into a no-op.
        self.attention_layer = nn.Linear(
            chemberta_dim + morgan_dim + prostt5_dim + tape_dim, 4
        )

    def forward(self, chemberta, morgan, prostt5, tape):
        """Return the attention-weighted concatenation of the four embeddings.

        Args:
            chemberta: Tensor of shape (batch, chemberta_dim).
            morgan: Tensor of shape (batch, morgan_dim).
            prostt5: Tensor of shape (batch, prostt5_dim).
            tape: Tensor of shape (batch, tape_dim).

        Returns:
            Tensor of shape (batch, chemberta_dim + morgan_dim + prostt5_dim
            + tape_dim), laid out in the same source order as the arguments.
        """
        # Project each embedding with its own linear layer
        projected = (
            self.chemberta_fc(chemberta),
            self.morgan_fc(morgan),
            self.prostt5_fc(prostt5),
            self.tape_fc(tape),
        )

        # Score the four sources from the concatenated projections
        combined = torch.cat(projected, dim=1)
        attention_weights = torch.softmax(self.attention_layer(combined), dim=1)

        # Scale each source's block by its own weight; the slice keeps a
        # trailing axis of size 1 so it broadcasts over the block's features
        weighted_blocks = [
            attention_weights[:, source:source + 1] * block
            for source, block in enumerate(projected)
        ]
        return torch.cat(weighted_blocks, dim=1)

# Input tables: the generated drug-protein pair list plus one table per feature type
pair_file = os.path.join(working_dir, 'all_drug_protein_pairs.csv')
chemberta_file = os.path.join(working_dir, 'New_ChemBERT_drug_embeddings.csv')
morgan_file = os.path.join(working_dir, 'New_drug_morgan_fingerprints.csv')
prostt5_file = os.path.join(working_dir, 'New_protein_prostt5_embeddings.csv')
tape_file = os.path.join(working_dir, 'New_protein_tape_embeddings.csv')

# Read the five CSV files, in the order listed
pairs_df, chemberta_df, morgan_df, prostt5_df, tape_df = (
    pd.read_csv(csv_path)
    for csv_path in (pair_file, chemberta_file, morgan_file, prostt5_file, tape_file)
)

# Feature columns are recognised by a fragment of their name
chemberta_cols = [name for name in chemberta_df.columns if 'embedding_dim_' in name]
morgan_cols = [name for name in morgan_df.columns if 'fingerprint_' in name]
prostt5_cols = [name for name in prostt5_df.columns if 'embedding_dim_' in name]
tape_cols = [name for name in tape_df.columns if 'embedding_dim_' in name]

# Z-score normalization with one independently fitted scaler per table;
# the Morgan fingerprint columns are left as loaded
scaler_chemberta = StandardScaler()
scaler_prostt5 = StandardScaler()
scaler_tape = StandardScaler()

for _table, _feature_cols, _scaler in (
    (chemberta_df, chemberta_cols, scaler_chemberta),
    (prostt5_df, prostt5_cols, scaler_prostt5),
    (tape_df, tape_cols, scaler_tape),
):
    _table[_feature_cols] = _scaler.fit_transform(_table[_feature_cols])

#Map embeddings to drug-protein pairs
#
# Each embedding table is converted once into an ID -> feature-vector dictionary,
# so every pair costs four dictionary lookups instead of four full-table scans.
def _build_embedding_lookup(table, id_column, feature_columns):
    """Return ``{ID: 1-D feature array}`` for ``table``.

    Rows with a missing ID are dropped (they can never match a pair), and only
    the first row is kept for a repeated ID so that every vector has exactly
    ``len(feature_columns)`` entries.
    """
    usable_rows = table.dropna(subset=[id_column]).drop_duplicates(subset=id_column, keep='first')
    return dict(zip(usable_rows[id_column], usable_rows[feature_columns].to_numpy()))


chemberta_lookup = _build_embedding_lookup(chemberta_df, 'DrugBank ID', chemberta_cols)
morgan_lookup = _build_embedding_lookup(morgan_df, 'DrugBank ID', morgan_cols)
prostt5_lookup = _build_embedding_lookup(prostt5_df, 'UniProt ID', prostt5_cols)
tape_lookup = _build_embedding_lookup(tape_df, 'UniProt ID', tape_cols)

#Initialize lists to store mapped embeddings
mapped_chemberta = []
mapped_morgan = []
mapped_prostt5 = []
mapped_tape = []
drug_ids = []
protein_ids = []
skipped_pairs = 0

for drug_id, protein_id in zip(pairs_df['DrugBank ID'], pairs_df['UniProt ID']):
    # A pair is usable only if all four feature vectors are available
    if (
        drug_id not in chemberta_lookup
        or drug_id not in morgan_lookup
        or protein_id not in prostt5_lookup
        or protein_id not in tape_lookup
    ):
        skipped_pairs += 1
        continue

    mapped_chemberta.append(chemberta_lookup[drug_id])
    mapped_morgan.append(morgan_lookup[drug_id])
    mapped_prostt5.append(prostt5_lookup[protein_id])
    mapped_tape.append(tape_lookup[protein_id])
    drug_ids.append(drug_id)
    protein_ids.append(protein_id)

# Report dropped pairs instead of discarding them silently
if skipped_pairs:
    print(f"Skipped {skipped_pairs} drug-protein pairs with at least one missing embedding")

#Convert mapped embeddings to PyTorch tensors
chemberta_tensor = torch.tensor(np.array(mapped_chemberta), dtype=torch.float32)
morgan_tensor = torch.tensor(np.array(mapped_morgan), dtype=torch.float32)
prostt5_tensor = torch.tensor(np.array(mapped_prostt5), dtype=torch.float32)
tape_tensor = torch.tensor(np.array(mapped_tape), dtype=torch.float32)

#Initialize the AttentionCombiner and combine embeddings
# NOTE(review): the combiner is created here with freshly initialised weights and
# is used immediately; nothing in this section trains it or loads saved weights,
# so the saved values depend on the random initialisation -- confirm this is
# intended, or load a state_dict / fix the seed before this point.
combiner = AttentionCombiner(
    chemberta_dim=len(chemberta_cols),
    morgan_dim=len(morgan_cols),
    prostt5_dim=len(prostt5_cols),
    tape_dim=len(tape_cols)
)

# Apply the combiner in mini-batches rather than one row per call. Every row is
# processed independently, so this yields the same table with far fewer calls.
combine_batch_size = 4096
combined_batches = []
with torch.no_grad():
    for batch_start in range(0, chemberta_tensor.size(0), combine_batch_size):
        batch_stop = batch_start + combine_batch_size
        batch_output = combiner(
            chemberta_tensor[batch_start:batch_stop],
            morgan_tensor[batch_start:batch_stop],
            prostt5_tensor[batch_start:batch_stop],
            tape_tensor[batch_start:batch_stop],
        )
        combined_batches.append(batch_output.cpu().numpy())

# One row per mapped pair; stays an empty list when no pair could be mapped
combined_embeddings = np.concatenate(combined_batches, axis=0) if combined_batches else []

#Save combined embeddings with drug-protein pairs
combined_embeddings_df = pd.DataFrame(combined_embeddings)
combined_embeddings_df.insert(0, 'DrugBank ID', drug_ids)
combined_embeddings_df.insert(1, 'UniProt ID', protein_ids)

# Save the combined embeddings to a CSV file
combined_embeddings_file = os.path.join(working_dir, 'combined_drug_protein_embeddings.csv')
combined_embeddings_df.to_csv(combined_embeddings_file, index=False)

print(f"Combined embeddings saved to: {combined_embeddings_file}")