In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR



In [2]:
class SmilesDataset(Dataset):
    """Character-level dataset of SMILES strings for classification.

    Each SMILES string is converted to a list of character codes (``ord``),
    clipped to ``max_len`` characters and right-padded with ``0`` so that every
    sample has exactly ``max_len`` entries and batches can be stacked.

    Args:
        smiles: Sequence of SMILES strings, indexable by integer position.
        labels: Sequence of integer class labels aligned with ``smiles``.
        max_len: Fixed length of every encoded sample (default 100).

    Raises:
        ValueError: If ``smiles`` and ``labels`` differ in length, or if
            ``max_len`` is smaller than 1.
    """
    def __init__(self, smiles, labels, max_len=100):
        if len(smiles) != len(labels):
            raise ValueError(
                f"smiles and labels must have the same length, got {len(smiles)} and {len(labels)}"
            )
        if max_len < 1:
            raise ValueError(f"max_len must be a positive integer, got {max_len}")
        self.smiles = smiles
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return ``(codes, label)`` as two ``torch.long`` tensors.

        ``codes`` always has shape ``(max_len,)``.
        """
        # Clip first: without this, a string longer than max_len produced a
        # longer tensor (``[0] * negative`` is an empty list), which breaks
        # batch stacking and any model with a fixed positional table.
        smile = self.smiles[idx][:self.max_len]
        codes = [ord(c) for c in smile]
        codes.extend([0] * (self.max_len - len(codes)))
        return torch.tensor(codes, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)



In [3]:
class CustomTransformer(nn.Module):
    """Transformer encoder classifier built from the notebook's own blocks.

    Pipeline: token embedding + learned positional table -> ``num_layers``
    ``TransformerBlock`` layers -> mean over the sequence axis -> linear head.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of the token / positional embeddings.
        num_heads: Number of attention heads passed to each block.
        ff_dim: Hidden width of each block's feed-forward network.
        num_layers: Number of stacked ``TransformerBlock`` layers.
        num_classes: Size of the output logit vector.
        max_len: Longest sequence the positional table can cover. Defaults to
            100, the value that was previously hard-coded.
    """
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers, num_classes, max_len=100):
        super(CustomTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Learned positional table, initialised to zeros and trained with the model.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, embed_dim))

        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim) for _ in range(num_layers)
        ])

        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Map integer token ids of shape ``(batch, seq_len)`` to logits of shape ``(batch, num_classes)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional table length.
        """
        seq_len = x.shape[1]
        # Read the limit from the parameter itself so it stays correct even
        # when weights of a different length are loaded from a checkpoint.
        table_len = self.positional_encoding.shape[1]
        if seq_len > table_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional table length {table_len}; "
                "truncate the input or build the model with a larger max_len."
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        for layer in self.layers:
            x = layer(x)
        # Global average pooling over the sequence axis. Every position,
        # including any padding the caller added, contributes to the mean.
        x = x.mean(dim=1)
        x = self.fc(x)
        return x



In [4]:
class TransformerBlock(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer is wrapped in a residual connection, and LayerNorm is
    applied to the sum (normalisation after the residual add).
    """
    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attention = MultiHeadSelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ffn = FeedForwardNetwork(embed_dim, ff_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Apply attention and feed-forward sub-layers; the output has the same shape as ``x``."""
        # Sub-layer 1: residual around self-attention, then normalise.
        attended = self.norm1(x + self.attention(x))
        # Sub-layer 2: residual around the feed-forward network, then normalise.
        return self.norm2(attended + self.ffn(attended))



In [5]:
class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product multi-head self-attention.

    Args:
        embed_dim: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``embed_dim`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim``.
    """
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        # Fail fast: with a non-divisible pair the floor division below would
        # silently shrink head_dim and forward() would die in ``.view`` with an
        # unrelated-looking shape error.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.head_dim = embed_dim // num_heads

        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.fc_out = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Attend over the sequence axis of ``x`` with shape ``(batch, seq, embed_dim)``.

        Returns a tensor of the same shape. No attention mask is applied, so
        every position attends to every other position.
        """
        batch_size, seq_length, embed_dim = x.shape

        def split_heads(t):
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return t.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        Q = split_heads(self.query(x))
        K = split_heads(self.key(x))
        V = split_heads(self.value(x))

        # Scale by sqrt(head_dim) before the softmax over the key axis.
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = torch.nn.functional.softmax(attn_scores, dim=-1)
        # Merge heads back: (batch, heads, seq, head_dim) -> (batch, seq, embed)
        attn_output = torch.matmul(attn_weights, V).transpose(1, 2).contiguous().view(batch_size, seq_length, embed_dim)
        return self.fc_out(attn_output)



In [6]:
class FeedForwardNetwork(nn.Module):
    """Two-layer position-wise MLP: ``embed_dim -> ff_dim -> embed_dim`` with ReLU in between."""
    def __init__(self, embed_dim, ff_dim):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, ff_dim)
        self.fc2 = nn.Linear(ff_dim, embed_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``ff_dim``, apply ReLU, and project back to ``embed_dim``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)



In [7]:
def main(dataset_path=r"C:\Users\ronit\Desktop\project\self\ESOL_delaney-processed.csv", num_epochs=20, seed=42):
    """Train the character-level ``CustomTransformer`` on a SMILES CSV, then predict interactively.

    The CSV must contain a ``smiles`` column (model input) and a
    ``Compound ID`` column (label-encoded and used as the class target).

    Args:
        dataset_path: Location of the CSV file. The default is the original
            author's local path; pass your own path when running elsewhere.
        num_epochs: Number of training epochs (default 20).
        seed: Seed for the torch RNG so weight init and shuffling are repeatable.

    Raises:
        FileNotFoundError: If ``dataset_path`` does not point to a file.
        ValueError: If the SMILES typed at the prompt contains a character
            whose code is outside the model vocabulary.
    """
    from pathlib import Path
    import warnings

    dataset_file = Path(dataset_path)
    if not dataset_file.is_file():
        raise FileNotFoundError(
            f"Dataset not found at '{dataset_file}'. Pass the CSV location via main(dataset_path=...)."
        )

    torch.manual_seed(seed)

    dataset = pd.read_csv(dataset_file)
    dataset = dataset.dropna().reset_index(drop=True)  # Cleaning: Remove NaN values

    label_encoder = LabelEncoder()
    dataset['label'] = label_encoder.fit_transform(dataset['Compound ID'])
    num_classes = len(label_encoder.classes_)
    # NOTE(review): if 'Compound ID' is a per-row identifier, every class has a
    # single example and no held-out label is ever seen in training. Confirm
    # that this is the intended target column.
    if num_classes == len(dataset):
        warnings.warn(
            "Every row has a distinct 'Compound ID': each class has a single sample, "
            "so held-out labels are never seen during training and test accuracy is not meaningful."
        )

    max_len = 100  # Maximum SMILES string length
    vocab_size = 128  # Assuming ASCII characters for SMILES representation

    # Clip over-long strings up front so every encoded sample fits the model's
    # positional table regardless of how the dataset class pads.
    smiles_column = dataset['smiles'].str.slice(0, max_len)

    # Splitting dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(smiles_column, dataset['label'], test_size=0.2, random_state=42)

    train_dataset = SmilesDataset(X_train.tolist(), y_train.tolist(), max_len)
    test_dataset = SmilesDataset(X_test.tolist(), y_test.tolist(), max_len)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CustomTransformer(vocab_size, embed_dim=256, num_heads=8, ff_dim=512, num_layers=4, num_classes=num_classes)
    model.to(device)

    optimizer = optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-5)  # AdamW with weight decay for better generalization
    scheduler = StepLR(optimizer, step_size=5, gamma=0.5)  # Halve the learning rate every 5 epochs
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for smiles, labels in train_loader:
            smiles = smiles.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(smiles)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        scheduler.step()  # Update learning rate scheduler

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

        # Held-out accuracy after each epoch to track model performance
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for smiles, labels in test_loader:
                smiles = smiles.to(device)
                labels = labels.to(device)
                outputs = model(smiles)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f"Epoch {epoch + 1}, Test Accuracy: {100 * correct / total:.2f}%")

    input_smiles = input("Enter SMILES string: ").strip()
    model.eval()
    # Encode exactly like the training data: clip to max_len, then zero-pad.
    # Without the clip an over-long string crashed inside the model.
    codes = [ord(c) for c in input_smiles[:max_len]]
    if any(code >= vocab_size for code in codes):
        raise ValueError(
            f"SMILES string contains characters outside the {vocab_size}-symbol ASCII vocabulary"
        )
    input_smiles_padded = codes + [0] * (max_len - len(codes))
    inputs = torch.tensor([input_smiles_padded], dtype=torch.long).to(device)
    with torch.no_grad():
        outputs = model(inputs)
        predicted_label = torch.argmax(outputs, dim=1).item()
    print(f"Predicted Compound ID: {label_encoder.inverse_transform([predicted_label])[0]}")

if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ronit\\Desktop\\project\\self\\ESOL_delaney-processed.csv'

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the ChemBERTa tokenizer once at module level. It is read as a global by
# SmilesDataset.__getitem__ and by main() when encoding the user-supplied SMILES.
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

# Custom Dataset
class SmilesDataset(Dataset):
    """Dataset that tokenizes SMILES strings with the module-level ``tokenizer``.

    Note: this class shares its name with the character-level dataset defined
    earlier in the notebook; running this cell rebinds ``SmilesDataset``.

    Args:
        smiles: Sequence of SMILES strings, indexable by integer position.
        labels: Sequence of integer class labels aligned with ``smiles``.
        max_length: Fixed token length every sample is padded / truncated to.
            Defaults to 100, the value that was previously hard-coded.

    Raises:
        ValueError: If ``smiles`` and ``labels`` differ in length.
    """
    def __init__(self, smiles, labels, max_length=100):
        if len(smiles) != len(labels):
            raise ValueError(
                f"smiles and labels must have the same length, got {len(smiles)} and {len(labels)}"
            )
        self.smiles = smiles
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for sample ``idx``."""
        encoding = tokenizer(
            self.smiles[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack samples into (batch, max_length).
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return input_ids, attention_mask, torch.tensor(self.labels[idx], dtype=torch.long)

# Load Data
def load_data(file_path):
    """Read the CSV at ``file_path`` and attach an integer ``label`` column.

    Rows containing any missing value are dropped and the index is reset.
    ``label`` is the label-encoded ``Compound ID`` column.

    Returns:
        Tuple ``(dataset, label_encoder)``: the cleaned DataFrame and the
        fitted ``LabelEncoder`` (needed to map predictions back to IDs).
    """
    frame = pd.read_csv(file_path)
    frame = frame.dropna().reset_index(drop=True)

    encoder = LabelEncoder()
    compound_ids = frame['Compound ID']
    encoder.fit(compound_ids)
    frame['label'] = encoder.transform(compound_ids)
    return frame, encoder

# Train Model
def train_model(model, train_loader, optimizer, loss_fn, device):
    """Run one training epoch and return the mean loss per batch.

    Each batch from ``train_loader`` must unpack into
    ``(input_ids, attention_mask, labels)``; the model's output is expected to
    expose a ``.logits`` attribute.
    """
    model.train()
    batch_losses = []
    for batch in train_loader:
        # Move the three batch tensors to the target device in order.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits
        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(train_loader)

# Evaluate Model
def evaluate_model(model, test_loader, device):
    """Return classification accuracy (in percent) of ``model`` on ``test_loader``.

    Each batch must unpack into ``(input_ids, attention_mask, labels)``; the
    model's output is expected to expose a ``.logits`` attribute. The model is
    switched to eval mode and gradients are disabled during the pass.

    Returns:
        Accuracy in the range 0-100, or ``0.0`` when the loader yields no
        samples (previously this raised ``ZeroDivisionError``).
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask).logits
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        # Nothing to score: report 0% instead of dividing by zero.
        return 0.0
    return 100 * correct / total

# Main Function
def main(dataset_path="C:/Users/ronit/Desktop/project/self/Drug-detection-and-Innovation-/data/detection data.csv", num_epochs=10):
    """Fine-tune ChemBERTa as a sequence classifier on a SMILES CSV, then predict interactively.

    The CSV must contain a ``smiles`` column (model input) and a
    ``Compound ID`` column (label-encoded and used as the class target).

    Args:
        dataset_path: Location of the CSV file. The default is the original
            author's local path; pass your own path when running elsewhere.
        num_epochs: Number of fine-tuning epochs (default 10).

    Raises:
        FileNotFoundError: If ``dataset_path`` does not point to a file.
    """
    from pathlib import Path
    import warnings

    model_name = "seyonec/ChemBERTa-zinc-base-v1"

    dataset_file = Path(dataset_path)
    if not dataset_file.is_file():
        raise FileNotFoundError(
            f"Dataset not found at '{dataset_file}'. Pass the CSV location via main(dataset_path=...)."
        )

    dataset, label_encoder = load_data(dataset_file)
    num_classes = len(label_encoder.classes_)
    # NOTE(review): the recorded run of this cell reports 0.00% test accuracy,
    # which is what happens when 'Compound ID' is unique per row: held-out
    # labels never appear in training. Confirm the intended target column.
    if num_classes == len(dataset):
        warnings.warn(
            "Every row has a distinct 'Compound ID': each class has a single sample, "
            "so held-out labels are never seen during training and test accuracy is not meaningful."
        )

    X_train, X_test, y_train, y_test = train_test_split(dataset['smiles'], dataset['label'], test_size=0.2, random_state=42)
    train_dataset = SmilesDataset(X_train.tolist(), y_train.tolist())
    test_dataset = SmilesDataset(X_test.tolist(), y_test.tolist())

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The classification head is newly initialised on top of the pretrained encoder.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=1e-5)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
        test_accuracy = evaluate_model(model, test_loader, device)
        print(f"Epoch {epoch+1}, Loss: {train_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

    # Prediction: tokenize the user's SMILES the same way as the training samples.
    input_smiles = input("Enter SMILES string: ").strip()
    encoding = tokenizer(input_smiles, padding='max_length', truncation=True, max_length=100, return_tensors='pt')
    input_ids, attention_mask = encoding['input_ids'].to(device), encoding['attention_mask'].to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask).logits
        predicted_label = torch.argmax(outputs, dim=1).item()
    print(f"Predicted Compound ID: {label_encoder.inverse_transform([predicted_label])[0]}")

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/501 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/9.43k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/179M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/179M [00:00<?, ?B/s]

Epoch 1, Loss: 7.0779, Test Accuracy: 0.00%
Epoch 2, Loss: 6.9888, Test Accuracy: 0.00%
Epoch 3, Loss: 6.8945, Test Accuracy: 0.00%
Epoch 4, Loss: 6.8266, Test Accuracy: 0.00%
