In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Torch and Transformers
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

# RDKit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Utilities
from tqdm import tqdm


In [2]:
# Load the detection dataset.
# The file location can be overridden with the DETECTION_DATA_CSV environment
# variable so the notebook is runnable on machines other than the original
# author's; when the variable is unset the original path is used unchanged.
DATA_CSV_PATH = os.environ.get(
    "DETECTION_DATA_CSV",
    r"C:\Users\ronit\Desktop\project\Drug-detection-and-Innovation-\data\detection data.csv",
)
df = pd.read_csv(DATA_CSV_PATH)

# Function to convert SMILES to Morgan fingerprints
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    Args:
        smiles: SMILES text describing one molecule. Anything that is not a
            non-blank string (e.g. ``None`` or the NaN float pandas uses for
            an empty CSV cell) is treated as invalid.
        radius: Morgan radius passed to RDKit.
        n_bits: Length of the bit vector (RDKit's ``nBits``).

    Returns:
        The object returned by ``AllChem.GetMorganFingerprintAsBitVect``, or
        ``None`` when the input is missing/blank or RDKit cannot parse it.
    """
    # Missing cells reach us as NaN floats; reject them (and blank strings)
    # up front instead of letting the RDKit call fail on a non-string argument.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an unparsable SMILES by returning None.
        return None

    # NOTE(review): recent RDKit releases reportedly flag this call as
    # deprecated in favour of rdFingerprintGenerator — confirm before upgrading.
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

# Convert valid SMILES to fingerprints, remembering which rows were usable
fingerprints = []
valid_indices = []

for i, smi in enumerate(df['smiles']):
    fp = smiles_to_fingerprint(smi)
    if fp is not None:
        # The model needs a numeric feature row, so the bit vector is
        # converted to a numpy array here.
        fingerprints.append(np.array(fp))
        valid_indices.append(i)

#  Filter valid rows so df_valid lines up row-for-row with X
df_valid = df.iloc[valid_indices].reset_index(drop=True)
X = np.array(fingerprints)

#  Encode target labels
# NOTE(review): the recorded output of this cell shows accuracy 0.00 with a
# support of 1 for every listed class, which suggests 'Compound ID' is
# (nearly) unique per row. A classifier cannot generalise from one example
# per class — confirm whether a different target column was intended.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_valid['Compound ID'])

#  Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#  Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

#  Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

#  Restrict the report to the labels that actually occur in the test split so
#  that `target_names` has exactly one name per reported label.
labels_in_test = np.unique(y_test)
target_names = label_encoder.inverse_transform(labels_in_test)
report = classification_report(
    y_test,
    y_pred,
    labels=labels_in_test,
    target_names=target_names,
    # Classes that are never predicted have an undefined precision; report
    # them as 0 explicitly instead of emitting one warning per metric.
    zero_division=0,
)

#  Show results (printed once; the original cell repeated this block twice)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)




Accuracy: 0.00
Classification Report:
                                        precision    recall  f1-score   support

             1,1,1,2-Tetrachloroethane       0.00      0.00      0.00       1.0
          1,2,3,4-Tetrahydronapthalene       0.00      0.00      0.00       1.0
                1,2,3-Trichlorobenzene       0.00      0.00      0.00       1.0
                1,2,4-Trichlorobenzene       0.00      0.00      0.00       1.0
                 1,2,4-tribromobenzene       0.00      0.00      0.00       1.0
                     1,2-Dibromoethane       0.00      0.00      0.00       1.0
                   1,2-Propylene oxide       0.00      0.00      0.00       1.0
               1,3,5-Trimethylbenzene        0.00      0.00      0.00       1.0
                       1,4-Benzenediol       0.00      0.00      0.00       1.0
                    1,4-Dibromobenzene       0.00      0.00      0.00       1.0
                   1,4-Diethylbenzene        0.00      0.00      0.00       1.0
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

#  Recreate fingerprints as RDKit BitVects (not numpy arrays)
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    This name is also defined in an earlier cell; once this cell has run,
    this later definition is the one in effect.

    Args:
        smiles: SMILES text describing one molecule. Anything that is not a
            non-blank string (e.g. ``None`` or the NaN float pandas uses for
            an empty CSV cell) is treated as invalid.
        radius: Morgan radius passed to RDKit.
        n_bits: Length of the bit vector (RDKit's ``nBits``).

    Returns:
        The object returned by ``AllChem.GetMorganFingerprintAsBitVect``, or
        ``None`` when the input is missing/blank or RDKit cannot parse it.
    """
    # Reject missing/blank input before it reaches RDKit, which expects a
    # string argument.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an unparsable SMILES by returning None.
        return None

    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

#  Fingerprint every row once, pairing each result with its row position
_parsed_rows = [
    (row_pos, smiles_to_fingerprint(smiles_text))
    for row_pos, smiles_text in enumerate(df['smiles'])
]

#  Keep the RDKit bit vectors (and their row positions) for rows that parsed
valid_indices = [row_pos for row_pos, bitvect in _parsed_rows if bitvect is not None]
rdkit_fingerprints = [bitvect for _, bitvect in _parsed_rows if bitvect is not None]

# Restrict the dataset to those rows; after the index reset, position k in
# df_valid corresponds to rdkit_fingerprints[k]
df_valid = df.iloc[valid_indices].reset_index(drop=True)

#  Function to find top-N similar compounds
def find_similar_compounds(smiles_query, top_n=5):
    """Print the dataset compounds most similar to a query molecule.

    Similarity is the Tanimoto coefficient between the query's Morgan
    fingerprint and every entry of the module-level ``rdkit_fingerprints``
    list; names and SMILES are read from the matching rows of ``df_valid``.

    Args:
        smiles_query: SMILES string of the molecule to look up.
        top_n: Maximum number of matches to print. Values below zero are
            treated as zero.

    Returns:
        The string "❌ Invalid SMILES input." when the query cannot be turned
        into a fingerprint; otherwise ``None`` (results are printed).

    Raises:
        ValueError: if ``rdkit_fingerprints`` and ``df_valid`` have different
            lengths, i.e. the notebook state is stale or misaligned.
    """
    query_fp = smiles_to_fingerprint(smiles_query)
    if query_fp is None:
        return "❌ Invalid SMILES input."

    # Both globals are rebuilt together in the cell above; if they have
    # drifted apart the row/fingerprint pairing below would be wrong.
    if len(rdkit_fingerprints) != len(df_valid):
        raise ValueError(
            "rdkit_fingerprints and df_valid are out of sync; "
            "re-run the cell that builds them."
        )

    # One bulk call instead of a Python-level loop over every fingerprint.
    scores = DataStructs.BulkTanimotoSimilarity(query_fp, rdkit_fingerprints)
    similarities = list(zip(scores, df_valid['Compound ID'], df_valid['smiles']))

    # Stable sort: compounds with equal similarity keep their dataset order.
    similarities.sort(reverse=True, key=lambda x: x[0])

    # Clamp so a negative top_n cannot slice from the wrong end of the list.
    top_matches = similarities[:max(top_n, 0)]

    # Report how many matches are actually shown (may be fewer than top_n).
    print(f"\n Top {len(top_matches)} matches for SMILES: {smiles_query}")
    for rank, (sim, cid, smi) in enumerate(top_matches, start=1):
        print(f"{rank}. Compound ID: {cid} | SMILES: {smi} | Similarity: {sim:.4f}")

#  Demo query (thiophene); replace the string with any SMILES to explore the dataset
find_similar_compounds(smiles_query="c1ccsc1")



 Top 5 matches for SMILES: c1ccsc1
1. Compound ID: Thiophene | SMILES: c1ccsc1 | Similarity: 1.0000
2. Compound ID: Benzene  | SMILES: c1ccccc1 | Similarity: 0.2222
3. Compound ID: Dibenzothiophene | SMILES: c1ccc2c(c1)sc3ccccc23 | Similarity: 0.1579
4. Compound ID: aminothiazole | SMILES: Nc1nccs1  | Similarity: 0.1500
5. Compound ID: Pyridazine | SMILES: c1ccnnc1 | Similarity: 0.1429


