In [2]:
import dill
import numpy as np
import pandas as pd
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix
import torch
import torch.nn.functional as F

class Predictor:
    """Score SMILES strings with a pickled classifier and evaluate the predictions.

    The wrapped model is loaded from a dill pickle and switched to evaluation
    mode. Class ``1`` is reported as "active" and every other class as
    "inactive"; the evaluation metrics assume a binary 0/1 labelling.
    """

    def __init__(self, model_name, model_path):
        """Load the pickled model.

        Args:
            model_name: Label used as the prefix of the CSV files written by
                :meth:`predict_dataset`.
            model_path: Path of the dill pickle that contains the model.
        """
        self.model_name = model_name
        # SECURITY: dill/pickle deserialization executes arbitrary code stored in
        # the file. Only load model files that come from a trusted source.
        with open(model_path, "rb") as f:
            self.model = dill.load(f)
        self.model.eval()

    def _predict_distribution(self, smiles, tokenizer):
        """Run the model on a single SMILES string.

        Args:
            smiles: The SMILES string to score.
            tokenizer: Callable that turns ``smiles`` into the keyword
                arguments expected by the model.

        Returns:
            tuple: ``(predicted_class, probabilities)`` where
            ``predicted_class`` is the arg-max class index and
            ``probabilities`` is a 1-D tensor with the softmax probability of
            every class.
        """
        inputs = tokenizer(smiles, truncation=True, padding=True, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mapping-style outputs expose the logits by key; otherwise they are
        # taken from the first position of the returned sequence.
        logits = outputs["logits"] if "logits" in outputs else outputs[0]
        # .item() only succeeds for exactly one sample, so batched input still
        # fails loudly instead of silently using the first row.
        predicted_class = torch.argmax(logits, dim=1).item()
        probabilities = F.softmax(logits, dim=1)[0]
        return predicted_class, probabilities

    def predict_smiles(self, smiles, tokenizer):
        """Predict the class of one SMILES string.

        Args:
            smiles: The SMILES string to score.
            tokenizer: Tokenizer matching the wrapped model.

        Returns:
            tuple: ``(predicted_class, predicted_probability)`` - the arg-max
            class index and the softmax probability of that class.
        """
        predicted_class, probabilities = self._predict_distribution(smiles, tokenizer)
        return predicted_class, probabilities[predicted_class].item()

    def predict_dataset(self, dataset, tokenizer):
        """Predict every row of ``dataset`` and, when labels exist, evaluate them.

        Side effects: always writes ``<model_name>_Predictions_Results.csv``;
        writes ``<model_name>_External_metric_results.csv`` when at least one
        row carries a label.

        Args:
            dataset: DataFrame with a ``SMILES`` column and an optional
                ``label`` column (expected to hold 0/1 values).
            tokenizer: Tokenizer matching the wrapped model.

        Returns:
            tuple: ``(results, df_results)``. ``results`` has one row per
            input row. ``df_results`` is a long-format frame with the columns
            ``Metrics`` and ``Test_scores``; it is empty when no row is
            labelled. Metrics are computed on the labelled rows only.
        """
        records = []
        labels = []
        predicted_classes = []
        positive_scores = []

        for _, row in dataset.iterrows():
            smiles = row['SMILES']
            label = row.get("label", None)
            predicted_class, class_probabilities = self._predict_distribution(smiles, tokenizer)
            predicted_probability = class_probabilities[predicted_class].item()

            records.append({
                "SMILES": smiles,
                "Label": label,
                "Prediction": predicted_class,
                "Probability": round(predicted_probability, 3),
                "Activity_pred": "active" if predicted_class == 1 else "inactive"
            })

            labels.append(label)
            predicted_classes.append(predicted_class)
            # ROC AUC ranks samples by the score of the positive class; the
            # confidence in whichever class was predicted would rank confident
            # "inactive" calls as if they were confident "active" calls.
            positive_scores.append(class_probabilities[1].item())

        results = pd.DataFrame(records)
        results.to_csv('{}_Predictions_Results.csv'.format(self.model_name), index=False)

        # Metrics need ground truth: keep only the rows that carry a label.
        labelled = np.array([not pd.isna(label) for label in labels], dtype=bool)
        if not labelled.any():
            return results, pd.DataFrame(columns=['Metrics', 'Test_scores'])

        y_true = np.array([label for label, keep in zip(labels, labelled) if keep])
        y_pred = np.array(predicted_classes)[labelled]
        y_score = np.array(positive_scores)[labelled]

        # Fixing the label set keeps the matrix 2x2 even when one class is
        # absent from the data, so the four-way unpacking below cannot fail.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        tpr = tp / (tp + fn) if (tp + fn) > 0 else float("nan")  # True Positive Rate
        fpr = fp / (fp + tn) if (fp + tn) > 0 else float("nan")  # False Positive Rate
        metrics = {
            "Accuracy": [np.mean(y_true == y_pred)],
            "Balanced Accuracy": [balanced_accuracy_score(y_true, y_pred)],
            # ROC AUC is undefined when only one class is present.
            "ROC AUC": [roc_auc_score(y_true, y_score) if len(set(y_true)) > 1 else None],
            "MCC": [matthews_corrcoef(y_true, y_pred)],
            "Precision": [precision_score(y_true, y_pred, zero_division=0)],
            "Recall": [recall_score(y_true, y_pred, zero_division=0)],
            "F1-Score": [f1_score(y_true, y_pred, zero_division=0)],
            "TPR": [tpr],
            "FPR": [fpr],
        }

        # Reshape the single-row metrics table into (metric name, value) pairs.
        df_results = pd.DataFrame(metrics)
        df_results = pd.melt(df_results, var_name='Metrics', value_name='Test_scores')
        df_results.to_csv('{}_External_metric_results.csv'.format(self.model_name), index=False)

        return results, df_results


# ChemBERTa

In [3]:
from transformers import RobertaTokenizerFast

# Tokenizer checkpoint used for the ChemBERTa model.
tokenizer = RobertaTokenizerFast.from_pretrained(
    'seyonec/SMILES_tokenized_PubChem_shard00_160k'
)

# Data to score: replace the placeholder with a CSV that has a 'SMILES'
# column (and a 'label' column when metrics are wanted).
dataset = pd.read_csv("PATH_TO_YOUR_DATA.csv")

# Load the pickled classifier, then predict and evaluate the whole dataset.
predictor = Predictor(
    model_name="ChemBERTa",
    model_path='ChembBERTa_Trypanosomiasis_1:10.pkl',
)
predictor.predict_dataset(dataset=dataset, tokenizer=tokenizer)

# MolFormer

In [4]:
from transformers import AutoTokenizer

# Tokenizer checkpoint used for the MolFormer model; the checkpoint ships its
# own tokenizer code, hence trust_remote_code.
tokenizer = AutoTokenizer.from_pretrained(
    "ibm/MoLFormer-XL-both-10pct",
    trust_remote_code=True,
)

# Data to score: replace the placeholder with a CSV that has a 'SMILES'
# column (and a 'label' column when metrics are wanted).
dataset = pd.read_csv("PATH_TO_YOUR_DATA.csv")

# Load the pickled classifier, then predict and evaluate the whole dataset.
predictor = Predictor(
    model_name="MolFormer",
    model_path='MolFormer_COVID19_1:10.pkl',
)
predictor.predict_dataset(dataset=dataset, tokenizer=tokenizer)