In [None]:
! pip install pandas transformers datasets sklearn

<h5>Existing molecule processing:</h5>

Three labels:

In [None]:
import pandas as pd
import random
import os


# Read the ChEMBL export and keep only the 'Smiles', 'Comment' and 'IC50' columns.
df = pd.read_csv('CHembl training set.csv').loc[:, ['Smiles', 'Comment', 'IC50']]

def categorize_activity(value):
    """Map an activity comment onto a three-class integer label.

    Returns 2 for "very active", 1 for "moderately active" and 0 for any
    other value (including missing comments).
    """
    if value == "very active":
        return 2
    return 1 if value == "moderately active" else 0

# Derive the integer label from the activity comment, rename the SMILES column to
# 'text' and drop the raw comment. Chaining on a new frame avoids in-place edits on
# the column slice taken above.
df = (
    df.assign(label=df['Comment'].apply(categorize_activity))
      .rename(columns={'Smiles': 'text'})
      .drop(columns=['Comment'])
)

os.makedirs('./traindata', exist_ok=True)

# Full table, IC50 included.
df.to_csv('traindata/train_with_IC50.csv', index=False)

# Keep only the model inputs and shuffle with a fixed seed.
df = df[['text', 'label']].sample(frac=1, random_state=42).reset_index(drop=True)

# Class counts (labels 2, 1, 0) before de-duplication. Series.get reports 0 for an
# absent class instead of raising KeyError.
label_counts = df['label'].value_counts()
for label in (2, 1, 0):
    print(label_counts.get(label, 0))

df.to_csv('traindata/train_withduplicates.csv', index=False)

# Keep the first occurrence of every SMILES string.
df = df.drop_duplicates(subset=["text"], keep="first")

# Class counts after de-duplication.
label_counts = df['label'].value_counts()
for label in (2, 1, 0):
    print(label_counts.get(label, 0))

df.to_csv('traindata/train_shuffled.csv', index=False)

# Method for dividing the data into three datasets for training, testing, and validation
def split_dataset(df, train_frac=0.8, test_frac=0.1, seed=None):
    """Stratified split of a three-label frame into train / test / validation rows.

    Rows are grouped by ``df['label']`` (classes 2, 1, 0, in that order). Each group is
    shuffled and sliced: the first ``int(n * train_frac)`` rows go to train, the next
    ``int(n * test_frac)`` rows to test, and whatever remains to validation.

    Args:
        df: DataFrame with a ``label`` column holding 0, 1 or 2.
        train_frac: Fraction of each class placed in the training split (default 0.8).
        test_frac: Fraction of each class placed in the test split (default 0.1).
        seed: Optional seed. When given, a private ``random.Random(seed)`` does the
            shuffling so the split is reproducible; when ``None`` the module-level
            ``random`` generator is used, as before.

    Returns:
        ``(train_samples, test_samples, val_samples)``: three lists of row lists, in
        the column order of ``df``.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or test_frac < 0 or train_frac + test_frac > 1:
        raise ValueError("train_frac and test_frac must be non-negative and sum to at most 1")

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    train_samples, test_samples, val_samples = [], [], []
    for label in (2, 1, 0):
        samples = df[df['label'] == label].values.tolist()
        shuffle(samples)

        train_size = int(len(samples) * train_frac)
        test_size = int(len(samples) * test_frac)

        train_samples.extend(samples[:train_size])
        test_samples.extend(samples[train_size:train_size + test_size])
        # The remainder absorbs any rounding loss from the two int() truncations.
        val_samples.extend(samples[train_size + test_size:])

    return train_samples, test_samples, val_samples


# Build two groups of splits (traindata0/ and traindata1/), each holding three
# independent random train/test/validation splits of the de-duplicated data.
# NOTE: the two-label cell further down rebinds split_dataset to a function that
# returns two values; re-run this cell's definitions first if that cell has been executed.
for j in range(2):
    os.makedirs(f'./traindata{j}', exist_ok=True)

    for i in range(3):
        train_samples, test_samples, val_samples = split_dataset(df)

        # Shuffle, save and summarise each split in turn (train, test, val).
        split_frames = {}
        for split_name, samples in (('train', train_samples),
                                    ('test', test_samples),
                                    ('val', val_samples)):
            random.shuffle(samples)
            split_df = pd.DataFrame(samples, columns=df.columns)
            split_df.to_csv(f'traindata{j}/{split_name}_data_{i}.csv', index=False)

            # Series.get prints 0 for a class that is missing from a small split
            # instead of raising KeyError.
            label_counts = split_df['label'].value_counts()
            for label in (2, 1, 0):
                print(label_counts.get(label, 0))

            split_frames[split_name] = split_df

        # Keep the frame names the original loop left behind.
        train_df = split_frames['train']
        test_df = split_frames['test']
        val_df = split_frames['val']


def split_final_dataset(df):
    """Stratified 80/20 train/validation split of a three-label frame.

    Each class (2, 1, 0, in that order) is extracted as a list of rows and shuffled
    with the module-level ``random`` generator. The first ``int(n * 0.8)`` rows of
    each class go to the training list and the rest to the validation list.

    Returns:
        ``(train_samples, val_samples)``: lists of row lists in ``df`` column order.
    """
    # One row list per class, in label order 2, 1, 0.
    rows_by_class = [df[df['label'] == label].values.tolist() for label in (2, 1, 0)]

    # Shuffle every class before slicing.
    for class_rows in rows_by_class:
        random.shuffle(class_rows)

    train_samples, val_samples = [], []
    for class_rows in rows_by_class:
        cut = int(len(class_rows) * 0.8)
        train_samples += class_rows[:cut]
        val_samples += class_rows[cut:]

    return train_samples, val_samples


# Final split: one stratified train/validation split of the de-duplicated data.
train_samples, val_samples = split_final_dataset(df)

# Write the selected samples to separate CSV files
random.shuffle(train_samples)
train_df = pd.DataFrame(train_samples, columns=df.columns)
train_df.to_csv('traindata/train_data.csv', index=False)

# Series.get prints 0 for an absent class instead of raising KeyError.
train_counts = train_df['label'].value_counts()
for label in (2, 1, 0):
    print(train_counts.get(label, 0))

random.shuffle(val_samples)
val_df = pd.DataFrame(val_samples, columns=df.columns)
val_df.to_csv('traindata/val_data.csv', index=False)

val_counts = val_df['label'].value_counts()
for label in (2, 1, 0):
    print(val_counts.get(label, 0))

Two labels:

In [None]:
import pandas as pd
import random


# Read the ChEMBL export and keep only the 'Smiles', 'Comment' and 'IC50' columns.
df = pd.read_csv('CHembl training set.csv').loc[:, ['Smiles', 'Comment', 'IC50']]

def categorize_activity(value):
    """Map an activity comment onto a binary label: 0 for "inactive", 1 otherwise.

    NOTE(review): every value other than the exact string "inactive", including a
    missing comment, is labelled 1 here, whereas the three-label variant earlier in
    this notebook labels unrecognised comments 0. Confirm the asymmetry is intended.
    """
    return 0 if value == "inactive" else 1

import os  # local import: this cell's own import list does not include os

# Create the label column from comments, rename the SMILES column to 'text' and drop
# the raw comment. Chaining on a new frame avoids in-place edits on a column slice.
df = (
    df.assign(label=df['Comment'].apply(categorize_activity))
      .rename(columns={'Smiles': 'text'})
      .drop(columns=['Comment'])
)

# The folder is otherwise only created by the three-label cell above.
os.makedirs('traindata', exist_ok=True)
df.to_csv('traindata/train2_with_IC50.csv', index=False)

# Keep only the model inputs and shuffle with a fixed seed.
df = df[['text', 'label']].sample(frac=1, random_state=42).reset_index(drop=True)
df.to_csv('traindata/train2_withduplicates.csv', index=False)

# Keep the first occurrence of every SMILES string.
df = df.drop_duplicates(subset=["text"], keep="first")

# Class counts (labels 1, 0); Series.get prints 0 for an absent class instead of raising KeyError.
label_counts = df['label'].value_counts()
for label in (1, 0):
    print(label_counts.get(label, 0))

df.to_csv('traindata/train2_shuffled.csv', index=False)

def split_dataset(df):
    """Stratified 80/20 train/validation split of a two-label frame.

    The active (label 1) and inactive (label 0) rows are extracted in that order and
    shuffled with the module-level ``random`` generator. The first ``int(n * 0.8)``
    rows of each class go to the training list and the rest to the validation list.

    NOTE: this definition shares its name with the three-way ``split_dataset`` of the
    three-label cell and replaces it once this cell has run.

    Returns:
        ``(train_samples, val_samples)``: lists of row lists in ``df`` column order.
    """
    # One row list per class, in label order 1, 0.
    rows_by_class = [df[df['label'] == label].values.tolist() for label in (1, 0)]

    # Shuffle every class before slicing.
    for class_rows in rows_by_class:
        random.shuffle(class_rows)

    train_samples, val_samples = [], []
    for class_rows in rows_by_class:
        cut = int(len(class_rows) * 0.8)
        train_samples += class_rows[:cut]
        val_samples += class_rows[cut:]

    return train_samples, val_samples


# Split the de-duplicated two-label data into stratified train/validation lists.
train_samples, val_samples = split_dataset(df)

# Write the selected samples to separate CSV files
random.shuffle(train_samples)
train_df = pd.DataFrame(train_samples, columns=df.columns)
train_df.to_csv('traindata/train_data2.csv', index=False)

# Series.get prints 0 for an absent class instead of raising KeyError.
train_counts = train_df['label'].value_counts()
for label in (1, 0):
    print(train_counts.get(label, 0))

random.shuffle(val_samples)
val_df = pd.DataFrame(val_samples, columns=df.columns)
val_df.to_csv('traindata/val_data2.csv', index=False)

val_counts = val_df['label'].value_counts()
for label in (1, 0):
    print(val_counts.get(label, 0))

<h5>Generated molecule processing:</h5>

Three labels:

In [None]:
import pandas as pd
import random
import os


# Load the generated molecules, then remove invalid molecules: rows whose affinity
# upper bound equals 0.
df = pd.read_excel('generatedmol.xlsx')
df = df.loc[df['Upper boundary for estimated affinity [nM]'].ne(0)]

# Define a function to categorize activity
def categorize_activity(value):
    """Map an affinity value onto a three-class label.

    Returns 2 when ``value < 250``, 1 when ``250 <= value <= 5000`` and 0 otherwise
    (values above 5000, and NaN, for which both comparisons are false).
    """
    if value < 250:
        return 2
    return 1 if 250 <= value <= 5000 else 0
    

# Create the label column based on estimated affinity values and keep only the SMILES
# ('text'), the label and the affinity (as 'IC50'). Building a new frame avoids writing
# into the filtered slice produced above.
affinity = df['Upper boundary for estimated affinity [nM]']
df = pd.DataFrame({
    'text': df['USMILES'],
    'label': affinity.apply(categorize_activity),
    'IC50': affinity,
})

# Class counts (labels 2, 1, 0) before de-duplication; Series.get prints 0 for an
# absent class instead of raising KeyError.
label_counts = df['label'].value_counts()
for label in (2, 1, 0):
    print(label_counts.get(label, 0))

# Keep the first occurrence of every SMILES string.
df = df.drop_duplicates(subset=["text"], keep="first")

os.makedirs('./testdata', exist_ok=True)

df.to_csv('testdata/testset_IC50.csv', index=False)

# Drop unnecessary columns
df = df[['text', 'label']]

# Class counts after de-duplication.
label_counts = df['label'].value_counts()
for label in (2, 1, 0):
    print(label_counts.get(label, 0))

df.to_csv('testdata/testset_noduplicates.csv', index=False)

# Row lists per class, in the column order of df.
very_active_samples = df[df['label'] == 2].values.tolist()
moderately_active_samples = df[df['label'] == 1].values.tolist()
inactive_samples = df[df['label'] == 0].values.tolist()

# Down-sample the inactive class to the number of active molecules. The size is capped
# at the available count because random.sample raises ValueError when asked for more
# items than the population holds.
n_inactive = min(len(inactive_samples),
                 len(very_active_samples) + len(moderately_active_samples))
inactive_samples = random.sample(inactive_samples, n_inactive)

random.shuffle(very_active_samples)
random.shuffle(moderately_active_samples)
random.shuffle(inactive_samples)

test_samples = []
val_samples = []
all_samples = []

# Halve every class: first half to test, second half to validation, everything to the full set.
for samples in [very_active_samples, moderately_active_samples, inactive_samples]:
    size = int(len(samples) * 0.5)

    test_samples.extend(samples[:size])
    val_samples.extend(samples[size:])
    all_samples.extend(samples)


random.shuffle(all_samples)
fullset_df = pd.DataFrame(all_samples, columns=df.columns)
fullset_df.to_csv('testdata/all_synthesized.csv', index=False)

# Series.get prints 0 for an absent class instead of raising KeyError.
fullset_counts = fullset_df['label'].value_counts()
for label in (2, 1, 0):
    print(fullset_counts.get(label, 0))

random.shuffle(test_samples)
test_df = pd.DataFrame(test_samples, columns=df.columns)
test_df.to_csv('testdata/testdata.csv', index=False)

test_counts = test_df['label'].value_counts()
for label in (2, 1, 0):
    print(test_counts.get(label, 0))

random.shuffle(val_samples)
val_df = pd.DataFrame(val_samples, columns=df.columns)
val_df.to_csv('testdata/valdata.csv', index=False)

val_counts = val_df['label'].value_counts()
for label in (2, 1, 0):
    print(val_counts.get(label, 0))

Two labels:

In [None]:
import pandas as pd
import random


# Load the generated molecules, then remove invalid molecules: rows whose affinity
# upper bound equals 0.
df = pd.read_excel('generatedmol.xlsx')
df = df.loc[df['Upper boundary for estimated affinity [nM]'].ne(0)]

# Define a function to categorize activity
def categorize_activity(value):
    """Map an affinity value onto a binary label: 1 (active) or 0 (inactive).

    A value is active when ``value <= 5000``. This matches the three-label variant in
    this notebook, where 5000 still falls in the "moderately active" band and NaN falls
    through to the inactive class. The earlier ``value >= 5000`` test labelled exactly
    5000 as inactive and NaN as active, so the two labelings disagreed on those rows.
    """
    if value <= 5000:
        return 1
    # Values above 5000 and NaN (for which the comparison is false) are inactive.
    return 0


import os  # local import: this cell's own import list does not include os

# Create the label column based on estimated affinity values and keep only the SMILES
# ('text'), the label and the affinity (as 'IC50'). Building a new frame avoids writing
# into the filtered slice produced above.
affinity = df['Upper boundary for estimated affinity [nM]']
df = pd.DataFrame({
    'text': df['USMILES'],
    'label': affinity.apply(categorize_activity),
    'IC50': affinity,
})

# Class counts (labels 1, 0) before de-duplication; Series.get prints 0 for an absent
# class instead of raising KeyError.
label_counts = df['label'].value_counts()
for label in (1, 0):
    print(label_counts.get(label, 0))

# Keep the first occurrence of every SMILES string.
df = df.drop_duplicates(subset=["text"], keep="first")

# The folder is otherwise only created by the three-label cell above.
os.makedirs('testdata', exist_ok=True)
df.to_csv('testdata/testset2_IC50.csv', index=False)

# Drop unnecessary columns
df = df[['text', 'label']]

# Class counts after de-duplication.
label_counts = df['label'].value_counts()
for label in (1, 0):
    print(label_counts.get(label, 0))

# Save the modified DataFrame back to a csv file
df.to_csv('testdata/testset2_noduplicates.csv', index=False)

# Row lists per class, in the column order of df.
active_samples = df[df['label'] == 1].values.tolist()
inactive_samples = df[df['label'] == 0].values.tolist()

# Down-sample the inactive class to the number of active molecules. The size is capped
# at the available count because random.sample raises ValueError when asked for more
# items than the population holds.
n_inactive = min(len(inactive_samples), len(active_samples))
inactive_samples = random.sample(inactive_samples, n_inactive)

random.shuffle(active_samples)
random.shuffle(inactive_samples)

test_samples = []
val_samples = []
all_samples = []

# Halve every class: first half to test, second half to validation, everything to the full set.
for samples in [active_samples, inactive_samples]:
    size = int(len(samples) * 0.5)

    test_samples.extend(samples[:size])
    val_samples.extend(samples[size:])
    all_samples.extend(samples)


random.shuffle(all_samples)
fullset_df = pd.DataFrame(all_samples, columns=df.columns)
fullset_df.to_csv('testdata/all_synthesized2.csv', index=False)

# Series.get prints 0 for an absent class instead of raising KeyError.
fullset_counts = fullset_df['label'].value_counts()
for label in (1, 0):
    print(fullset_counts.get(label, 0))

random.shuffle(test_samples)
test_df = pd.DataFrame(test_samples, columns=df.columns)
test_df.to_csv('testdata/testdata2.csv', index=False)

test_counts = test_df['label'].value_counts()
for label in (1, 0):
    print(test_counts.get(label, 0))

random.shuffle(val_samples)
val_df = pd.DataFrame(val_samples, columns=df.columns)
val_df.to_csv('testdata/valdata2.csv', index=False)

val_counts = val_df['label'].value_counts()
for label in (1, 0):
    print(val_counts.get(label, 0))

<h5>Confirmed generated inhibitor data processing:</h5>

Three labels:

In [None]:
import pandas as pd


# Load the confirmed generated inhibitors from the Excel workbook.
df = pd.read_excel('goodmol.xlsx')

# Define a function to categorize activity
def categorize_activity(value):
    """Map an affinity value onto a three-class label.

    Returns 2 when ``value < 250``, 1 when ``250 <= value <= 5000`` and 0 otherwise
    (values above 5000, and NaN, for which both comparisons are false).
    """
    if value < 250:
        return 2
    return 1 if 250 <= value <= 5000 else 0

# Label every molecule from its affinity upper bound and assemble the output columns:
# SMILES as 'text', the class label, and the affinity itself as 'IC50'.
affinity = df['Upper boundary for estimated affinity [nM]']
labelled = pd.DataFrame({
    'text': df['USMILES'],
    'label': affinity.apply(categorize_activity),
    'IC50': affinity,
})

# Version with the IC50 column.
labelled.to_csv('mols_IC50.csv', index=False)

# Version with only the model inputs.
df = labelled[['text', 'label']]
df.to_csv('mols.csv', index=False)

Two labels:

In [None]:
import pandas as pd


# Load the confirmed generated inhibitors from the Excel workbook.
df = pd.read_excel('goodmol.xlsx')

# Define a function to categorize activity
def categorize_activity(value):
    """Map an affinity value onto a binary label: 1 (active) or 0 (inactive).

    A value is active when ``value <= 5000``. This matches the three-label variant in
    this notebook, where 5000 still falls in the "moderately active" band and NaN falls
    through to the inactive class. The earlier ``value >= 5000`` test labelled exactly
    5000 as inactive and NaN as active, so the two labelings disagreed on those rows.
    """
    if value <= 5000:
        return 1
    # Values above 5000 and NaN (for which the comparison is false) are inactive.
    return 0

# Label every molecule from its affinity upper bound and assemble the output columns:
# SMILES as 'text', the class label, and the affinity itself as 'IC50'.
affinity = df['Upper boundary for estimated affinity [nM]']
labelled = pd.DataFrame({
    'text': df['USMILES'],
    'label': affinity.apply(categorize_activity),
    'IC50': affinity,
})

# Version with the IC50 column.
labelled.to_csv('mols2_IC50.csv', index=False)

# Version with only the model inputs.
df = labelled[['text', 'label']]
df.to_csv('mols2.csv', index=False)

<h5>Initial model variation selection training:</h5>

1. Choose pre-trained model for comparing the pre-training objectives...

In [None]:
# Checkpoint fine-tuned in this run. To compare another pre-training objective,
# comment this line out and enable exactly one of the alternatives below.
model_name = "DeepChem/ChemBERTa-5M-MLM"
# model_name = "DeepChem/ChemBERTa-5M-MTR"
# model_name = "UdS-LSV/smole-bert-mtr"
# model_name = "UdS-LSV/smole-bert"

... or comparing dataset variations:

In [None]:
# Exactly one checkpoint should be active per run. Previously all six assignments were
# live, so only the last one ("DeepChem/ChemBERTa-77M-MLM") ever took effect; it stays
# the active choice here. Enable a different line to compare another pre-training set.
# model_name = "seyonec/ChemBERTa-zinc-base-v1"
# model_name = "seyonec/ChemBERTa-zinc250k-v1"
# model_name = "seyonec/PubChem10M_SMILES_BPE_60k"

# model_name = "DeepChem/ChemBERTa-5M-MLM"
# model_name = "DeepChem/ChemBERTa-10M-MLM"
model_name = "DeepChem/ChemBERTa-77M-MLM"

2. Define the training, validation, and testing datasets used, as well as a run name for clarity:

In [None]:
import pandas as pd

run_name = "model 1 dataset 1 run 1"

# Index (0, 1 or 2) of the pre-generated split inside traindata0/ used for this run.
split_index = 0

train_df = pd.read_csv(f"traindata0/train_data_{split_index}.csv")
test_df = pd.read_csv(f"traindata0/test_data_{split_index}.csv")
val_df = pd.read_csv(f"traindata0/val_data_{split_index}.csv")

3. Tokenise the data and fine-tune the model:

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

# Tokenization
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_smiles(example):
    """Tokenize the 'text' field of a batch with truncation and padding enabled."""
    smiles = example['text']
    return tokenizer(smiles, truncation=True, padding=True)

# Wrap each split in a Dataset and tokenize it in batches (train, then test, then validation).
train_dataset, test_dataset, val_dataset = [
    Dataset.from_pandas(frame).map(tokenize_smiles, batched=True)
    for frame in (train_df, test_df, val_df)
]


# Load pre-trained model for sequence classification (three output classes)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    # schedule and batch sizes
    num_train_epochs=10,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    do_train=True,
    do_eval=True,
    # evaluation and checkpointing both happen once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # output and logging
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    disable_tqdm=True,
)

# Define function to compute metrics
def compute_metrics(pred):
    """Return accuracy plus weighted precision, recall and F1 for a prediction object."""
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

print(f"\n_________{run_name} TRAINING FINISHED____________\n")

4. Evaluation of the fine-tuned model:

In [None]:
# Evaluate the model on the test set
print("Evaluation results:")
eval_results = trainer.evaluate(eval_dataset=test_dataset)

# Get predictions on test set
predictions = trainer.predict(test_dataset)

# Read the text and label columns once instead of decoding a full row per access.
test_texts = test_dataset["text"]
test_labels = test_dataset["label"]
test_preds = predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(test_labels, test_preds)
# zero_division=0 matches compute_metrics and avoids the undefined-metric warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted', zero_division=0
)

# Print evaluation results
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test F1 Score:", f1)

# Print some data samples along with their corresponding labels and predictions
# (at most five; capped so a shorter test set does not raise IndexError).
for i in range(min(5, len(test_labels))):
    print("Data:", test_texts[i])
    print("True Label:", test_labels[i])
    print("Predicted Label:", test_preds[i])
    print()

<h5>Fine-tuning and evaluation of different pre-trained models (using chosen variations):</h5>

1. Choose and define the pre-trained model:

In [None]:
# Checkpoint fine-tuned in this run. To compare another pre-trained model,
# comment this line out and enable exactly one of the alternatives below.
model_name = "seyonec/ChemBERTa-zinc250k-v1"
# model_name = "DeepChem/ChemBERTa-5M-MLM"
# model_name = "jonghyunlee/ChemBERT_ChEMBL_pretrained"
# model_name = "UdS-LSV/smole-bert"

2. Choose the datasets used and define the run name for clarity:

In [None]:
import pandas as pd

run_name = "model 1 data 1 run 1"

# Index (0, 1 or 2) of the pre-generated split inside traindata1/ used for this run.
split_index = 0

train_df = pd.read_csv(f"traindata1/train_data_{split_index}.csv")
test_df = pd.read_csv(f"traindata1/test_data_{split_index}.csv")
val_df = pd.read_csv(f"traindata1/val_data_{split_index}.csv")

3. Tokenise the data and fine-tune the model:

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_smiles(example):
    """Tokenize the 'text' field of a batch with truncation and padding enabled."""
    smiles = example['text']
    return tokenizer(smiles, truncation=True, padding=True)

# Wrap each split in a Dataset and tokenize it in batches (train, then test, then validation).
train_dataset, test_dataset, val_dataset = [
    Dataset.from_pandas(frame).map(tokenize_smiles, batched=True)
    for frame in (train_df, test_df, val_df)
]

# Load pre-trained model for sequence classification (three output classes)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    # schedule and batch sizes
    num_train_epochs=15,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    do_train=True,
    do_eval=True,
    # evaluation and checkpointing both happen once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # output and logging
    output_dir='./results',
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=100,
    disable_tqdm=True,
)

# Define function to compute metrics
def compute_metrics(pred):
    """Return accuracy plus weighted precision, recall and F1 for a prediction object."""
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

print(f"\n_________{run_name} TRAINING FINISHED____________\n")

4. Evaluate the fine-tuned model:

In [None]:
import csv
import os

# Evaluate the model on the test set
print("Evaluation results:")
eval_results = trainer.evaluate(eval_dataset=test_dataset)

# Get predictions on test set
predictions = trainer.predict(test_dataset)

# Read the text and label columns once instead of decoding a full row per access.
test_texts = test_dataset["text"]
test_labels = test_dataset["label"]
test_preds = predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(test_labels, test_preds)
# zero_division=0 matches compute_metrics and avoids the undefined-metric warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted', zero_division=0
)

# Print evaluation results
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test F1 Score:", f1)

# Print some data samples along with their corresponding labels and predictions
# (at most five; capped so a shorter test set does not raise IndexError).
for i in range(min(5, len(test_labels))):
    print("Data:", test_texts[i])
    print("True Label:", test_labels[i])
    print("Predicted Label:", test_preds[i])
    print()

os.makedirs("./predictions/", exist_ok=True)

# One CSV per run, named after run_name with its spaces removed.
with open("./predictions/" + run_name.replace(" ", "") + "_predictions.csv", "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["Text", "True Label", "Predicted Label"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write the header
    writer.writeheader()

    # Write predictions for each example in the test set
    for text, true_label, predicted_label in zip(test_texts, test_labels, test_preds):
        writer.writerow({
            "Text": text,
            "True Label": true_label,
            "Predicted Label": predicted_label
        })


<h5>Processing and evaluation of the collected model prediction data:</h5>

In [None]:
import pandas as pd
import os

# Folder layout: predictions/ -> withIC50/ -> processed/
prediction_folder = "./predictions/"
ic50_folder = f"{prediction_folder}withIC50/"
processed_folder = f"{ic50_folder}processed/"

original_data_df = pd.read_csv("traindata/train_with_IC50.csv")

# Lookup table from text to IC50 value; when a text occurs more than once the last row wins.
text_ic50_map = {
    text: ic50
    for text, ic50 in zip(original_data_df["text"], original_data_df["IC50"])
}

# Function to retrieve IC50 value based on text
def get_ic50(text):
    """Return the IC50 recorded for ``text`` in ``text_ic50_map``, or "" when absent."""
    if text in text_ic50_map:
        return text_ic50_map[text]
    return ""

# Apply processing to every file in predictions directory.
# The output folder is created up front (not only when a file matches) so the later
# stage that lists it does not fail when there are no prediction files yet.
os.makedirs(ic50_folder, exist_ok=True)

for file_name in os.listdir(prediction_folder):
    if not file_name.endswith("_predictions.csv"):
        continue

    # Extract run_name from file_name
    run_name = file_name.split("_predictions.csv")[0]
    predictions_df = pd.read_csv(prediction_folder + run_name + "_predictions.csv")

    # Save predictions with IC50 values
    predictions_df["IC50"] = predictions_df["Text"].apply(get_ic50)
    predictions_df.to_csv(ic50_folder + run_name + "_predictions_with_ic50.csv", index=False)


# Function to calculate the difference between IC50 and label bound
def calculate_ic50_difference(row):
    """Return the absolute distance between a row's IC50 (truncated to int) and a bound.

    The bound is 5000 when the predicted label is 0, or when the predicted label is 1
    and the true label is 0; in every other case it is 250.
    """
    predicted = row['Predicted Label']
    uses_upper_bound = predicted == 0 or (predicted == 1 and row["True Label"] == 0)
    if uses_upper_bound:
        bound = 5000
    else:
        bound = 250
    return abs(int(row['IC50']) - bound)
    

import re  # local import: used only to parse the model number out of the file name

os.makedirs(processed_folder, exist_ok=True)

# Output files already started during this execution of the cell. The first write to a
# path overwrites it (with a header); later writes append. Re-running the cell therefore
# no longer stacks duplicate rows onto the files left by a previous run.
started_outputs = set()

for file_name in os.listdir(ic50_folder):
    if not file_name.endswith("_with_ic50.csv"):
        continue

    df = pd.read_csv(ic50_folder + file_name)

    # Filter rows where predicted label does not match real label
    mismatched_df = df[df['Predicted Label'] != df['True Label']].copy()

    # Calculate IC50 difference row by row. A plain list build also works when there
    # are no mismatches, where the previous .apply(...).tolist() could fail.
    mismatched_df['IC50 Difference'] = [
        calculate_ic50_difference(row) for _, row in mismatched_df.iterrows()
    ]

    # Group by model: take the digits following "model" in the file name, so
    # multi-digit model numbers are kept whole. Fall back to the original
    # single-character lookup when the name does not match.
    model_match = re.search(r"model\s*(\d+)", file_name)
    model_number = model_match.group(1) if model_match else file_name.split("l")[1][0]

    output_file_path = processed_folder + "model" + model_number + "_predictions.csv"

    first_write = output_file_path not in started_outputs
    started_outputs.add(output_file_path)
    mismatched_df.to_csv(
        output_file_path,
        mode='w' if first_write else 'a',
        index=False,
        header=first_write,
    )



import re  # local import: used only to parse the model number out of the file name

summary_path = processed_folder + "model_summary.csv"
# The summary file is rewritten on the first model of this execution and appended to
# afterwards, so re-running the cell does not duplicate summary rows.
summary_started = False

for file_name in os.listdir(processed_folder):
    if not file_name.endswith("_predictions.csv"):
        continue

    df = pd.read_csv(processed_folder + file_name)

    # Number of rows sharing the same value in the first column, then drop exact duplicate rows.
    df["Duplicate Count"] = df.groupby(df.columns[0]).transform('size').tolist()
    model_df = df.drop_duplicates(keep='first').copy()
    model_df.to_csv(processed_folder + file_name.split(".")[0] + "_final.csv", index=False)

    # Digits following "model" in the file name; fall back to the original
    # single-character lookup when the name does not match.
    model_match = re.search(r"model\s*(\d+)", file_name)
    model_number = model_match.group(1) if model_match else file_name.split("l")[1][0]

    summary = {
        'Model': model_number,
        'Incorrect Predictions': len(df),
        'Off by Two': sum(abs(df['True Label'] - df['Predicted Label']) > 1),
        'Relative Grade': sum((model_df['IC50 Difference'] / (model_df['IC50'] + model_df['IC50 Difference'])) * model_df['Duplicate Count']) / 100,
    }
    summary_df = pd.DataFrame(
        [summary],
        columns=['Model', 'Incorrect Predictions', 'Off by Two', 'Relative Grade'],
    )

    summary_df.to_csv(
        summary_path,
        mode='a' if summary_started else 'w',
        index=False,
        header=not summary_started,
    )
    summary_started = True

print("Processing completed.")

<h5>Fine-tuning the final model:</h5>

Without parameter optimization:

1. Load and define the pre-trained model, specifying the respective number of labels used:

>1.1. For three labels:
    

In [None]:
from transformers import AutoModelForSequenceClassification

# Three-label variant: checkpoint name and a sequence-classification model loaded
# with num_labels=3.
model_name = "seyonec/ChemBERTa-zinc250k-v1"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

>1.2 For two labels:

In [None]:
from transformers import AutoModelForSequenceClassification

# Two-label variant: checkpoint name and a sequence-classification model loaded
# with num_labels=2.
model_name = "seyonec/ChemBERTa-zinc250k-v1"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

2. Choose and load the dataset combination for training and evaluation:

>2.1 For three labels:

In [None]:
import pandas as pd  # imported here so this section does not depend on an earlier one having run

train_df = pd.read_csv("traindata/train_shuffled.csv")
# NOTE(review): the test frame is read from valdata.csv and the validation frame from
# testdata.csv. Both files are halves produced by the same split of the generated
# molecules, so the swap looks harmless, but confirm it is intentional.
test_df = pd.read_csv("testdata/valdata.csv")
val_df = pd.read_csv("testdata/testdata.csv")

# train_df = pd.read_csv("traindata/train_withduplicates.csv")
# test_df = pd.read_csv("testdata/valdata.csv")
# val_df = pd.read_csv("testdata/testdata.csv")

# NOTE(review): no visible cell of this notebook writes final_train.csv, final_val.csv
# or all_generated.csv; verify these paths before enabling this option.
# train_df = pd.read_csv("traindata/final_train.csv")
# test_df = pd.read_csv("testdata/final_val.csv")
# val_df = pd.read_csv("testdata/all_generated.csv")

mol_df = pd.read_csv("mols.csv")

>2.2 For two labels:

In [None]:
import pandas as pd  # imported here so this section does not depend on an earlier one having run

train_df = pd.read_csv("traindata/train2_shuffled.csv")
# The two-label halves are written to testdata/ by the processing cell; the previous
# "data3/" prefix pointed at a folder nothing in this notebook creates.
# NOTE(review): the test frame is read from valdata2.csv and the validation frame from
# testdata2.csv, mirroring the three-label cell; confirm the swap is intentional.
test_df = pd.read_csv("testdata/valdata2.csv")
val_df = pd.read_csv("testdata/testdata2.csv")

# train_df = pd.read_csv("traindata/train2_withduplicates.csv")
# test_df = pd.read_csv("testdata/valdata2.csv")
# val_df = pd.read_csv("testdata/testdata2.csv")

# NOTE(review): no visible cell of this notebook writes final_train2.csv, final_val2.csv
# or all_generated2.csv; verify these paths before enabling this option.
# train_df = pd.read_csv("traindata/final_train2.csv")
# test_df = pd.read_csv("traindata/final_val2.csv")
# val_df = pd.read_csv("testdata/all_generated2.csv")

mol_df = pd.read_csv("mols2.csv")

3. Define the compute_metrics method for handling the different number of labels:

>3.1 For three labels:

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Return accuracy plus weighted precision, recall and F1 for a prediction object.

    ``pred.label_ids`` supplies the reference labels and the arg-max over the last
    axis of ``pred.predictions`` supplies the predicted labels.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

>3.2 For two labels:

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Return accuracy plus binary-averaged precision, recall and F1 for a prediction object.

    ``pred.label_ids`` supplies the reference labels and the arg-max over the last
    axis of ``pred.predictions`` supplies the predicted labels.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

4. Tokenise the data and fine-tune the model:

In [None]:
from transformers import TrainingArguments, Trainer, AutoTokenizer
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_smiles(example):
    """Tokenize the 'text' field of a batch with truncation and padding enabled."""
    return tokenizer(example['text'], truncation=True, padding=True)

# Wrap each frame in a Dataset and tokenize it in batches
# (train, test, validation, then the confirmed molecules).
train_dataset, test_dataset, val_dataset, mol_dataset = [
    Dataset.from_pandas(frame).map(tokenize_smiles, batched=True)
    for frame in (train_df, test_df, val_df, mol_df)
]

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results1',
    overwrite_output_dir=True,
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    disable_tqdm=True,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# run_name is only assigned in the model-comparison sections of this notebook. Fall back
# to a fixed label so this cell does not raise NameError when the final-model section is
# run on its own; an existing run_name is left untouched.
finished_run_name = globals().get("run_name", "final model")
print("\n_________" + finished_run_name + " TRAINING FINISHED____________\n")

5. Evaluate the fine-tuned model on testing data with respective techniques for different numbers of labels

>5.1 Three labels:

In [None]:
print("Evaluation results:")
eval_results = trainer.evaluate(eval_dataset=test_dataset)

# Get predictions on test set
predictions = trainer.predict(test_dataset)

# Read the text and label columns once instead of decoding a full row per access.
test_texts = test_dataset["text"]
test_labels = test_dataset["label"]
test_preds = predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(test_labels, test_preds)
# zero_division=0 matches the confirmed-molecule metrics below and avoids the
# undefined-metric warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted', zero_division=0
)

# Print evaluation results
print("\nTest Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test F1 Score:", f1, "\n")

# Print some data samples along with their corresponding labels and predictions
# (at most five; capped so a shorter test set does not raise IndexError).
for i in range(min(5, len(test_labels))):
    print("Data:", test_texts[i])
    print("True Label:", test_labels[i])
    print("Predicted Label:", test_preds[i])
    print()

# Same metrics on the confirmed molecules loaded into mol_dataset.
mol_predictions = trainer.predict(mol_dataset)

mol_texts = mol_dataset["text"]
mol_labels = mol_dataset["label"]
mol_preds = mol_predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(mol_labels, mol_preds)
precision, recall, f1, _ = precision_recall_fscore_support(mol_labels, mol_preds, average='weighted', zero_division=0)

print("\nTest Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test F1 Score:", f1)

# At most nine molecules are listed; capped so a shorter set does not raise IndexError.
for i in range(min(9, len(mol_labels))):
    print("Data:", mol_texts[i])
    print("True Label:", mol_labels[i])
    print("Predicted Label:", mol_preds[i])
    print()

>5.2 Two labels:

In [None]:
print("Evaluation results:")
eval_results = trainer.evaluate(eval_dataset=test_dataset)

# Get predictions on test set
predictions = trainer.predict(test_dataset)

# Read the text and label columns once instead of decoding a full row per access.
test_texts = test_dataset["text"]
test_labels = test_dataset["label"]
test_preds = predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(test_labels, test_preds)
# zero_division=0 yields the same values as the default setting without the
# undefined-metric warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='binary', zero_division=0
)

# Print evaluation results
print("\nTest Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test F1 Score:", f1, "\n")

# Print some data samples along with their corresponding labels and predictions
# (at most five; capped so a shorter test set does not raise IndexError).
for i in range(min(5, len(test_labels))):
    print("Data:", test_texts[i])
    print("True Label:", test_labels[i])
    print("Predicted Label:", test_preds[i])
    print()

# Same metrics on the confirmed molecules loaded into mol_dataset.
mol_predictions = trainer.predict(mol_dataset)

mol_texts = mol_dataset["text"]
mol_labels = mol_dataset["label"]
mol_preds = mol_predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(mol_labels, mol_preds)
# NOTE(review): zero_division=1 reports an undefined precision/recall as 1, while the
# other metric calls in this notebook use 0. Confirm this is the intended convention.
precision, recall, f1, _ = precision_recall_fscore_support(mol_labels, mol_preds, average='binary', zero_division=1)

print("\nTest Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test F1 Score:", f1)

# At most nine molecules are listed; capped so a shorter set does not raise IndexError.
for i in range(min(9, len(mol_labels))):
    print("Data:", mol_texts[i])
    print("True Label:", mol_labels[i])
    print("Predicted Label:", mol_preds[i])
    print()

<h5>Model fine-tuning with hyperparameter optimisation:</h5>

1. Define the model, import and tokenize datasets, and define compute_metrics:

In [None]:
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from datasets import Dataset


# Pretrained checkpoint that is fine-tuned in this section
model_name = "seyonec/ChemBERTa-zinc250k-v1"

# Training, validation and test splits, read in that order
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "traindata/train2_shuffled.csv",
        "testdata/valdata2.csv",
        "testdata/testdata2.csv",
    )
)

# Separate molecule set that is evaluated alongside the test split
mol_df = pd.read_csv("mols2.csv")

# Tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_smiles(example):
    """Tokenize the 'text' field of a dataset example (or batch of examples).

    Uses the module-level `tokenizer` with truncation and padding enabled and
    returns whatever the tokenizer call produces.
    """
    smiles = example['text']
    encoded = tokenizer(smiles, truncation=True, padding=True)
    return encoded

# Wrap every DataFrame in a Dataset and tokenize its SMILES strings in batches.
# The frames are processed in the same order as the names on the left.
train_dataset, val_dataset, test_dataset, mol_dataset = (
    Dataset.from_pandas(frame).map(tokenize_smiles, batched=True)
    for frame in (train_df, val_df, test_df, mol_df)
)

# Define compute_metrics function
def compute_metrics(pred):
    """Compute binary classification metrics for a Trainer evaluation step.

    Args:
        pred: prediction object exposing `label_ids` (true labels) and
            `predictions` (per-class scores; the last axis is reduced with
            argmax to obtain the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same values as the default but without a
    # warning on every evaluation in which no positive class is predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

2. Fine-tune the model on the tokenised data with hyperparameter search, or without it for comparison:

>2.1. Fine-tuning with hyperparameter search:

In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer
import optuna


# Factory handed to the Trainer so it can build a fresh model on demand
def model_init():
    """Return a new two-label sequence classifier loaded from `model_name`."""
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    )
    return fresh_model

# Search space sampled for every trial
def hp_space(trial):
    """Sample one hyperparameter configuration from `trial`.

    Draws a log-scaled learning rate in [1e-5, 5e-5], an epoch count in
    [3, 15] and a per-device training batch size from {8, 16, 32}, and returns
    them keyed by the corresponding training-argument names.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    epoch_count = trial.suggest_int("num_train_epochs", 3, 15)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])

    sampled = {}
    sampled["learning_rate"] = learning_rate
    sampled["num_train_epochs"] = epoch_count
    sampled["per_device_train_batch_size"] = batch_size
    return sampled

# Settings shared by every trial; learning rate, epoch count and batch size
# are left out here because hp_space supplies them per trial.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    overwrite_output_dir=True,
    logging_dir='./logs',
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Keep a single checkpoint and restore the best one when training ends
    save_total_limit=1,
    load_best_model_at_end=True,
    # No progress bars
    disable_tqdm=True,
)

# Trainer driven by model_init (not a fixed model) so that a new model can be
# created for each run
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run 10 Optuna trials over hp_space.
# NOTE(review): no compute_objective is passed, so the library default decides
# which quantity is maximised — confirm it is the metric you care about.
best_run = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    n_trials=10,
)

# Print the best hyperparameters
print("Best Hyperparameters found:", best_run)

# When the search returns, the trainer still holds the model of the LAST trial,
# not the best one. Apply the winning hyperparameters and train once more so
# that the evaluation cell below scores the best configuration.
for hp_name, hp_value in best_run.hyperparameters.items():
    setattr(trainer.args, hp_name, hp_value)
trainer.train()

# NOTE(review): `run_name` is not assigned anywhere in this section; it must
# already exist from an earlier cell — confirm it names this run.
print("\n_________" + run_name + " TRAINING FINISHED____________\n")

>2.2. Fine-tuning without hyperparameter search:

In [None]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer


# Baseline: a single two-label classifier trained with fixed hyperparameters
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Fixed training configuration for the baseline run
training_args = TrainingArguments(
    # Output locations
    output_dir='./results1',
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=100,
    # Run both training and evaluation
    do_train=True,
    do_eval=True,
    # Schedule and batch sizes
    num_train_epochs=15,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Keep a single checkpoint and restore the best one when training ends
    save_total_limit=1,
    load_best_model_at_end=True,
    # No progress bars
    disable_tqdm=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning
trainer.train()

# NOTE(review): `run_name` is not assigned in this cell; it has to come from an
# earlier cell — confirm.
print("\n_________" + run_name + " TRAINING FINISHED____________\n")

3. Evaluate the fine-tuned model using the testing datasets, and for each dataset produce the prediction accuracy, F1, precision-recall curve, and confusion matrix:

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_curve, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt

def _positive_class_score(logits):
    """Return a ranking score for class 1 from two-column model outputs.

    The difference between the class-1 and class-0 columns is used rather than
    the class-1 column alone: the difference orders samples exactly like the
    softmax probability of class 1, whereas the raw class-1 value does not.
    """
    return logits[:, 1] - logits[:, 0]


def _plot_precision_recall(labels, scores, title):
    """Plot a precision-recall curve and return (precision, recall) arrays."""
    precision_values, recall_values, _ = precision_recall_curve(labels, scores)
    plt.figure(figsize=(8, 6))
    plt.plot(recall_values, precision_values, marker='.')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(title)
    plt.grid(True)
    plt.show()
    return precision_values, recall_values


def _plot_confusion_matrix(labels, preds, title):
    """Plot a 2x2 confusion matrix and return it."""
    # labels=[0, 1] keeps the matrix 2x2 even when only one class occurs, so it
    # always lines up with the two tick labels below.
    matrix = confusion_matrix(labels, preds, labels=[0, 1])
    plt.figure(figsize=(8, 6))
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    plt.xticks([0, 1], ['Class 0', 'Class 1'])
    plt.yticks([0, 1], ['Class 0', 'Class 1'])
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.show()
    return matrix


def _print_examples(dataset, labels, preds, limit):
    """Print up to `limit` samples with their true and predicted labels."""
    # Capped by the dataset size so a short dataset cannot raise IndexError.
    for i in range(min(limit, len(dataset))):
        print("Data:", dataset[i]["text"])
        print("True Label:", labels[i])
        print("Predicted Label:", preds[i])
        print()


# ---- Held-out test split ----
# NOTE(review): `trainer`, `test_dataset` and `mol_dataset` come from earlier
# cells; this cell evaluates whichever trainer was created last.
print("Evaluation results:")
eval_results = trainer.evaluate(eval_dataset=test_dataset)
# Show the metrics announced by the header above (previously never displayed).
print(eval_results)

# Get predictions on test set
predictions = trainer.predict(test_dataset)

test_labels = test_dataset["label"]
test_preds = predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(test_labels, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='binary')

# Precision-recall curve and confusion matrix for the test split
precision_curve, recall_curve = _plot_precision_recall(
    test_labels,
    _positive_class_score(predictions.predictions),
    'Precision-Recall Curve',
)
cm = _plot_confusion_matrix(test_labels, test_preds, 'Confusion Matrix')

# Print evaluation results
print("\nTest Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test F1 Score:", f1, "\n")

# Print some data samples along with their corresponding labels and predictions
_print_examples(test_dataset, test_labels, test_preds, 5)

# ---- External molecule set ----
mol_predictions = trainer.predict(mol_dataset)

mol_labels = mol_dataset["label"]
mol_preds = mol_predictions.predictions.argmax(axis=1)
accuracy = accuracy_score(mol_labels, mol_preds)
# zero_division=1 reports 1.0 (instead of warning) when a metric's denominator is zero.
precision, recall, f1, _ = precision_recall_fscore_support(mol_labels, mol_preds, average='binary', zero_division=1)

# Precision-recall curve and confusion matrix for the molecule set
precision_curve, recall_curve = _plot_precision_recall(
    mol_labels,
    _positive_class_score(mol_predictions.predictions),
    'Precision-Recall Curve for Mols Set',
)
cm = _plot_confusion_matrix(mol_labels, mol_preds, 'Confusion Matrix for Mols Set')

print("\nTest Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test F1 Score:", f1)

_print_examples(mol_dataset, mol_labels, mol_preds, 9)

<h5>Final model training and saving:</h5>

In [None]:
import pandas as pd
from sklearn.utils import shuffle


# Load datasets as DFs
train_df = pd.read_csv('traindata/train2_shuffled.csv')
mols_df = pd.read_csv('mols2.csv')

# Combine the training split with the molecule set
combined_df = pd.concat([train_df, mols_df])

# Shuffle with a fixed seed so finalset.csv (and the model trained on it) can be
# reproduced; 42 matches the seed used for shuffling in the preprocessing cells.
shuffled_df = shuffle(combined_df, random_state=42).reset_index(drop=True)

shuffled_df.to_csv('finalset.csv', index=False)

In [None]:
from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from datasets import Dataset

# Label for this run (used for logging and for the save directory) and the
# pretrained checkpoint it starts from
run_name = "ChemBERTaNLRP3"
model_name = "seyonec/ChemBERTa-zinc250k-v1"

# Final training data and the validation split used during training
train_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in ("finalset.csv", "testdata/valdata2.csv")
)

# Two-label classifier and its matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_smiles(example):
    """Tokenize the 'text' field of a dataset example (or batch of examples).

    Uses the module-level `tokenizer` with truncation and padding enabled and
    returns whatever the tokenizer call produces.
    """
    smiles = example['text']
    encoded = tokenizer(smiles, truncation=True, padding=True)
    return encoded

# Wrap both DataFrames in Datasets and tokenize their SMILES strings in batches
train_dataset, val_dataset = (
    Dataset.from_pandas(frame).map(tokenize_smiles, batched=True)
    for frame in (train_df, val_df)
)

# Define compute_metrics function
def compute_metrics(pred):
    """Compute binary classification metrics for a Trainer evaluation step.

    Args:
        pred: prediction object exposing `label_ids` (true labels) and
            `predictions` (per-class scores; the last axis is reduced with
            argmax to obtain the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same values as the default but without a
    # warning on every evaluation in which no positive class is predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Configuration of the final training run
# NOTE(review): a single epoch combined with 500 warm-up steps may end before
# the warm-up completes, depending on the dataset size — confirm this is intended.
training_args = TrainingArguments(
    # Run identity and output locations
    run_name=run_name,
    output_dir='./finalresults',
    overwrite_output_dir=True,
    logging_dir='./finallogs',
    logging_steps=100,
    # Run both training and evaluation
    do_train=True,
    do_eval=True,
    # Schedule and batch sizes
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Keep a single checkpoint and restore the best one when training ends
    save_total_limit=1,
    load_best_model_at_end=True,
    # No progress bars
    disable_tqdm=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run the final fine-tuning
trainer.train()

print("\n_________" + run_name + " TRAINING FINISHED____________\n")

# Save through the trainer into 'models/', then write the model and the
# tokenizer into a sub-directory named after the run.
trainer.save_model('models/')
run_directory = 'models/' + run_name
model.save_pretrained(run_directory, safe_serialization=False)
tokenizer.save_pretrained(run_directory, safe_serialization=False)

print("Model and tokenizer saved as: " + run_name)

<h5>Published model use:</h5>

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import pandas as pd
from datasets import Dataset


import torch

# Published fine-tuned checkpoint to load
model_name = "VitaRin/ChemBERTaNLRP3"

# Use the first GPU when one is available and fall back to the CPU (-1)
# otherwise, so the cell also runs on machines without CUDA.
device = 0 if torch.cuda.is_available() else -1

pipeline = TextClassificationPipeline(
    model=AutoModelForSequenceClassification.from_pretrained(model_name),
    tokenizer=AutoTokenizer.from_pretrained(model_name),
    device=device
)

# Molecules to classify: the 'text' column of mols2.csv
test_df = pd.read_csv("mols2.csv")
test_dataset = Dataset.from_pandas(test_df)
molecules = list(test_dataset["text"])

# Truncate over-length SMILES, as was done when tokenizing the training data,
# instead of failing on inputs longer than the model accepts.
result = pipeline(molecules, truncation=True)

print(result)

<h5>Mean calculator:</h5>

1. For calculating the mean and standard deviation of a set of values:

In [None]:
# Interactive calculator: reads a count, then that many numbers, and prints
# their mean and standard deviation.
choice = int(input("How many numbers to take an average from?:\n"))

# An average of zero (or a negative count of) values is undefined; fail with a
# clear message instead of a ZeroDivisionError further down.
if choice < 1:
    raise ValueError("At least one number is required to calculate an average.")

numbers = []

for i in range(choice):
    number = float(input("Enter a number:\n"))
    numbers.append(number)

# Calculate the mean
mean = sum(numbers) / len(numbers)

# Calculate the sum of squares of differences
sum_of_squares = sum((x - mean) ** 2 for x in numbers)

# Calculate the variance and standard deviation.
# The sum of squares is divided by N (not N - 1), i.e. this is the population
# standard deviation of the entered values.
variance = sum_of_squares / len(numbers)
std_deviation = variance ** 0.5

print("Average of", numbers, ":\n")
print("Average:", round(mean, 4))
print("Standard Deviation:", round(std_deviation, 4))

2. For combining several mean values and their standard deviations into one overall mean and standard deviation:

In [None]:
import math

# Function to calculate mean of means and combined standard deviation
def mean_of_means(means, std_devs):
    """Combine several means and their standard deviations into one pair.

    The combined mean is the arithmetic mean of `means`. The combined standard
    deviation is the square root of the average squared standard deviation, so
    every entry is weighted equally and the spread between the means themselves
    is not included.

    Args:
        means: sequence of mean values.
        std_devs: sequence of standard deviations, one per entry of `means`.

    Returns:
        Tuple `(combined_mean, combined_std_dev)`.

    Raises:
        ValueError: if `means` is empty or the two sequences differ in length.
    """
    count = len(means)
    if count == 0:
        raise ValueError("At least one mean value is required.")
    if len(std_devs) != count:
        # Without this check the variances would be averaged over the wrong count.
        raise ValueError("means and std_devs must have the same length.")

    # Calculate the combined mean and standard deviation
    combined_mean = sum(means) / count
    combined_variance = sum((std_dev ** 2) for std_dev in std_devs) / count
    combined_std_dev = math.sqrt(combined_variance)

    return combined_mean, combined_std_dev

# Ask how many (mean, standard deviation) pairs will be entered
num_means = int(input("How many means to combine?:\n"))
means = []
std_devs = []

# Read each pair completely before storing it
for i in range(num_means):
    entered_mean = float(input(f"Enter mean value {i+1}:\n"))
    entered_std_dev = float(input(f"Enter standard deviation for mean value {i+1}:\n"))
    means += [entered_mean]
    std_devs += [entered_std_dev]

# Combine the pairs and report both results rounded to four decimals
combined = mean_of_means(means, std_devs)
combined_mean = combined[0]
combined_std_dev = combined[1]

print("Combined Mean Average:", round(combined_mean, 4))
print("Combined Standard Deviation:", round(combined_std_dev, 4))