In [2]:
# Install necessary libraries
!pip install datasets nltk scikit-learn transformers torch spacy scispacy




In [3]:
# Import libraries
import nltk
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from huggingface_hub import login


In [5]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
!pip install --upgrade datasets

from datasets import load_dataset

dataset = load_dataset("javicorvi/pretoxtm-dataset")



In [7]:
label_mapping = {"PRETOX_REL": 1, "NO_PRETOX_REL": 0}  # Adjust based on dataset labels

def convert_labels(example):
    example["label"] = label_mapping[example["label"]]
    return example

dataset = dataset.map(convert_labels)

# Keep Only 'text' and 'label' Columns
dataset = dataset.remove_columns([col for col in dataset["train"].column_names if col not in ["text", "label"]])

Map:   0%|          | 0/2053 [00:00<?, ? examples/s]

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

In [8]:
df = pd.DataFrame(dataset['train'])

In [9]:
df.head(5)

Unnamed: 0,label,text
0,1,"At 3 mg / kg , a pronounced decrease of termin..."
1,1,The main finding was atrophy of epithelium in ...
2,0,_ The evaluation of blood pressure and of hear...
3,0,No relevant changes were found in females .
4,1,Adverse treatment - related effects were limit...


In [10]:
df.shape

(2053, 2)

In [11]:
# Load transformer model and tokenizer (BioBERT)
model_name = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Assuming binary classification


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Preprocessing function
nltk.download('punkt_tab')
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())  # Tokenization
    return ' '.join(tokens)

df['processed_text'] = df['text'].apply(preprocess_text)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [13]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['label'], test_size=0.2, random_state=42)


In [14]:
print("X_train:")
print(X_train)
print("X_test:")
print(X_test)
print("y_train:")
print(y_train)
print("y_test:")
print(y_test)


X_train:
694     the focal increased cytolysosomes in skeletal ...
1222    mean t max values were reached at 0.083 h afte...
1578    decreases in body weight parameters were parti...
618     ophthalmoscopic examinations were performed on...
1452    at necropsy , mean relative liver to body weig...
                              ...                        
1638    there was generally a dose - proportional incr...
1095    additionally , only at 5 mg compound_xxx / kg ...
1130    it is concluded that dietary administration of...
1294    this was associated with abnormal gait , mis -...
860     there was no clear correlation between the occ...
Name: processed_text, Length: 1642, dtype: object
X_test:
464     clinical signs , body weight and food consumpt...
1052    treatment - related microscopic alterations we...
1315    at the end of the 4 - week recovery period lym...
710     only urinary bladder alterations were observed...
544     the auc ( 0 - 24 h ) values for female nos . 2...
     

In [15]:
# Tokenization
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")


In [16]:
print(train_encodings)
print(test_encodings)


{'input_ids': tensor([[  101,  1103, 17811,  ...,     0,     0,     0],
        [  101,  1928,   189,  ...,     0,     0,     0],
        [  101, 19377,  1107,  ...,     0,     0,     0],
        ...,
        [  101,  1122,  1110,  ...,     0,     0,     0],
        [  101,  1142,  1108,  ...,     0,     0,     0],
        [  101,  1175,  1108,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
{'input_ids': tensor([[ 101, 7300, 5300,  ...,    0,    0,    0],
        [ 101, 3252,  118,  ...,    0,    0,    0],
        [ 101, 1120, 1103,  ...

In [17]:
# Convert to tensors
train_labels = torch.tensor(y_train.tolist())
test_labels = torch.tensor(y_test.tolist())


In [18]:
# Convert data into PyTorch DataLoader for training and evaluation
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)


In [19]:
# Set up optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


In [None]:
import torch
from torch.optim.lr_scheduler import StepLR
import time
from sklearn.metrics import accuracy_score, classification_report

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Use a learning rate scheduler
scheduler = StepLR(optimizer, step_size=1, gamma=0.9)

# Enable mixed precision training if available
scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None

# Fine-tuning loop with optimizations
epochs = 10
for epoch in range(epochs):
    start_time = time.time()
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Move batch to the correct device
        batch_input_ids, batch_attention_mask, batch_labels = [x.to(device) for x in batch]
        optimizer.zero_grad()

        # Use autocast for mixed precision if available
        with torch.cuda.amp.autocast() if scaler else torch.enable_grad():
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                labels=batch_labels
            )
            loss = outputs.loss
            total_loss += loss.item()

        # Backward pass and optimizer step
        if loss.requires_grad:
            if scaler:
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()
        else:
            print("Warning: loss does not require gradients.")

    scheduler.step()  # Adjust learning rate
    avg_loss = total_loss / len(train_dataloader)
    elapsed_time = time.time() - start_time

    # Evaluation phase
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            batch_input_ids, batch_attention_mask, batch_labels = [x.to(device) for x in batch]
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).tolist())
            true_labels.extend(batch_labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f} - Accuracy: {accuracy:.4f} - Time: {elapsed_time:.2f}s")
    print("\nClassification Report:\n", classification_report(true_labels, predictions))


  scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None
  with torch.cuda.amp.autocast() if scaler else torch.enable_grad():


Epoch 1 - Loss: 0.2322 - Accuracy: 0.9513 - Time: 42.73s

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96       236
           1       0.95      0.94      0.94       175

    accuracy                           0.95       411
   macro avg       0.95      0.95      0.95       411
weighted avg       0.95      0.95      0.95       411



  with torch.cuda.amp.autocast() if scaler else torch.enable_grad():


Epoch 2 - Loss: 0.0744 - Accuracy: 0.9416 - Time: 42.80s

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.95       236
           1       0.91      0.95      0.93       175

    accuracy                           0.94       411
   macro avg       0.94      0.94      0.94       411
weighted avg       0.94      0.94      0.94       411



  with torch.cuda.amp.autocast() if scaler else torch.enable_grad():


Epoch 3 - Loss: 0.0244 - Accuracy: 0.9635 - Time: 43.80s

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       236
           1       0.94      0.97      0.96       175

    accuracy                           0.96       411
   macro avg       0.96      0.96      0.96       411
weighted avg       0.96      0.96      0.96       411



  with torch.cuda.amp.autocast() if scaler else torch.enable_grad():


Epoch 4 - Loss: 0.0213 - Accuracy: 0.9562 - Time: 42.74s

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96       236
           1       0.93      0.97      0.95       175

    accuracy                           0.96       411
   macro avg       0.95      0.96      0.96       411
weighted avg       0.96      0.96      0.96       411



  with torch.cuda.amp.autocast() if scaler else torch.enable_grad():


Epoch 5 - Loss: 0.0077 - Accuracy: 0.9489 - Time: 42.43s

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96       236
           1       0.94      0.94      0.94       175

    accuracy                           0.95       411
   macro avg       0.95      0.95      0.95       411
weighted avg       0.95      0.95      0.95       411



  with torch.cuda.amp.autocast() if scaler else torch.enable_grad():


In [None]:
# Cell to add in your notebook
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


In [None]:
# Cell to add in your notebook
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [None]:
# Cell to add in your notebook
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_f1_score

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_f1_score(p.label_ids, preds, average='binary')
    acc = accuracy_score(p.label_ids, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # Using 1 epoch for a quick result, you can increase this
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()


In [None]:
# Cell to add in your notebook
# IMPORTANT: Update this path to point to your local project folder
model_save_path = './tox-screener-project/backend/model'

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model successfully saved to {model_save_path}")
