In [6]:
import urllib.request
import gzip
import xml.etree.ElementTree as ET
import re
from nltk.tokenize import TreebankWordTokenizer

# Download a single PubMed baseline file
url = 'https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0001.xml.gz'
local_file = '/content/pubmed25n0001.xml.gz'

try:
    print(f"Downloading {url}...")
    urllib.request.urlretrieve(url, local_file)
except Exception as e:
    print(f"Error downloading file: {e}")
    raise

# Extract and preprocess abstracts
def extract_abstracts(xml_gz_file):
    abstracts = []
    tokenizer = TreebankWordTokenizer()
    with gzip.open(xml_gz_file, 'rt', encoding='utf-8') as f:
        tree = ET.parse(f)
        root = tree.getroot()
        for article in root.findall('.//Article'):
            abstract = article.find('.//Abstract/AbstractText')
            if abstract is not None and abstract.text:
                text = abstract.text.lower()
                text = re.sub(r'[\r\n]+', ' ', text)
                text = re.sub(r'[^\x00-\x7F]+', ' ', text)
                tokenized = tokenizer.tokenize(text)
                text = ' '.join(tokenized)
                text = re.sub(r"\s's\b", "'s", text)
                abstracts.append(text)
    return abstracts

abstracts = extract_abstracts(local_file)
print(f"Extracted {len(abstracts)} abstracts")

# Save abstracts for inspection
with open('/content/pubmed_abstracts.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(abstracts))

# Create a dataset
from datasets import Dataset
dataset = Dataset.from_dict({'text': abstracts})
print(dataset)
print(dataset[0]['text'] if len(dataset) > 0 else 'No abstracts')

Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0001.xml.gz...
Extracted 15377 abstracts
Dataset({
    features: ['text'],
    num_rows: 15377
})
( -- ) -alpha-bisabolol has a primary antipeptic action depending on dosage , which is not caused by an alteration of the ph-value. the proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. the antipeptic action of bisabolol only occurs in case of direct contact. in case of a previous contact with the substrate , the inhibiting effect is lost .


In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Step 1: Load and preprocess dataset
def load_dataset(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            texts = [line.strip() for line in f if line.strip()]
        if not texts:
            raise ValueError("Dataset is empty or contains only empty lines")
        logger.info(f"Loaded {len(texts)} abstracts from {file_path}")
        return Dataset.from_dict({"text": texts})
    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        raise

# Step 2: Tokenize dataset
def tokenize_dataset(dataset, tokenizer):
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=128, padding="max_length")
    try:
        tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
        logger.info("Dataset tokenized successfully")
        return tokenized_dataset
    except Exception as e:
        logger.error(f"Error tokenizing dataset: {e}")
        raise

# Step 3: Fine-tune BERT
def finetune_bert():
    try:
        # Load tokenizer and model
        logger.info("Loading bert-base-uncased...")
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')

        # Load dataset
        dataset = load_dataset('/content/pubmed_abstracts.txt')

        # Tokenize dataset
        tokenized_dataset = tokenize_dataset(dataset, tokenizer)

        # Data collator for MLM
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=True,
            mlm_probability=0.15
        )

        # Training arguments
        training_args = TrainingArguments(
            output_dir='/content/finetune/fine_tuned_bert',
            overwrite_output_dir=True,
            num_train_epochs=3,
            per_device_train_batch_size=1,  # Reduced for memory efficiency
            save_steps=500,
            save_total_limit=2,
            logging_steps=100,
            learning_rate=2e-5,
            max_grad_norm=1.0,
            fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
        )

        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=tokenized_dataset,
        )

        # Train
        logger.info("Starting fine-tuning...")
        trainer.train()

        # Save model
        model.save_pretrained('/content/finetune/fine_tuned_bert')
        tokenizer.save_pretrained('/content/finetune/fine_tuned_bert')
        logger.info("Fine-tuned model saved to ./fine_tuned_bert")
    except Exception as e:
        logger.error(f"Error during fine-tuning: {e}")
        raise

if __name__ == "__main__":
    finetune_bert()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Map:   0%|          | 0/15377 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m2023ad05044[0m ([33m2023ad05044-bits-pilani[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,2.19
200,2.0764
300,1.9936
400,2.113
500,2.0286
600,2.3355
700,2.1031
800,2.037
900,2.0792
1000,2.0571
