In [11]:
pip install transformers datasets requests pandas




In [5]:
import requests
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch

# URL to the IOB dataset
url = 'https://raw.githubusercontent.com/Erechtheus/mutationCorpora/master/corpora/IOB/SETH-train.iob'

# Download the dataset
response = requests.get(url)
data = response.text.split('\n')

# Parse the data
sentences = []
labels = []
sentence = []
label = []

for line in data:
    line = line.strip()  # Remove leading/trailing spaces
    if not line:
        continue
    if line.startswith('#'):
        if sentence:
            sentences.append(sentence)
            labels.append(label)
            sentence = []
            label = []
    else:
        parts = line.split(',')
        if len(parts) == 2:
            token, tag = parts
            token, tag = token.strip(), tag.strip()  # Ensure no leading/trailing spaces
            sentence.append(token)
            label.append(tag)

# Append the last sentence if present
if sentence:
    sentences.append(sentence)
    labels.append(label)

# Convert to a DataFrame
df = pd.DataFrame({'sentence': sentences, 'labels': labels})

# Verify unique labels before cleaning
unique_labels = list(set(label for sublist in labels for label in sublist))

# Define the expected labels
expected_labels = {'O', 'B-Gene', 'I-SNP', 'I-Gene', 'B-SNP', 'B-RS'}

# Function to clean labels
def clean_labels(label_list):
    return [label if label in expected_labels else 'O' for label in label_list]

# Clean the labels
cleaned_labels = [clean_labels(label_list) for label_list in labels]

# Update the DataFrame with cleaned labels
df['labels'] = cleaned_labels

# Verify the cleaned labels
cleaned_unique_labels = list(set(label for sublist in cleaned_labels for label in sublist))

# Define the tokenizer and model checkpoint
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split into train and validation datasets
dataset = dataset.train_test_split(test_size=0.1)

# Function to tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["sentence"], truncation=True, padding=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(cleaned_unique_labels.index(label[word_idx]))
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize the datasets
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=["sentence", "labels"])

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,  # Log every 10 steps
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Data collator to handle dynamic padding
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(cleaned_unique_labels))
model.to(device)  # Move model to the device

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Function to predict labels for a sentence
def predict(sentence):
    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    inputs = inputs.to(device)  # Move inputs to the device

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs).logits

    # Get the predictions
    predictions = torch.argmax(outputs, dim=2).cpu().numpy()[0]

    # Convert token IDs back to tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].cpu().numpy()[0])

    # Map predictions to tokens for better interpretation
    predicted_labels = []
    for token, label in zip(tokens, predictions):
        predicted_labels.append((token, cleaned_unique_labels[label]))

    # Merge subword tokens and remove CLS/SEP tokens
    merged_labels = []
    current_word = ""
    current_label = None
    for token, label in predicted_labels:
        if token.startswith("##"):
            current_word += token[2:]
        else:
            if current_word:
                merged_labels.append((current_word, current_label))
            if token not in ["[CLS]", "[SEP]"]:
                current_word = token
                current_label = label
    if current_word:
        merged_labels.append((current_word, current_label))

    return merged_labels

# Example usage with additional sentences
sentences = [
    "The mutation occurs in different genes.",
    "Mutations in the BRAF gene are common.",
    "The V600E mutation is an example.",
    "The KRAS gene includes multiple SNPs.",
    "Mutations such as G12D are significant.",
    "The rs11614913 variant has been studied."
]

for sentence in sentences:
    print(predict(sentence))


Map:   0%|          | 0/459 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2158,0.144805
2,0.1167,0.102048
3,0.0795,0.086008
4,0.0707,0.085238
5,0.0571,0.085566


[('The', 'O'), ('mutation', 'O'), ('occurs', 'O'), ('in', 'O'), ('different', 'O'), ('genes', 'O'), ('.', 'O'), ('.', 'O')]
[('Mutations', 'O'), ('in', 'O'), ('the', 'O'), ('BRAF', 'B-Gene'), ('gene', 'O'), ('are', 'O'), ('common', 'O'), ('.', 'O'), ('.', 'O')]
[('The', 'O'), ('V600E', 'B-SNP'), ('mutation', 'O'), ('is', 'O'), ('an', 'O'), ('example', 'O'), ('.', 'O'), ('.', 'O')]
[('The', 'O'), ('KRAS', 'B-Gene'), ('gene', 'O'), ('includes', 'O'), ('multiple', 'O'), ('SNPs', 'O'), ('.', 'O'), ('.', 'O')]
[('Mutations', 'O'), ('such', 'O'), ('as', 'O'), ('G12D', 'B-SNP'), ('are', 'O'), ('significant', 'O'), ('.', 'O'), ('.', 'O')]
[('The', 'O'), ('rs11614913', 'O'), ('variant', 'O'), ('has', 'O'), ('been', 'O'), ('studied', 'O'), ('.', 'O'), ('.', 'O')]


In [14]:
# Example usage of the prediction function with the above sentences

sentences = [
    "Mutations such as C to G mutation are significant.",
    "The rs11614913 variant, MCAD and medium chain acyl-CoA dehydrogenase gene have been studied.",
    "The TP53 gene is known for its role in cancer."
]

for sentence in sentences:
    print(predict(sentence))


[('Mutations', 'O'), ('such', 'O'), ('as', 'O'), ('C', 'O'), ('to', 'O'), ('G', 'I-SNP'), ('mutation', 'O'), ('are', 'O'), ('significant', 'O'), ('.', 'O'), ('.', 'O')]
[('The', 'O'), ('rs11614913', 'O'), ('variant', 'O'), (',', 'O'), ('MCAD', 'B-Gene'), ('and', 'O'), ('medium', 'O'), ('chain', 'O'), ('acyl', 'I-Gene'), ('-', 'I-Gene'), ('CoA', 'I-Gene'), ('dehydrogenase', 'O'), ('gene', 'O'), ('have', 'O'), ('been', 'O'), ('studied', 'O'), ('.', 'O'), ('.', 'O')]
[('The', 'O'), ('TP53', 'B-Gene'), ('gene', 'O'), ('is', 'O'), ('known', 'O'), ('for', 'O'), ('its', 'O'), ('role', 'O'), ('in', 'O'), ('cancer', 'O'), ('.', 'O'), ('.', 'O')]


In [12]:
!pip install transformers[torch] accelerate -U




In [18]:
!pip uninstall torch -y


Found existing installation: torch 2.3.0+cu121
Uninstalling torch-2.3.0+cu121:
  Successfully uninstalled torch-2.3.0+cu121


In [19]:
!pip install torch torchvision torchaudio


Collecting torch
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m939.7 kB/s[0m eta [36m0:00:00[0m
Collecting triton==2.3.1 (from torch)
  Downloading triton-2.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (168.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.1/168.1 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch
Successfully installed torch-2.3.0
