## NLP Pipeline for Clinical Notes Analysis using ClinicalBERT

NLP pipeline for processing clinical notes using ClinicalBERT, Stanza biomedical models, and spaCy.  Future Work: Incorporate a different clinical notes dataset to further explore sepsis-related topics, leveraging the existing NLP pipeline.

In [None]:
!pip install stanza transformers torch

import pandas as pd
import os
import re
import stanza
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# torch is used by the GPU check below (and by the NER cell further down) but
# was never imported in this cell, so running the notebook top to bottom
# raised NameError on torch.cuda.is_available().
import torch

# Download Stanza Biomedical models
stanza.download('en', package='mimic', processors={'ner': 'bc5cdr'})

# Check GPU availability
if torch.cuda.is_available():
    print("GPU is available and will be used for processing.")
else:
    print("GPU is not available. The CPU will be used for processing.")

# Load Stanza pipeline for English Biomedical models
stanza_nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'bc5cdr'})

# Load ClinicalBERT model and tokenizer for token classification (NER)
# NOTE(review): this checkpoint is presumably a pretrained language model
# without a fine-tuned NER head -- confirm the labels are meaningful before
# relying on the predictions.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Create a NER pipeline using ClinicalBERT
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


In [None]:
# Data directory: taken from CLINICAL_REFS_DIR when set, otherwise a local
# ./clinical_refs folder. It is created if it does not exist yet.
directory = os.getenv('CLINICAL_REFS_DIR', "./clinical_refs")
if not os.path.exists(directory):
    os.makedirs(directory)

# Locations of the two MTS-Dialog splits inside that directory
train_file_path = os.path.join(directory, 'MTS-Dialog-TrainingSet.csv')
validation_file_path = os.path.join(directory, 'MTS-Dialog-ValidationSet.csv')

# Read both splits into DataFrames
train_df = pd.read_csv(train_file_path)
validation_df = pd.read_csv(validation_file_path)

# Keep the column names as plain lists
train_headers = list(train_df.columns)
validation_headers = list(validation_df.columns)

# Show which columns each split provides
for label, headers in (
    ("Training dataset columns:", train_headers),
    ("Validation dataset columns:", validation_headers),
):
    print(label, headers)

# Show the first few rows of each split
for title, frame in (
    ("Training dataset preview:", train_df),
    ("Validation dataset preview:", validation_df),
):
    print(title)
    print(frame.head())


### Preprocess clinician notes

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus that preprocess_text reads via
# stopwords.words('english')
nltk.download('stopwords')

def preprocess_text(text):
    """Normalise a note for the downstream NLP steps.

    Removes every character that is neither an ASCII letter nor whitespace,
    lower-cases the remainder, splits it on whitespace and drops English
    stop words.

    Args:
        text: Raw note text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # The fourth positional parameter of re.sub is `count`, not `flags`: the
    # previous positional `re.I | re.A` (== 258) silently limited the call to
    # the first 258 removals and never set any flag. No flags are needed --
    # the character class already lists both cases -- so none are passed,
    # which keeps the matching rules exactly as they were while removing the
    # accidental cap.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # Tokenize the text
    tokens = text.split()
    # Read the stop-word list once per call (it used to be re-read for every
    # token) and use a set for constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Print full cell contents and all columns in the previews below
for option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(option, None)

# Both splits must provide the raw text column before it can be cleaned
for frame, message in (
    (train_df, "The training dataset does not have a 'section_text' column."),
    (validation_df, "The validation dataset does not have a 'section_text' column."),
):
    assert 'section_text' in frame.columns, message

# Store the cleaned text next to the original, training split first
for frame in (train_df, validation_df):
    frame['section_text_clean'] = frame['section_text'].apply(preprocess_text)


# Side-by-side preview of raw and cleaned text
preview_columns = ['section_text', 'section_text_clean']
for title, frame in (
    ("Training dataset preview after preprocessing:", train_df),
    ("Validation dataset preview after preprocessing:", validation_df),
):
    print(title)
    print(frame[preview_columns].head())


###  Named Entity Recognition (NER) 

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU, and move the model there
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

def perform_ner_clinicalbert(text, max_length=512):
    """Label the tokens of *text* with the ClinicalBERT token classifier.

    The text is truncated to ``max_length`` tokens (it is not split into
    chunks), passed through the module-level ``model`` without gradient
    tracking, and each token is assigned the label with the highest logit.

    Args:
        text: Text to tag.
        max_length: Maximum number of tokens kept by the tokenizer.

    Returns:
        A list of ``(token, label)`` tuples for every token whose predicted
        label id is not 0; labels come from ``model.config.id2label``.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length, padding=True)

    # Inputs have to live on the same device as the model
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Only one sequence is passed in, so work on the first row of the batch
    label_ids = torch.argmax(logits, dim=2)[0].cpu().numpy()
    word_pieces = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())
    id2label = model.config.id2label

    # NOTE(review): label id 0 is treated as the 'O' (outside) tag -- confirm
    # that this matches the model's id2label mapping.
    return [
        (piece, id2label[label_id.item()])
        for piece, label_id in zip(word_pieces, label_ids)
        if label_id != 0
    ]

def perform_stanza_ner(text):
    """Run the module-level Stanza pipeline on *text* and collect its entities.

    Args:
        text: Text to analyse.

    Returns:
        A list of ``(entity_text, entity_type)`` tuples, one per entity in
        the parsed document.
    """
    entities = []
    for ent in stanza_nlp(text).ents:
        entities.append((ent.text, ent.type))
    return entities

# Tag the cleaned text of both splits: ClinicalBERT first, then Stanza.
# Each entry is (progress message, target frame, result column, tagger).
ner_jobs = (
    ("Applying ClinicalBERT NER to training dataset...", train_df, 'clinical_bert_entities', perform_ner_clinicalbert),
    ("Applying ClinicalBERT NER to validation dataset...", validation_df, 'clinical_bert_entities', perform_ner_clinicalbert),
    ("Applying Stanza NER to training dataset...", train_df, 'stanza_entities', perform_stanza_ner),
    ("Applying Stanza NER to validation dataset...", validation_df, 'stanza_entities', perform_stanza_ner),
)
for announcement, frame, column, tagger in ner_jobs:
    print(announcement)
    frame[column] = frame['section_text_clean'].apply(tagger)

# Show the cleaned text next to the entities found by each tagger
result_columns = ['section_text_clean', 'clinical_bert_entities', 'stanza_entities']
for title, frame in (
    ("Training dataset with NER results (ClinicalBERT and Stanza):", train_df),
    ("Validation dataset with NER results (ClinicalBERT and Stanza):", validation_df),
):
    print(title)
    print(frame[result_columns].head())


### Sentiment Analysis

Note: Sentiment analysis was considered but is not included in the final analysis.
Clinical notes tend to have an overwhelmingly negative sentiment due to the nature of the content.
Therefore, sentiment analysis does not provide meaningful insights in this context.

### Relation Extraction using spaCy, Stanza and ClinicalBERT.

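Bio_ClinicalBERT is a ClinicalBERT variant that was initialized from BioBERT and then further pre-trained on MIMIC-III clinical notes. It therefore combines BioBERT's coverage of biomedical literature with ClinicalBERT's coverage of clinical notes, and is designed to handle a broader range of biomedical and clinical texts.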

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint shared by the tokenizer and the token-classification model
model_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)

def truncate_text(text, tokenizer, max_length=512):
    """Shorten *text* to at most ``max_length`` tokens of *tokenizer*.

    The text is encoded with truncation enabled and the resulting ids are
    decoded back into a string with special tokens omitted.

    Args:
        text: Text to shorten.
        tokenizer: Tokenizer used for both encoding and decoding.
        max_length: Maximum number of tokens to keep when encoding.

    Returns:
        The decoded, truncated text.
    """
    encoded = tokenizer(text, max_length=max_length, truncation=True, return_tensors='pt')
    # A single text was encoded, so the batch holds exactly one sequence
    first_sequence_ids = encoded['input_ids'][0]
    return tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

def perform_relation_extraction_clinicalbert(text, max_length=512):
    """Derive placeholder relations from ClinicalBERT token predictions.

    The text is truncated with ``truncate_text``, tagged by the module-level
    token-classification ``model``, and every pair of consecutive tagged
    tokens is linked with a fixed ``'related_to'`` relation. The linking step
    is a stand-in for real relation extraction logic.

    Args:
        text: Text to process.
        max_length: Maximum number of tokens kept when truncating.

    Returns:
        A list of ``(token, 'related_to', next_token)`` tuples.
    """
    clipped = truncate_text(text, tokenizer, max_length)
    encoded = tokenizer(clipped, return_tensors='pt', truncation=True, padding=True)

    # Inputs have to live on the same device as the model
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Highest-scoring label id for each token of the single input sequence
    predictions = torch.argmax(outputs.logits, dim=2)
    word_pieces = tokenizer.convert_ids_to_tokens(input_ids[0])
    id2label = model.config.id2label

    # NOTE(review): label id 0 is treated as the 'O' (outside) tag -- confirm
    # that this matches the model's id2label mapping.
    entities = [
        (piece, id2label[pred.item()])
        for piece, pred in zip(word_pieces, predictions[0])
        if pred != 0
    ]

    # Dummy relationships: link each tagged token to the one that follows it
    return [(left[0], 'related_to', right[0]) for left, right in zip(entities, entities[1:])]

# Run the ClinicalBERT relation extraction over both splits, training first
relation_column = 'clinicalbert_relations'
for announcement, frame in (
    ("Applying ClinicalBERT relation extraction to training dataset...", train_df),
    ("Applying ClinicalBERT relation extraction to validation dataset...", validation_df),
):
    print(announcement)
    frame[relation_column] = frame['section_text_clean'].apply(perform_relation_extraction_clinicalbert)

# Preview the cleaned text next to the extracted relations
for title, frame in (
    ("Training dataset after ClinicalBERT relation extraction:", train_df),
    ("Validation dataset after ClinicalBERT relation extraction:", validation_df),
):
    print(title)
    print(frame[['section_text_clean', relation_column]].head())


In [None]:
# Relies on the stanza_nlp pipeline loaded earlier in the notebook
def extract_stanza_relations(text):
    """List the dependency links Stanza finds in *text*.

    Args:
        text: Text to parse with the module-level ``stanza_nlp`` pipeline.

    Returns:
        A list of ``(word, dependency_relation, head_word)`` tuples, one for
        every word whose head index is not 0.
    """
    relations = []
    for sentence in stanza_nlp(text).sentences:
        words = sentence.words
        # Head indices are 1-based (0 is skipped), hence the -1 when indexing
        relations.extend(
            (word.text, word.deprel, words[word.head - 1].text)
            for word in words
            if word.head != 0
        )
    return relations

# Extract Stanza dependency relations for both splits, training first
for frame, done_message in (
    (train_df, "Relation extraction performed on the training dataset using Stanza."),
    (validation_df, "Relation extraction performed on the validation dataset using Stanza."),
):
    frame['stanza_relations'] = frame['section_text_clean'].apply(extract_stanza_relations)
    print(done_message)

# Preview the cleaned text next to the extracted relations
stanza_preview_columns = ['section_text_clean', 'stanza_relations']
for title, frame in (
    ("Training dataset after relation extraction (Stanza):", train_df),
    ("Validation dataset after relation extraction (Stanza):", validation_df),
):
    print(title)
    print(frame[stanza_preview_columns].head())


In [None]:
import spacy
from spacy.matcher import Matcher

# Load the small English spaCy model; if loading fails with OSError
# (presumably because the model package is not installed), download it and
# load it again.
try:
    nlp_spacy = spacy.load('en_core_web_sm')
except OSError:
    import spacy.cli
    spacy.cli.download('en_core_web_sm')
    nlp_spacy = spacy.load('en_core_web_sm')

def extract_spacy_relations(doc):
    """Match fixed dependency-label sequences in a parsed spaCy document.

    Three token patterns are registered with a ``Matcher`` built on the
    module-level ``nlp_spacy`` vocabulary; each match contributes one result.

    Args:
        doc: A document produced by the spaCy pipeline.

    Returns:
        A list of ``(matched_span_text, head_text)`` tuples, where the head
        is the syntactic head of the matched span's root token.
    """
    subject_verb_object = [{'DEP': 'nsubj'}, {'DEP': 'ROOT'}, {'DEP': 'dobj'}]
    subject_prep_object = [{'DEP': 'nsubj'}, {'DEP': 'prep'}, {'DEP': 'pobj'}]
    passive_subject_prep_object = [{'DEP': 'nsubjpass'}, {'DEP': 'aux'}, {'DEP': 'prep'}, {'DEP': 'pobj'}]

    # Register every pattern under its own numbered rule name
    matcher = Matcher(nlp_spacy.vocab)
    rule_patterns = (subject_verb_object, subject_prep_object, passive_subject_prep_object)
    for index, rule in enumerate(rule_patterns):
        matcher.add(f'relation_pattern_{index}', [rule])

    # Each match is (rule id, start token, end token); only the span is used
    spans = (doc[start:end] for _, start, end in matcher(doc))
    return [(span.text, span.root.head.text) for span in spans]

# Parse each cleaned note with spaCy and extract its pattern-based relations,
# training split first
for frame, done_message in (
    (train_df, "Enhanced relation extraction performed on the training dataset using spaCy."),
    (validation_df, "Enhanced relation extraction performed on the validation dataset using spaCy."),
):
    frame['spacy_relations'] = frame['section_text_clean'].apply(
        lambda note: extract_spacy_relations(nlp_spacy(note))
    )
    print(done_message)

# Preview the cleaned text next to the extracted relations
spacy_preview_columns = ['section_text_clean', 'spacy_relations']
for title, frame in (
    ("Training dataset after enhanced relation extraction (spaCy):", train_df),
    ("Validation dataset after enhanced relation extraction (spaCy):", validation_df),
):
    print(title)
    print(frame[spacy_preview_columns].head())


### Topic Modeling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Number of LDA topics fitted by perform_topic_modeling
num_topics = 5

def perform_topic_modeling(data):
    """Fit a scikit-learn LDA model on the cleaned notes and print its topics.

    Args:
        data: DataFrame with a ``section_text_clean`` column.

    Returns:
        None. For every topic a ``Topic <index>:`` line is printed, followed
        by its ten highest-weighted terms separated by spaces.
    """
    # Bag-of-words counts; terms in more than 95% of the notes or in fewer
    # than two notes are dropped, as are English stop words
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    term_counts = vectorizer.fit_transform(data['section_text_clean'])

    # Fixed random_state for a reproducible fit
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(term_counts)

    vocabulary = vectorizer.get_feature_names_out()
    num_top_words = 10
    for topic_idx, weights in enumerate(lda.components_):
        # Term indices ordered from highest to lowest weight
        top_term_ids = weights.argsort()[::-1][:num_top_words]
        print(f"Topic {topic_idx}:")
        print(" ".join(vocabulary[i] for i in top_term_ids))

# Print the LDA topics of each split, training first
for title, frame in (
    ("Training dataset topics:", train_df),
    ("Validation dataset topics:", validation_df),
):
    print(title)
    perform_topic_modeling(frame)


In [None]:
!pip install gensim

import nltk
# word_tokenize needs the Punkt sentence tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' resource instead of the pickled 'punkt' model,
# so fetch both to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')

import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def prepare_text(text):
    """Tokenize *text* and keep only alphabetic, non-stop-word tokens.

    Args:
        text: Text to tokenize with NLTK's ``word_tokenize``.

    Returns:
        A list of the tokens that are purely alphabetic and not in NLTK's
        English stop-word list.
    """
    stop_words = set(stopwords.words('english'))
    kept = []
    for word in word_tokenize(text):
        if not word.isalpha() or word in stop_words:
            continue
        kept.append(word)
    return kept

def perform_topic_modeling_gensim(df):
    """Fit a 5-topic gensim LDA model on the cleaned notes and print its topics.

    Adds (or overwrites) a ``tokens`` column on *df* holding the output of
    ``prepare_text`` for each ``section_text_clean`` value.

    Args:
        df: DataFrame with a ``section_text_clean`` column.

    Returns:
        None. Each topic is printed with its five top words.
    """
    df['tokens'] = df['section_text_clean'].apply(prepare_text)
    texts = df['tokens']

    # Vocabulary and bag-of-words corpus needed by the LDA model
    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]

    # Training settings; random_state is fixed for a reproducible fit
    lda_settings = dict(
        num_topics=5,
        random_state=42,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    lda_model = LdaModel(corpus=corpus, id2word=id2word, **lda_settings)

    for topic in lda_model.print_topics(num_words=5):
        print(topic)


# Print the gensim LDA topics of each split, training first
for title, frame in (
    ("Training dataset topics:", train_df),
    ("Validation dataset topics:", validation_df),
):
    print(title)
    perform_topic_modeling_gensim(frame)



In [None]:

# sepsis

!pip install gensim

from gensim.corpora import Dictionary
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
# word_tokenize needs the Punkt sentence tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' resource instead of the pickled 'punkt' model,
# so fetch both to work with either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Step 1: Filter Documents for Sepsis-Related Content
def filter_sepsis_documents(df):
    """Return the rows of *df* whose cleaned text mentions a sepsis keyword.

    A row is kept when ``section_text_clean`` contains one of the keywords as
    a whole word, case-insensitively.

    Args:
        df: DataFrame with a ``section_text_clean`` column.

    Returns:
        A new, independent DataFrame holding the matching rows with their
        original index labels.
    """
    sepsis_keywords = ['sepsis', 'septic', 'infection', 'bacteria', 'septicemia']
    sepsis_pattern = re.compile(r'\b(?:' + '|'.join(sepsis_keywords) + r')\b', re.IGNORECASE)
    matches = df['section_text_clean'].apply(lambda text: bool(sepsis_pattern.search(text)))
    # Return an explicit copy: the caller adds a column to the result, which
    # on a bare boolean-mask selection triggers pandas' SettingWithCopyWarning.
    return df[matches].copy()

def prepare_text(text):
    """Tokenize *text* and keep only alphabetic, non-stop-word tokens.

    Args:
        text: Text to tokenize with NLTK's ``word_tokenize``.

    Returns:
        A list of the tokens that are purely alphabetic and not in NLTK's
        English stop-word list.
    """
    stop_words = set(stopwords.words('english'))
    alphabetic = (word for word in word_tokenize(text) if word.isalpha())
    return [word for word in alphabetic if word not in stop_words]

def perform_topic_modeling_gensim(df, num_topics=5, num_top_words=10):
    """Fit a gensim LDA model on the cleaned notes in *df* and print its topics.

    Adds (or overwrites) a ``tokens`` column on *df* holding the output of
    ``prepare_text`` for each ``section_text_clean`` value.

    Args:
        df: DataFrame with a ``section_text_clean`` column.
        num_topics: Number of LDA topics to fit.
        num_top_words: Number of words printed per topic.

    Returns:
        None. Topics are printed one per line; when no terms are available a
        short notice is printed instead.
    """
    # Tokenize the text
    df['tokens'] = df['section_text_clean'].apply(prepare_text)

    # Create Dictionary and Corpus needed for Topic Modeling
    texts = df['tokens']
    id2word = Dictionary(texts)

    # A keyword filter can leave no rows (or rows with only stop words).
    # LdaModel rejects an empty vocabulary with a ValueError, so report the
    # situation and return instead of aborting the notebook.
    if len(id2word) == 0:
        print("No terms available for topic modeling; skipping.")
        return

    corpus = [id2word.doc2bow(text) for text in texts]

    # Build the LDA model; random_state is fixed for a reproducible fit
    lda_model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=42,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )

    for topic in lda_model.print_topics(num_words=num_top_words):
        print(topic)

# Training split: keep only the notes that match a sepsis keyword, then
# print the topics fitted on that subset
print("Training dataset sepsis-related topics:")
filtered_train_df = filter_sepsis_documents(train_df)
perform_topic_modeling_gensim(filtered_train_df)

# Validation split: same filter and topic model
print("Validation dataset sepsis-related topics:")
filtered_validation_df = filter_sepsis_documents(validation_df)
perform_topic_modeling_gensim(filtered_validation_df)
