# NLP Pipeline using Clinician Notes

This Jupyter notebook provides an NLP pipeline for exploring clinician notes, using Natural Language Processing (NLP) techniques to analyze and cluster clinical notes. It uses BioBERT, which is pre-trained on large-scale biomedical corpora and can generate context-aware embeddings for clinical notes. While the primary function of this notebook is to explore NLP tools for clinician-patient notes, adapting it to help identify relationships to sepsis could provide insights for clinical decision-making.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
import torch
from tqdm import tqdm
from transformers import pipeline

In [None]:
# Data directory: taken from the CLINICAL_REFS_DIR environment variable when
# set, otherwise ./clinical_refs.
directory = os.getenv('CLINICAL_REFS_DIR', "./clinical_refs")

# exist_ok=True replaces a separate os.path.exists() check, which could race
# with another process creating the directory between the check and the call.
os.makedirs(directory, exist_ok=True)

train_file_path = os.path.join(directory, 'MTS-Dialog-TrainingSet.csv')
validation_file_path = os.path.join(directory, 'MTS-Dialog-ValidationSet.csv')

train_df = pd.read_csv(train_file_path)
validation_df = pd.read_csv(validation_file_path)

# retrieve headers
train_headers = train_df.columns.tolist()
validation_headers = validation_df.columns.tolist()

print("Training dataset columns:", train_headers)
print("Validation dataset columns:", validation_headers)
print("Training dataset preview:")
print(train_df.head())
print("Validation dataset preview:")
print(validation_df.head())

# spacy model with error handling: try to load it, and on failure attempt a
# one-time download before loading again.
try:
    nlp = spacy.load("en_core_web_sm")
    print("SpaCy model loaded successfully.")
except Exception as e:
    print(f"Error loading SpaCy model: {e}")
    print("Attempting to download the model...")
    try:
        import spacy.cli
        spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")
        print("SpaCy model downloaded and loaded successfully.")
    except Exception as download_error:
        print(f"Error downloading or loading SpaCy model: {download_error}")
        # `nlp` was never bound on this path; stop here with the real cause
        # instead of continuing and failing later on a missing pipeline.
        raise

# Set random seed for reproducibility
np.random.seed(42)


###  Preprocessing 
Removing irrelevant data, tokenizing text, removing stop words, and lemmatization.

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import logging

# Root logger threshold: only ERROR and above.
logging.basicConfig(level=logging.ERROR)

# Fetch the NLTK resources, in the same order as listed here.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()

# English stop words from NLTK, as a set for O(1) membership tests.
default_stop_words = set(stopwords.words('english'))

# Domain-specific stop words (extend this set as needed)
medical_stop_words = {'patient', 'doctor', 'mg', 'ml'}

# Combined stop-word set consulted by preprocess_text.
stop_words = default_stop_words | medical_stop_words

# spaCy pipeline used for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """
    Normalize a note into a space-separated string of lemmas.

    Steps:
    1. Lowercase the text.
    2. Replace every regex non-word character with a space.
    3. Tokenize and lemmatize with the module-level spaCy pipeline ``nlp``.
    4. Drop punctuation tokens, whitespace tokens, and any token whose
       surface form or lemma is in the module-level ``stop_words`` set.

    Args:
        text: Raw note text. Non-string values (e.g. None or NaN) are
            treated as empty input.

    Returns:
        The cleaned text, or "" when the input is not a string or when
        processing fails (the failure is logged at ERROR level).
    """
    # Missing values are not an error: return the empty result directly
    # rather than relying on an AttributeError from .lower().
    if not isinstance(text, str):
        return ""

    try:
        text = re.sub(r'\W', ' ', text.lower())

        # Tokenize the text using Spacy
        doc = nlp(text)
        tokens = [
            token.lemma_
            for token in doc
            if not token.is_punct
            and not token.is_space
            and token.text not in stop_words
            # The lemma is what gets emitted, so it must be filtered too;
            # otherwise "patients" would leak out as the stop word "patient".
            and token.lemma_ not in stop_words
        ]
        return ' '.join(tokens)

    except Exception as e:
        logging.error(f"Error during text preprocessing: {e}")
        return ""

# Add the cleaned-text column to both splits (training first, then validation).
for split_df in (train_df, validation_df):
    split_df['processed_text'] = split_df['section_text'].apply(preprocess_text)

# Preview raw vs. processed text for the first rows of each split.
preview_columns = ['ID', 'section_text', 'processed_text']
for heading, split_df in (
    ("Processed Training DataFrame:", train_df),
    ("Processed Validation DataFrame:", validation_df),
):
    print(heading)
    print(split_df[preview_columns].head())


### Feature Extraction using BioBERT

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Initialize BioBERT model and tokenizer from Hugging Face.
# `model_name` is the checkpoint identifier passed to both from_pretrained()
# calls, so the tokenizer and the encoder come from the same checkpoint.
# NOTE(review): presumably this downloads the weights on first use and needs
# network access — confirm for offline environments.
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def embed_text(text, tokenizer, model):
    """
    Embed one text as the model's last-layer vector at the first token ([CLS]).

    Args:
        text: Text to embed; the tokenizer truncates it to 512 tokens.
        tokenizer: Callable that returns the model's keyword inputs as
            PyTorch tensors (e.g. a Hugging Face tokenizer).
        model: Callable that accepts those inputs and returns an object
            exposing ``last_hidden_state``.

    Returns:
        A flattened numpy array holding the first-position hidden state.
        On any failure the error is logged and a zero vector of length
        ``model.config.hidden_size`` is returned instead.
    """
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

        # Keep the inputs on the model's device. Without this a model moved
        # to an accelerator fails on every call, and each row silently
        # becomes a zero vector via the except branch below.
        device = getattr(model, "device", None)
        if device is not None and hasattr(inputs, "to"):
            inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        # Use the embeddings of the [CLS] token; .cpu() is required before
        # .numpy() whenever the tensor does not live in host memory.
        cls_embedding = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy().flatten()
        return cls_embedding
    except Exception as e:
        logging.error(f"Error during text embedding: {e}")
        return np.zeros(model.config.hidden_size)

def _embed_note(note):
    # Bind the module-level tokenizer and model so Series.apply gets a one-argument callable.
    return embed_text(note, tokenizer, model)


# One embedding vector per processed note, stored as an object column.
train_df['embedding'] = train_df['processed_text'].apply(_embed_note)
validation_df['embedding'] = validation_df['processed_text'].apply(_embed_note)

# Stack the per-row vectors into 2-D matrices (one row per note) for clustering
X_train = np.vstack(train_df['embedding'].tolist())
X_validation = np.vstack(validation_df['embedding'].tolist())

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_validation: {X_validation.shape}")


### Model Selection and Training

In [None]:
from sklearn.cluster import KMeans

def apply_kmeans_clustering(X_train, X_validation, num_clusters=2):
    """
    Fit K-Means on the training matrix and assign cluster labels to both splits.

    Args:
        X_train: Feature matrix the estimator is fitted on.
        X_validation: Feature matrix labelled with the fitted estimator.
        num_clusters: Number of clusters to form (default 2).

    Returns:
        Tuple of (training labels, validation labels, fitted KMeans estimator).

    Raises:
        Any exception from fitting or predicting, after logging it.
    """
    try:
        # fit() returns the estimator itself, so construction and fitting chain.
        estimator = KMeans(n_clusters=num_clusters, random_state=42).fit(X_train)

        labels_for_train = estimator.predict(X_train)
        labels_for_validation = estimator.predict(X_validation)

        logging.info("K-Means clustering completed successfully.")
        return labels_for_train, labels_for_validation, estimator

    except Exception as clustering_error:
        logging.error(f"Error during K-Means clustering: {clustering_error}")
        raise

try:
    num_clusters = 3  # Adjust the number of clusters as needed
    clustering_result = apply_kmeans_clustering(X_train, X_validation, num_clusters)
    train_clusters, validation_clusters, kmeans = clustering_result

    # Attach the labels to their respective splits.
    for split_df, split_labels in ((train_df, train_clusters), (validation_df, validation_clusters)):
        split_df['cluster'] = split_labels

    cluster_preview_columns = ['ID', 'section_text', 'cluster']
    print("Training DataFrame with cluster labels:")
    print(train_df[cluster_preview_columns].head())
    print("Validation DataFrame with cluster labels:")
    print(validation_df[cluster_preview_columns].head())

except Exception as setup_error:
    # Best-effort cell: the failure is logged and execution continues.
    logging.error(f"Error during K-Means clustering setup: {setup_error}")


In [None]:
from collections import Counter

def _count_phrase(tokens, phrase_tokens):
    """Count occurrences of the consecutive token sequence `phrase_tokens` in `tokens`."""
    width = len(phrase_tokens)
    if width == 0:
        return 0
    return sum(
        tokens[start:start + width] == phrase_tokens
        for start in range(len(tokens) - width + 1)
    )


def analyze_clusters(df, num_clusters):
    """
    Count sepsis-related keywords in the processed text of each cluster.

    Keywords may span several words ("organ failure"); they are matched as
    consecutive whitespace-separated tokens within a single text, so phrases
    are actually counted instead of always yielding 0.

    Args:
        df: DataFrame with a 'cluster' column and a 'processed_text' column
            of strings.
        num_clusters: Cluster ids 0 .. num_clusters - 1 are analysed.

    Returns:
        Dict mapping cluster id to a dict with:
        'total_texts' (number of rows in the cluster),
        'sepsis_keyword_count' (keyword -> occurrences), and
        'relative_frequency' (keyword -> occurrences / total tokens, or 0.0
        for a cluster with no tokens).
    """
    cluster_analysis = {}
    # NOTE(review): if 'processed_text' is lemmatized, plural forms such as
    # 'antibiotics' may never appear verbatim — confirm against the data.
    sepsis_keywords = ['sepsis', 'septic', 'infection', 'organ failure', 'blood culture', 'antibiotics']
    keyword_tokens = {keyword: keyword.split() for keyword in sepsis_keywords}

    for cluster_id in range(num_clusters):
        cluster_texts = df[df['cluster'] == cluster_id]['processed_text'].tolist()
        tokenized_texts = [text.split() for text in cluster_texts]
        total_words = sum(len(tokens) for tokens in tokenized_texts)

        sepsis_keyword_count = {
            keyword: sum(_count_phrase(tokens, phrase) for tokens in tokenized_texts)
            for keyword, phrase in keyword_tokens.items()
        }

        # An empty cluster has no tokens; report 0.0 rather than dividing by zero.
        relative_frequency = {
            keyword: (count / total_words if total_words else 0.0)
            for keyword, count in sepsis_keyword_count.items()
        }

        cluster_analysis[cluster_id] = {
            'total_texts': len(cluster_texts),
            'sepsis_keyword_count': sepsis_keyword_count,
            'relative_frequency': relative_frequency
        }

    return cluster_analysis

num_clusters = 3
cluster_analysis = analyze_clusters(train_df, num_clusters)

# Print one summary per cluster, in the order the analysis dict was built.
for cluster_id in cluster_analysis:
    analysis = cluster_analysis[cluster_id]
    print(f"\nCluster {cluster_id} Analysis:")
    print(f"Total texts: {analysis['total_texts']}")
    print("Sepsis Keyword Count:", analysis['sepsis_keyword_count'])
    print("Relative Frequency of Sepsis Keywords:", analysis['relative_frequency'])


### Model Evaluation using Silhouette Score and Davies-Bouldin Index

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

def evaluate_clustering(X, clusters):
    """
    Print the silhouette score and Davies-Bouldin index of a clustering.

    Args:
        X: Feature matrix that was clustered.
        clusters: Cluster label for each row of X.

    Returns:
        None; both metrics are printed with four decimal places.

    Raises:
        Any exception raised while computing or printing the metrics, after
        logging it at ERROR level.
    """
    try:
        silhouette_avg = silhouette_score(X, clusters)
        print(f"Silhouette Score: {silhouette_avg:.4f}")

        # Computed after the silhouette score has been printed, so a failure
        # here still leaves the first metric on screen.
        davies_bouldin_avg = davies_bouldin_score(X, clusters)
        print(f"Davies-Bouldin Index: {davies_bouldin_avg:.4f}")

        logging.info("Clustering evaluation metrics calculated successfully.")

    except Exception as e:
        logging.error(f"Error during clustering evaluation: {e}")
        raise

# Evaluate the training-set clustering using the labels stored in train_df.
evaluate_clustering(X_train, train_df['cluster'])


### Model Visualization

In [None]:
from wordcloud import WordCloud
from sklearn.manifold import TSNE

# Define sepsis-related keywords grouped by categories.
# Keys are theme names; values are the keywords (some multi-word) whose
# occurrences are summed per theme by analyze_cluster_themes.
# NOTE(review): 'multi-organ failure' contains a hyphen, while preprocess_text
# replaces non-word characters with spaces — confirm this keyword can match.
sepsis_keywords = {
    'symptoms': ['fever', 'tachycardia', 'hypotension', 'leukocytosis', 'respiratory distress'],
    'diagnosis': ['sepsis', 'septic', 'bacteremia', 'blood culture', 'inflammatory response', 'systemic'],
    'treatment': ['antibiotics', 'shock', 'organ failure', 'multi-organ failure']
}

def analyze_cluster_themes(df, num_clusters, sepsis_keywords):
    """
    Analyze the themes of each cluster based on the frequency of sepsis-related keywords.

    Keywords may span several words ("respiratory distress"); they are matched
    as consecutive whitespace-separated tokens within a single text, so
    phrases contribute to their theme instead of always counting as 0.

    Args:
        df: DataFrame with a 'cluster' column and a 'processed_text' column
            of strings.
        num_clusters: Cluster ids 0 .. num_clusters - 1 are analysed.
        sepsis_keywords: Dict mapping theme name to a list of keywords.

    Returns:
        Dict mapping cluster id to a dict of theme name -> total keyword
        occurrences in that cluster's texts.
    """
    def occurrences(tokens, phrase_tokens):
        # Number of positions where phrase_tokens appears consecutively in tokens.
        width = len(phrase_tokens)
        if width == 0:
            return 0
        return sum(
            tokens[start:start + width] == phrase_tokens
            for start in range(len(tokens) - width + 1)
        )

    def theme_total(tokenized_texts, keywords):
        # Sum the occurrences of every keyword of one theme over all texts.
        total = 0
        for keyword in keywords:
            phrase_tokens = keyword.split()
            total += sum(occurrences(tokens, phrase_tokens) for tokens in tokenized_texts)
        return total

    cluster_themes = {}

    for cluster_id in range(num_clusters):
        cluster_texts = df[df['cluster'] == cluster_id]['processed_text'].tolist()
        tokenized_texts = [text.split() for text in cluster_texts]

        cluster_themes[cluster_id] = {
            theme: theme_total(tokenized_texts, keywords)
            for theme, keywords in sepsis_keywords.items()
        }

    return cluster_themes

def plot_clusters_tsne_with_themes(X, clusters, num_clusters, cluster_themes, perplexity=40):
    """
    Project X to 2-D with t-SNE and scatter-plot it, one colour per cluster.

    Each legend entry shows the cluster id and its dominant theme, i.e. the
    theme with the highest count in ``cluster_themes`` for that cluster.

    Args:
        X: Feature matrix, one row per sample.
        clusters: Cluster label per row of X, matched by position.
        num_clusters: Cluster ids 0 .. num_clusters - 1 are plotted.
        cluster_themes: Dict mapping cluster id to a dict of theme -> count.
        perplexity: Requested t-SNE perplexity (default 40). It is reduced
            when X has too few rows, because t-SNE requires the perplexity
            to be smaller than the number of samples.

    Raises:
        Any exception raised while projecting or plotting, after logging it.
    """
    try:
        n_samples = len(X)
        effective_perplexity = min(perplexity, max(n_samples - 1, 1))

        # Apply t-SNE for dimensionality reduction
        tsne = TSNE(n_components=2, perplexity=effective_perplexity, random_state=42)
        X_tsne = tsne.fit_transform(X)

        # Create a DataFrame for t-SNE results
        tsne_df = pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2'])
        # Assign labels by position. A pandas Series would be aligned on its
        # index and produce NaN labels whenever that index is not 0..n-1.
        tsne_df['cluster'] = np.asarray(clusters)

        dominant_themes = {cluster_id: max(themes, key=themes.get) for cluster_id, themes in cluster_themes.items()}

        plt.figure(figsize=(12, 10))
        for cluster_id in range(num_clusters):
            cluster_data = tsne_df[tsne_df['cluster'] == cluster_id]
            plt.scatter(cluster_data['tsne1'], cluster_data['tsne2'], label=f'Cluster {cluster_id} ({dominant_themes[cluster_id]})', alpha=0.6)

        plt.xlabel("t-SNE Dimension 1")
        plt.ylabel("t-SNE Dimension 2")
        plt.title("t-SNE Visualization of Clusters with Themes")
        plt.legend()
        plt.show()

        logging.info("t-SNE cluster plot with themes generated successfully.")

    except Exception as e:
        logging.error(f"Error during t-SNE plotting with themes: {e}")
        raise

num_clusters = 3
cluster_themes = analyze_cluster_themes(train_df, num_clusters, sepsis_keywords)

# Print the per-theme keyword totals of every cluster before plotting.
for cluster_id in cluster_themes:
    themes = cluster_themes[cluster_id]
    print(f"\nCluster {cluster_id} Theme Analysis:")
    print(themes)

# Visualize the training embeddings, labelled with each cluster's dominant theme.
plot_clusters_tsne_with_themes(
    X_train,
    train_df['cluster'],
    num_clusters,
    cluster_themes,
)
