# NLP Analysis of Clinical Notes for Sepsis Detection and Insights

This notebook utilizes PyTorch, NLTK, spaCy to perform Named Entity Recognition (NER), to analyze sentiment, to conduct topic modeling and to extract relationships within a clinical notes dataset, with sepsis as the central diagnosis.

## Installation and Imports

In [None]:
# Installations and Imports

!pip install -q spacy transformers

import spacy
import torch

!python -m spacy download en_core_web_sm

# Check GPU availability
if torch.cuda.is_available():
    print("GPU is available and will be used for processing.")
else:
    print("GPU is not available. The CPU will be used for processing.")


## Load datasets

In [None]:
import os
import pandas as pd

# Set up the directory path
directory = os.getenv('CLINICAL_REFS_DIR', "./clinical_refs")

if not os.path.exists(directory):
    os.makedirs(directory)

# File paths
train_file_path = os.path.join(directory, 'MTS-Dialog-TrainingSet.csv')
validation_file_path = os.path.join(directory, 'MTS-Dialog-ValidationSet.csv')

# Load the datasets
train_df = pd.read_csv(train_file_path)
validation_df = pd.read_csv(validation_file_path)

# Explicitly retrieve headers
train_headers = train_df.columns.tolist()
validation_headers = validation_df.columns.tolist()

# Print the column headers to confirm the available columns
print("Training dataset columns:", train_headers)
print("Validation dataset columns:", validation_headers)

# Display the first few rows of each dataset
print("Training dataset preview:")
print(train_df.head())

print("Validation dataset preview:")
print(validation_df.head())


## Preprocess text and combine data:

In [7]:
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')

def preprocess_text(text):
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I | re.A).lower()
    # Tokenize the text
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Adjust display settings
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# Set up the directory path
directory = os.getenv('CLINICAL_REFS_DIR', "./clinical_refs")

if not os.path.exists(directory):
    os.makedirs(directory)

# File paths
train_file_path = os.path.join(directory, 'MTS-Dialog-TrainingSet.csv')
validation_file_path = os.path.join(directory, 'MTS-Dialog-ValidationSet.csv')

# Process each CSV file in the directory
dfs = []
try:
    files = [train_file_path, validation_file_path]
    for file_path in files:
        print(f"Reading file: {file_path}")
        df = pd.read_csv(file_path)
        print(f"Columns in {file_path}: {df.columns.tolist()}")
        # Check if 'section_text' column exists
        if 'section_text' in df.columns:
            df['section_text_clean'] = df['section_text'].apply(preprocess_text)
            dfs.append(df)
            print(f"Processed file: {file_path}")
        else:
            print(f"Column 'section_text' not found in file: {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")

if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    print("Combined dataframe created successfully.")
    # Display the first few rows of the specified columns
    print(combined_df[['section_text', 'section_text_clean']].head())
else:
    combined_df = None
    print("No CSV files were processed.")


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Reading file: ./clinical_refs/MTS-Dialog-TrainingSet.csv
Columns in ./clinical_refs/MTS-Dialog-TrainingSet.csv: ['ID', 'section_header', 'section_text', 'dialogue']
Processed file: ./clinical_refs/MTS-Dialog-TrainingSet.csv
Reading file: ./clinical_refs/MTS-Dialog-ValidationSet.csv
Columns in ./clinical_refs/MTS-Dialog-ValidationSet.csv: ['ID', 'section_header', 'section_text', 'dialogue']
Processed file: ./clinical_refs/MTS-Dialog-ValidationSet.csv
Combined dataframe created successfully.
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

##  Named Entity Recognition (NER) 

In [8]:
# Ensure combined_df is not None and has been properly created
if combined_df is not None and not combined_df.empty:
    import spacy
    from spacy.cli import download

    # Download the spaCy model if it's not already downloaded
    try:
        nlp = spacy.load('en_core_web_sm')
    except OSError:
        download('en_core_web_sm')
        nlp = spacy.load('en_core_web_sm')

    def perform_ner(text):
        if pd.isna(text):
            return []
        doc = nlp(text)
        return [(ent.text, ent.label_) for ent in doc.ents]

    # Iterate through rows to debug
    entities = []
    for idx, row in combined_df.iterrows():
        try:
            entities.append(perform_ner(row['section_text_clean']))
        except Exception as e:
            print(f"Error processing row {idx}: {e}, text: {row['section_text_clean']}")

    # Assign the entities to the dataframe
    combined_df['entities'] = entities

    # Display the first few rows with entities
    print(combined_df[['section_text_clean', 'entities']].head())
else:
    print("combined_df is None or empty.")


                                                                                                                                                                                                                                                                                                                                                                                                                                                            section_text_clean  \
0                                                                                                                                                                patient yearold white female presents clinic today originally hypertension med check history hypertension osteoarthritis osteoporosis hypothyroidism allergic rhinitis kidney stones since last visit followed dr kumar issues stable fever chills cough congestion nausea vomiting chest pain chest pressure   
1  patient yearold righthanded caucasian female presented emergency 

## Sentiment Analysis

In [12]:
from transformers import pipeline, AutoTokenizer

# Load the sentiment analysis pipeline with a specified model
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
sentiment_analyzer = pipeline('sentiment-analysis', model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def truncate_text(text, tokenizer, max_length=512):
    # Use tokenizer to encode the text and ensure truncation
    inputs = tokenizer(text, max_length=max_length, truncation=True, return_tensors='pt')
    # Decode back to string to verify truncation
    truncated_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
    return truncated_text

# Define a function to analyze sentiment
def analyze_sentiment(text):
    truncated_text = truncate_text(text, tokenizer)
    result = sentiment_analyzer(truncated_text)[0]
    return result['label'], result['score']

# Apply sentiment analysis to the 'section_text_clean' column
combined_df['sentiment'] = combined_df['section_text_clean'].apply(lambda x: analyze_sentiment(x))

# Display the first few rows of the DataFrame with the sentiment analysis results
combined_df[['section_text_clean', 'sentiment']].head()




Unnamed: 0,section_text_clean,sentiment
0,patient yearold white female presents clinic today originally hypertension med check history hypertension osteoarthritis osteoporosis hypothyroidism allergic rhinitis kidney stones since last visit followed dr kumar issues stable fever chills cough congestion nausea vomiting chest pain chest pressure,"(NEGATIVE, 0.9706116318702698)"
1,patient yearold righthanded caucasian female presented emergency department sudden onset headache occurring approximately morning july described headache worse life also accompanied blurry vision scotoma patient also perceived swelling face emergency department patient underwent thorough evaluation examination given migraine cocktail also given morphine total mg emergency department full details history present illness please see previous history physical,"(NEGATIVE, 0.9874310493469238)"
2,yearold female presented office complaining condylomas noted anal region noticed approximately three four weeks ago denies pain state itching symptoms associated,"(NEGATIVE, 0.9842221140861511)"
3,prescribed medications salmeterol inhaler prn fluticasone nasal inhaler patient taking counter alternative medicines,"(NEGATIVE, 0.979263961315155)"
4,burn right arm,"(NEGATIVE, 0.9915681481361389)"


## Relation Extraction

In [23]:
""" import spacy
from spacy.tokens import Span
from spacy.matcher import Matcher

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

def extract_relations(doc):
    # Define the pattern for relation extraction
    pattern = [{'DEP': 'nsubj'}, {'DEP': 'ROOT'}, {'DEP': 'dobj'}]
    
    # Initialize the matcher with the pattern
    matcher = Matcher(nlp.vocab)
    matcher.add('relation_pattern', [pattern])
    
    # Apply the matcher to the doc
    matches = matcher(doc)
    relations = []
    
    for match_id, start, end in matches:
        span = doc[start:end]
        relations.append((span.text, span.root.head.text))
    
    return relations

# Apply relation extraction to the 'section_text_clean' column
combined_df['relations'] = combined_df['section_text_clean'].apply(lambda x: extract_relations(nlp(x)))

# Display the first few rows of the DataFrame with the relation extraction results
combined_df[['section_text_clean', 'relations']].head()
 """

Unnamed: 0,section_text_clean,relations
0,patient yearold white female presents clinic today originally hypertension med check history hypertension osteoarthritis osteoporosis hypothyroidism allergic rhinitis kidney stones since last visit followed dr kumar issues stable fever chills cough congestion nausea vomiting chest pain chest pressure,[]
1,patient yearold righthanded caucasian female presented emergency department sudden onset headache occurring approximately morning july described headache worse life also accompanied blurry vision scotoma patient also perceived swelling face emergency department patient underwent thorough evaluation examination given migraine cocktail also given morphine total mg emergency department full details history present illness please see previous history physical,[]
2,yearold female presented office complaining condylomas noted anal region noticed approximately three four weeks ago denies pain state itching symptoms associated,[]
3,prescribed medications salmeterol inhaler prn fluticasone nasal inhaler patient taking counter alternative medicines,[]
4,burn right arm,[]


In [14]:
import spacy
from spacy.tokens import Span

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

def extract_relations(doc):
    relations = []
    for token in doc:
        if token.dep_ in ('nsubj', 'dobj'):
            subject = [w for w in token.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relation = (subject.text, token.head.text, token.text)
                relations.append(relation)
    return relations

# Apply relation extraction to the 'section_text_clean' column
combined_df['relations'] = combined_df['section_text_clean'].apply(lambda x: extract_relations(nlp(x)))

# Display the first few rows of the DataFrame with the relation extraction results
combined_df[['section_text_clean', 'relations']].head()


Unnamed: 0,section_text_clean,relations
0,patient yearold white female presents clinic today originally hypertension med check history hypertension osteoarthritis osteoporosis hypothyroidism allergic rhinitis kidney stones since last visit followed dr kumar issues stable fever chills cough congestion nausea vomiting chest pain chest pressure,"[(clinic, check, clinic), (clinic, check, med), (clinic, check, stones), (chills, cough, chills), (nausea, vomiting, nausea), (nausea, vomiting, pressure)]"
1,patient yearold righthanded caucasian female presented emergency department sudden onset headache occurring approximately morning july described headache worse life also accompanied blurry vision scotoma patient also perceived swelling face emergency department patient underwent thorough evaluation examination given migraine cocktail also given morphine total mg emergency department full details history present illness please see previous history physical,"[(yearold, presented, yearold), (headache, described, headache), (life, accompanied, life), (life, accompanied, vision), (patient, perceived, patient), (patient, underwent, patient), (examination, given, examination), (examination, given, illness)]"
2,yearold female presented office complaining condylomas noted anal region noticed approximately three four weeks ago denies pain state itching symptoms associated,"[(condylomas, noted, condylomas), (region, noticed, region), (symptoms, associated, symptoms)]"
3,prescribed medications salmeterol inhaler prn fluticasone nasal inhaler patient taking counter alternative medicines,"[(patient, taking, patient), (patient, taking, medicines)]"
4,burn right arm,[]


## Topic Modeling

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Define the number of topics
num_topics = 5

# Vectorize the text data
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(combined_df['section_text_clean'])

# Apply LDA for topic modeling
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# Display the top words for each topic
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

# Display the topics
num_top_words = 10
display_topics(lda, vectorizer.get_feature_names_out(), num_top_words)


Topic 0:
use patient drug alcohol history denies lives years married tobacco
Topic 1:
pain patient left right states yearold dr time knee symptoms
Topic 2:
history disease negative died age cancer diabetes father mother hypertension
Topic 3:
surgery patient pain risks yearold symptoms noted sensation treatment extremity
Topic 4:
patient today yearold denies time past states pain mg history


In [20]:
!pip install gensim

import nltk
nltk.download('punkt')

import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Prepare the text for LDA
def prepare_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

# Tokenize the text
combined_df['tokens'] = combined_df['section_text_clean'].apply(prepare_text)

# Create Dictionary and Corpus needed for Topic Modeling
id2word = corpora.Dictionary(combined_df['tokens'])
texts = combined_df['tokens']
corpus = [id2word.doc2bow(text) for text in texts]

# Build the LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=5, random_state=42, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True)

# Extract the topics
topics = lda_model.print_topics(num_words=5)

# Display the topics
for topic in topics:
    print(topic)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


(0, '0.034*"pain" + 0.015*"patient" + 0.014*"left" + 0.012*"yearold" + 0.011*"negative"')
(1, '0.087*"aspirin" + 0.063*"mg" + 0.061*"patient" + 0.060*"upper" + 0.058*"occurrence"')
(2, '0.100*"gi" + 0.099*"bleed" + 0.051*"atrial" + 0.051*"fibrillation" + 0.050*"melena"')
(3, '0.036*"patient" + 0.018*"pain" + 0.018*"history" + 0.013*"states" + 0.013*"denies"')
(4, '0.015*"right" + 0.013*"patient" + 0.010*"well" + 0.009*"none" + 0.008*"tobacco"')


In [None]:
# Results Summary with Topic Modeling
"""
This cell creates a summary of all the results obtained from the various NLP tasks
(Named Entity Recognition, Sentiment Analysis, Relation Extraction, and Topic Modeling).
It provides insights into the analysis of synthetic clinical notes related to sepsis.
"""

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Display summary of Named Entity Recognition (NER)
print("Named Entity Recognition (NER) Results:")
print(combined_df[['illness_clean', 'entities']].head(), "\n")

# Display summary of Sentiment Analysis
print("Sentiment Analysis Results:")
print(combined_df[['illness_clean', 'sentiment']].head(), "\n")

# Display summary of Relation Extraction
print("Relation Extraction Results:")
print(combined_df[['illness_clean', 'relationships']].head(), "\n")

# Define a function to display topics
def display_topics(model, feature_names, no_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        topics[topic_idx] = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topics

# Apply CountVectorizer to the 'illness_clean' column
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
count_data = count_vectorizer.fit_transform(combined_df['illness_clean'])

# Apply Latent Dirichlet Allocation (LDA) for topic modeling
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(count_data)

# Display the topics
no_top_words = 10
topics = display_topics(lda, count_vectorizer.get_feature_names_out(), no_top_words)
print("Topic Modeling Results:")
for topic, words in topics.items():
    print(f"Topic {topic}: {', '.join(words)}")


In [21]:
# Display summary of Named Entity Recognition (NER)
print("Named Entity Recognition (NER) Results:")
print(combined_df[['section_text_clean', 'entities']].head(), "\n")

# Display summary of Sentiment Analysis
print("Sentiment Analysis Results:")
print(combined_df[['section_text_clean', 'sentiment']].head(), "\n")

# Display summary of Relation Extraction
print("Relation Extraction Results:")
print(combined_df[['section_text_clean', 'specific_relations', 'general_relations']].head(), "\n")

# Define a function to display topics
def display_topics(model, feature_names, no_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        topics[topic_idx] = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
    return topics

# Apply CountVectorizer to the 'section_text_clean' column
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
count_data = count_vectorizer.fit_transform(combined_df['section_text_clean'])

# Apply Latent Dirichlet Allocation (LDA) for topic modeling
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(count_data)

# Display the topics
no_top_words = 10
topics = display_topics(lda, count_vectorizer.get_feature_names_out(), no_top_words)
print("Topic Modeling Results:")
for topic, words in topics.items():
    print(f"Topic {topic}: {', '.join(words)}")


Named Entity Recognition (NER) Results:
                                                                                                                                                                                                                                                                                                                                                                                                                                                            section_text_clean  \
0                                                                                                                                                                patient yearold white female presents clinic today originally hypertension med check history hypertension osteoarthritis osteoporosis hypothyroidism allergic rhinitis kidney stones since last visit followed dr kumar issues stable fever chills cough congestion nausea vomiting chest pain chest pressure   
1  patient yearold righthand

KeyError: "['specific_relations', 'general_relations'] not in index"