In [None]:
#  INSTALL IMPORT LIBRARIES
!pip install nbstripout
!nbstripout Summarization_of_Medical_Report.ipynb
!pip install bertopic[visualization] sentence-transformers pandas scikit-learn nltk --quiet
!pip install seaborn matplotlib pandas

In [None]:
# Importing Libraries
import pandas as pd
import re
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
# Download the NLTK 'punkt' resource.
# NOTE(review): the tokenizer used later is built by hand from an empty
# PunktParameters object and sent_tokenize is not called in the visible cells,
# so this download may be unnecessary — confirm before removing.
nltk.download('punkt')

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer
from nltk.tokenize import sent_tokenize

In [None]:
# LOAD DATA
# The CSV is read from a path relative to the notebook's working directory.
DATA_FILE = "mtsamples.csv"
df = pd.read_csv(DATA_FILE)

In [None]:
# Keep only the rows whose transcription is present and not blank,
# then collect the surviving texts as a plain list for the topic model.
transcription_col = df['transcription']
has_text = transcription_col.notnull() & (transcription_col.str.strip() != '')
df = df[has_text]
documents = df['transcription'].tolist()

In [None]:
# INITIALIZE BERTopic
# Build the two pluggable components first, then hand both to BERTopic.
vectorizer_model = CountVectorizer(stop_words="english")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
)

In [None]:
# Fit the topic model on the whole corpus and unpack the result
# into the topic assignments and their associated probabilities.
fit_output = topic_model.fit_transform(documents)
topics, probs = fit_output

In [None]:
# VIEW TOPICS
# Nothing is exported in this cell; it only shows the first 10 rows of the
# topic-info table produced by the fitted model.
top_topics = topic_model.get_topic_info().head(10)
print("Top 10 Topics:")
# Rich display (the same call the notebook uses for subset_df) instead of
# print(), so the frame is rendered as a table rather than as plain text.
display(top_topics)

In [None]:
#  VISUALIZATIONS
# The call is the cell's final expression, so the notebook renders the
# object it returns as the cell output.
topic_model.visualize_topics()       # Cluster map

In [None]:
# Final expression of the cell: the returned heatmap object is rendered as the output.
topic_model.visualize_heatmap()      # Similarity heatmap

In [None]:
# Final expression of the cell: the bar chart, limited to 10 topics via
# top_n_topics=10, is rendered as the output.
topic_model.visualize_barchart(top_n_topics=10)  # Top 10 topics bar chart

In [None]:
# TREATMENT MENTION EXTRACTION + CRITICALITY ANALYSIS

# Work on an independent copy of the first rows of the filtered frame,
# and keep their transcription texts as a list for the topic model.
SUBSET_SIZE = 30
subset_df = df.iloc[:SUBSET_SIZE].copy()
subset_docs = subset_df['transcription'].to_list()

In [None]:
# Run the already-fitted model on the subset and unpack the result
# into topic assignments and their associated probabilities.
subset_assignment = topic_model.transform(subset_docs)
subset_topics, subset_probs = subset_assignment

In [None]:
# Regular-expression patterns that mark a sentence as treatment-related.
# extract_treatments searches each sentence for these with re.IGNORECASE.
treatment_keywords = [
    r'\bmedicat(ed|ion|ions)?\b',
    r'\bprescribed\b',
    r'\badminister(ed|ing)?\b',
    r'\bdose\b',
    r'\btherapy\b',
    r'\binjection\b',
    r'\bsurgery\b',
    r'\btreatment\b',
    r'\bprocedure\b',
    r'\bchemotherapy\b',
    r'\bradiation\b',
]

In [None]:
# Initialize sentence tokenizer manually (avoid punkt_tab error)
# The PunktParameters object is created empty and passed straight to the
# tokenizer; nothing is set on it before use.
# NOTE(review): with no abbreviations configured, tokens such as "Dr." or "mg."
# presumably end a sentence — confirm this is acceptable for clinical notes.
punkt_param = PunktParameters()
tokenizer = PunktSentenceTokenizer(punkt_param)

In [None]:
# Function to extract treatment mentions
def extract_treatments(text):
    """Return the sentences of ``text`` that mention a treatment.

    The text is split with the module-level ``tokenizer``; a sentence is kept
    when at least one pattern in ``treatment_keywords`` is found in it
    (case-insensitive search).

    Args:
        text: The document to scan.

    Returns:
        A list of matching sentences in their original order, or the
        single-item list ``["No treatment mentioned"]`` when none match.
    """
    hits = []
    for sentence in tokenizer.tokenize(text):
        for pattern in treatment_keywords:
            if re.search(pattern, sentence, re.IGNORECASE):
                hits.append(sentence)
                break  # one matching pattern is enough for this sentence
    return hits or ["No treatment mentioned"]

In [None]:
# Function to extract criticality score from topic probability
def get_criticality(prob):
    """Convert a topic probability into the notebook's criticality score.

    The score is simply the probability cast to ``float``; no other weighting
    is applied.

    Args:
        prob: A value accepted by ``float()``, or ``None``.

    Returns:
        ``float(prob)``, or ``0.0`` when ``prob`` is ``None``.
    """
    if prob is None:
        return 0.0
    return float(prob)

In [None]:
# Apply treatment extraction and severity scoring
# The subset texts are aligned to the frame's index before extraction; the
# score column is derived from the subset probabilities, one entry per row.
subset_texts = pd.Series(subset_docs, index=subset_df.index)
subset_df['treatment_mentions'] = subset_texts.apply(extract_treatments)
subset_df['criticality_score'] = list(map(get_criticality, subset_probs))
subset_df['topic'] = subset_topics

In [None]:
# Show the columns of interest for the subset.
# Nothing is written to disk in this cell.
display_columns = ['description', 'treatment_mentions', 'criticality_score', 'topic']
subset_view = subset_df.loc[:, display_columns]
display(subset_view)

In [None]:
# Average the criticality score per topic and draw it as a one-column heatmap.
topic_group = (
    subset_df
    .groupby("topic")["criticality_score"]
    .mean()
    .reset_index()
)
heatmap_data = topic_group.pivot_table(values="criticality_score", index="topic", aggfunc="mean")

# Explicit figure/axes objects instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    heatmap_data,
    annot=True,
    cmap="Reds",
    cbar_kws={'label': 'Avg Criticality'},
    ax=ax,
)
ax.set_title("Average Criticality Score by Topic", fontsize=14, fontweight="bold")
ax.set_ylabel("Topic ID")
ax.set_xlabel("")
plt.show()

In [None]:
# Sort the subset cases by criticality, highest first
all_cases = subset_df.sort_values(by="criticality_score", ascending=False)

plt.figure(figsize=(10,12))
# seaborn >= 0.13 deprecates passing `palette` without `hue`; colouring by the
# y variable with the legend switched off keeps the per-bar viridis colours
# without triggering the deprecation warning.
# NOTE(review): rows that share the same description presumably collapse into
# a single aggregated bar — confirm descriptions are unique within the subset.
sns.barplot(
    x="criticality_score",
    y="description",
    hue="description",
    data=all_cases,
    palette="viridis",
    legend=False,
)
# The case count comes from the data so the title stays correct when the
# subset holds fewer (or more) than 30 rows.
plt.title(f"All {len(all_cases)} Cases by Criticality Score", fontsize=14, fontweight="bold")
plt.xlabel("Criticality Score")
plt.ylabel("Case Description")
plt.xlim(0,1.05)
plt.show()