In [None]:
!pip install -q spacy transformers torch
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
from transformers import pipeline
import json

# Model set-up: everything below is created once and shared (as module-level
# globals) by the helper functions defined in later cells.
print("Loading models...")

# spaCy English pipeline; later cells read entities, noun chunks and
# dependency subtrees from the documents it produces.
nlp = spacy.load("en_core_web_sm")

# Summarization pipeline backed by the google/pegasus-xsum checkpoint.
summarizer = pipeline(task="summarization", model="google/pegasus-xsum")

# Text-classification pipeline backed by a DistilBERT SST-2 checkpoint;
# later cells read the "label" field of its predictions.
sentiment_model = pipeline(
    task="text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

print("Models ready ✔")




Loading models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Models ready ✔


In [None]:
def extract_patient_name(text):
    """Return the first PERSON entity found in ``text``.

    Args:
        text: Transcript (or any string) to run through the spaCy pipeline.

    Returns:
        The surface text of the first entity labelled ``PERSON`` in document
        order, or the literal ``"Unknown"`` when no such entity exists.
    """
    # NOTE(review): the first PERSON mentioned is not necessarily the patient
    # (a named clinician appearing earlier would be returned) — confirm this
    # is acceptable for the transcripts being processed.
    person_names = (ent.text for ent in nlp(text).ents if ent.label_ == "PERSON")
    return next(person_names, "Unknown")


def keyword_extract(text, top_n=10):
    """List the distinct noun chunks of ``text`` in order of first appearance.

    Args:
        text: String to run through the spaCy pipeline.
        top_n: Maximum number of chunks to return (applied as a slice bound).

    Returns:
        A list of noun-chunk strings longer than three characters, with
        exact duplicates removed and original order kept, cut to ``top_n``.
    """
    seen_phrases = {}  # dict used as an insertion-ordered set
    for chunk in nlp(text).noun_chunks:
        phrase = chunk.text
        if len(phrase) > 3:
            seen_phrases.setdefault(phrase, None)
    return list(seen_phrases)[:top_n]


In [None]:
# Keyword lists driving the rule-based extraction in extract_entities().
# Entries without a space are matched as substrings of each lower-cased token;
# entries containing a space are matched as phrases against the whole text.
SYMPTOM_PATTERNS   = ["pain","ache","stiffness","hurt","discomfort","injury","impact"]
TREATMENT_PATTERNS = ["physiotherapy","session","painkiller","medication","therapy"]
DIAGNOSIS_PATTERNS = ["whiplash","strain","fracture"]
PROGNOSIS_PATTERNS = ["recovery","improving","full recovery","no long-term damage"]

def extract_entities(text):
    """Collect symptom, treatment, diagnosis and prognosis mentions from ``text``.

    The text is lower-cased before parsing, so all returned strings are
    lower-case.

    Args:
        text: Transcript (or any string) to analyse.

    Returns:
        A dict with the keys ``"Symptoms"``, ``"Treatment"``, ``"Diagnosis"``
        and ``"Prognosis"``. Each value is a list of unique strings in a
        deterministic order: token-based matches in document order, followed
        by any multi-word phrases found in the text.
    """
    lowered = text.lower()
    doc = nlp(lowered)
    # Collapse runs of whitespace so a phrase still matches across line breaks.
    flattened = " ".join(lowered.split())

    def collect(patterns):
        # A pattern containing a space can never be a substring of one token,
        # so such patterns are searched for in the whole text instead.
        word_patterns = [p for p in patterns if " " not in p]
        phrase_patterns = [p for p in patterns if " " in p]

        # dict keys act as an insertion-ordered set, which keeps the result
        # stable from run to run (a plain set would not).
        found = {}

        # NOTE(review): substring matching is deliberately kept from the
        # original, but it means "pain" also fires on the token "painkiller",
        # so a painkiller mention is reported under Symptoms as well.
        for token in doc:
            if any(p in token.text for p in word_patterns):
                span = " ".join(t.text for t in token.subtree)
                found.setdefault(span.strip(), None)

        for phrase in phrase_patterns:
            if phrase in flattened:
                found.setdefault(phrase, None)

        return list(found)

    return {
        "Symptoms":  collect(SYMPTOM_PATTERNS),
        "Treatment": collect(TREATMENT_PATTERNS),
        "Diagnosis": collect(DIAGNOSIS_PATTERNS),
        "Prognosis": collect(PROGNOSIS_PATTERNS)
    }


In [None]:
def build_structured_summary(text):
    """Assemble the flat, JSON-serialisable summary of a transcript.

    Args:
        text: Transcript to summarise.

    Returns:
        A dict with the keys ``Patient_Name``, ``Symptoms``, ``Diagnosis``,
        ``Treatment``, ``Current_Status`` and ``Prognosis`` (in that order).
        The entity fields hold the list returned by ``extract_entities`` or,
        when that list is empty, the string ``"Not clearly stated"``.
    """
    placeholder = "Not clearly stated"
    entities = extract_entities(text)

    summary = {"Patient_Name": extract_patient_name(text)}
    for field in ("Symptoms", "Diagnosis", "Treatment"):
        summary[field] = entities[field] or placeholder

    # NOTE(review): the status text is a fixed string triggered by the word
    # "occasional" anywhere in the transcript; it is not derived from what
    # the transcript actually says is occasional — confirm this is intended.
    if "occasional" in text.lower():
        summary["Current_Status"] = "Occasional backache"
    else:
        summary["Current_Status"] = placeholder

    summary["Prognosis"] = entities["Prognosis"] or placeholder
    return summary


In [None]:
def summarize_text(text, max_length=160, min_length=50):
    """Summarise ``text`` with the shared ``summarizer`` pipeline.

    The generation bounds are clamped to the size of the input. With fixed
    bounds, a transcript shorter than ``min_length`` tokens forces the model
    to keep generating past the available content, which produced invented
    details in the summary.

    Args:
        text: Text to summarise.
        max_length: Upper bound on generated tokens; the effective bound is
            never larger than the number of input tokens.
        min_length: Lower bound on generated tokens; the effective bound is
            never larger than half of the effective ``max_length``.

    Returns:
        The ``summary_text`` of the first pipeline result, or ``""`` when
        ``text`` is empty or whitespace only (the model is not called).
    """
    if not text.strip():
        return ""

    # Measure the input with the model's own tokenizer; truncation keeps the
    # count within the model's input limit for very long transcripts.
    input_tokens = len(summarizer.tokenizer.encode(text, truncation=True))

    effective_max = max(1, min(max_length, input_tokens))
    effective_min = min(min_length, effective_max // 2)

    result = summarizer(
        text,
        max_length=effective_max,
        min_length=effective_min,
        do_sample=False,
        truncation=True,  # cut over-long inputs instead of failing in the model
    )
    return result[0]["summary_text"]


In [None]:
import re

def extract_patient_lines(text):
    """Return what the patient said, one entry per transcript line.

    A line counts as a patient turn when, after stripping surrounding
    whitespace, it starts with the word "patient" (any case) followed by a
    colon or by whitespace. The speaker label, the separator and the
    whitespace around it are removed.

    Args:
        text: Transcript with one speaker turn per line.

    Returns:
        A list of non-empty utterance strings in transcript order. Lines that
        merely begin with the letters "patient" (e.g. "Patients ...") and
        turns with no content after the label are left out.
    """
    # Requiring a separator after the label keeps words such as "Patients"
    # from being mistaken for a speaker label.
    speaker_prefix = re.compile(r"^patient(?:\s*:\s*|\s+)", flags=re.I)

    utterances = []
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        match = speaker_prefix.match(line)
        if match is None:
            continue
        utterance = line[match.end():].strip()
        if utterance:
            utterances.append(utterance)
    return utterances


# Keyword triggers used by detect_intent(). The dict's insertion order is the
# priority order: detect_intent() returns the first intent with a matching keyword.
INTENT_RULES = {
    "Seeking reassurance": [
        "worried",
        "concern",
        "hope",
    ],
    "Reporting symptoms": [
        "pain",
        "ache",
        "hurt",
        "stiffness",
    ],
    "Recovery update": [
        "better",
        "improving",
        "recovering",
    ],
}


def classify_patient_sentiment(text):
    """Reduce the sentiment of all patient turns to a single label.

    Each non-blank patient line is classified with ``sentiment_model``. A
    ``NEGATIVE`` prediction is recorded as "Anxious"; any other prediction is
    recorded as "Reassured" when the line contains "better" and as "Neutral"
    otherwise.

    Args:
        text: Full transcript; patient turns are taken from
            ``extract_patient_lines``.

    Returns:
        ``"Anxious"`` if any line was anxious, else ``"Reassured"`` if any
        line was reassured, else ``"Neutral"`` (also when there are no
        patient lines).
    """
    sentiments = []

    for line in extract_patient_lines(text):
        if not line.strip():
            continue

        # The character cap only bounds the amount of text handed over; it
        # does not guarantee the input fits the model's token limit, so the
        # tokenizer is also told to truncate.
        capped = line[:2000]
        label = sentiment_model(capped, truncation=True)[0]["label"]

        if label == "NEGATIVE":
            sentiments.append("Anxious")
        elif "better" in line.lower():
            sentiments.append("Reassured")
        else:
            sentiments.append("Neutral")

    # Precedence: one anxious line outweighs any number of reassured ones.
    if "Anxious" in sentiments:
        return "Anxious"
    if "Reassured" in sentiments:
        return "Reassured"
    return "Neutral"


def detect_intent(text):
    """Pick the patient's intent from keyword hits in their own lines.

    All patient lines are joined with spaces and lower-cased; the intents in
    ``INTENT_RULES`` are then tried in dict order, and the first one with any
    keyword occurring as a substring is returned.

    Args:
        text: Full transcript; patient turns are taken from
            ``extract_patient_lines``.

    Returns:
        The matching intent name, or ``"General conversation"`` when no
        keyword occurs.
    """
    patient_speech = " ".join(extract_patient_lines(text)).lower()

    matching_intents = (
        intent
        for intent, keywords in INTENT_RULES.items()
        if any(keyword in patient_speech for keyword in keywords)
    )
    return next(matching_intents, "General conversation")


def sentiment_intent_json(text):
    """Bundle the patient's sentiment and intent into one dict.

    Args:
        text: Full transcript.

    Returns:
        ``{"Sentiment": ..., "Intent": ...}`` with the results of
        ``classify_patient_sentiment`` and ``detect_intent`` respectively.
    """
    sentiment = classify_patient_sentiment(text)
    intent = detect_intent(text)
    return {"Sentiment": sentiment, "Intent": intent}


In [None]:
def generate_SOAP(text):
    """Build a SOAP-style note (Subjective/Objective/Assessment/Plan) as a dict.

    Entity-based fields are the comma-joined results of ``extract_entities``
    (or ``"Not specified"`` when empty); the history field is the output of
    ``summarize_text``.

    Args:
        text: Full transcript.

    Returns:
        A nested dict with the top-level keys ``Subjective``, ``Objective``,
        ``Assessment`` and ``Plan``.
    """
    entities = extract_entities(text)
    lowered = text.lower()

    def joined(key):
        # Comma-separated entity list, or the placeholder for an empty list.
        return ", ".join(entities[key]) or "Not specified"

    subjective = {
        "Chief_Complaint": joined("Symptoms"),
        "History_of_Present_Illness": summarize_text(text),
    }

    # NOTE(review): the exam, observation, severity and follow-up texts below
    # are fixed strings (some keyed on a single phrase in the transcript),
    # not findings extracted from it — confirm this is intended.
    if "full range" in lowered:
        physical_exam = "Full range of motion, no tenderness"
    else:
        physical_exam = "Not documented"
    objective = {
        "Physical_Exam": physical_exam,
        "Observations": "Patient appears in normal health",
    }

    if "improving" in lowered:
        severity = "Mild, improving"
    else:
        severity = "Not specified"
    assessment = {
        "Diagnosis": joined("Diagnosis"),
        "Severity": severity,
    }

    plan = {
        "Treatment": joined("Treatment"),
        "Follow-Up": "Return if symptoms worsen or persist",
    }

    return {
        "Subjective": subjective,
        "Objective": objective,
        "Assessment": assessment,
        "Plan": plan,
    }


In [None]:
def run_pipeline(text):
    """Run every analysis on ``text`` and print the results section by section.

    Each section prints its header first and only then computes and prints
    its content. Dict results are printed as indented JSON; the keyword list
    is printed as a plain Python list.

    Args:
        text: Full transcript to analyse.

    Returns:
        None. All output goes to stdout.
    """
    # Each renderer is a thunk so the work happens after its header is printed.
    sections = (
        ("\n=== STRUCTURED SUMMARY ===",
         lambda: json.dumps(build_structured_summary(text), indent=2)),
        ("\n=== KEYWORDS ===",
         lambda: keyword_extract(text)),
        ("\n=== SENTIMENT & INTENT ===",
         lambda: json.dumps(sentiment_intent_json(text), indent=2)),
        ("\n=== SOAP NOTE ===",
         lambda: json.dumps(generate_SOAP(text), indent=2)),
    )

    for header, render in sections:
        print(header)
        print(render())

# Usage hint shown once when this cell is executed.
print("\nReady ✔  Paste transcript in next cell and call run_pipeline(text)\n")



Ready ✔  Paste transcript in next cell and call run_pipeline(text)



In [None]:
# Sample doctor–patient transcript used to exercise the pipeline end to end.
# Each speaker turn sits on its own line; extract_patient_lines() picks out
# the patient's turns by checking that a line starts with "patient".
text = """
Doctor: How are you feeling today?
Patient: I had a car accident. My neck and back hurt a lot for four weeks.
Doctor: Did you receive treatment?
Patient: Yes, I had ten physiotherapy sessions, and now I only have occasional back pain.


"""

# Prints the structured summary, keywords, sentiment/intent and SOAP note.
run_pipeline(text)



=== STRUCTURED SUMMARY ===
{
  "Patient_Name": "Unknown",
  "Symptoms": [
    "occasional back pain",
    "back hurt a lot for four weeks ."
  ],
  "Diagnosis": "Not clearly stated",
  "Treatment": [
    "ten physiotherapy sessions",
    "physiotherapy"
  ],
  "Current_Status": "Occasional backache",
  "Prognosis": "Not clearly stated"
}

=== KEYWORDS ===
['a car accident', 'My neck', 'four weeks', 'Doctor', 'treatment', 'Patient', 'ten physiotherapy sessions', 'occasional back pain']

=== SENTIMENT & INTENT ===


Your max_length is set to 160, but your input_length is only 55. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)


{
  "Sentiment": "Anxious",
  "Intent": "Reporting symptoms"
}

=== SOAP NOTE ===
{
  "Subjective": {
    "Chief_Complaint": "occasional back pain, back hurt a lot for four weeks .",
    "History_of_Present_Illness": "Patient: I'm in my 70s and I've been to the doctor once a week for the last five years because of back and neck pain.. patient: Yes, I had ten physiotherapy sessions, and now I only have occasional back pain."
  },
  "Objective": {
    "Physical_Exam": "Not documented",
    "Observations": "Patient appears in normal health"
  },
  "Assessment": {
    "Diagnosis": "Not specified",
    "Severity": "Not specified"
  },
  "Plan": {
    "Treatment": "ten physiotherapy sessions, physiotherapy",
    "Follow-Up": "Return if symptoms worsen or persist"
  }
}
