<a href="https://colab.research.google.com/github/VrindaBajaj20/Medical_Report_sum/blob/main/Medical_Report_Summarizer_persipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Download libraries

In [None]:
# Install necessary packages (run once)
!pip install spacy pytesseract pdf2image python-dateutil pandas PyMuPDF
!python -m spacy download en_core_web_sm
!sudo apt-get install tesseract-ocr -y

##import libraries

In [None]:
import re
import spacy
from spacy.matcher import Matcher
from datetime import datetime
from dateutil import parser
from typing import Dict, List
import pandas as pd
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from IPython.display import display, Markdown

##extractor and summarizer

In [None]:
class MedicalReportProcessor:
    def __init__(self):
        # Load SpaCy English model
        self.nlp = spacy.load("en_core_web_sm")

        # Initialize matcher for medical terms (pain, meds, procedures)
        self.matcher = Matcher(self.nlp.vocab)
        self._add_medical_patterns()

        # Regex templates for structured extraction (made more flexible)
        self.templates = {
            'demographics': {
                'name': r"Patient Name\s*[:\-]\s*(.*?)\s*(?:Referring MD|Doctor|Physician|$)",
                'dob': r"Date of Birth\s*[:\-]\s*([\d/.-]+)",
                'age': r"Age\s*[:\-]\s*(\d{1,3})",
                'gender': r"Gender\s*[:\-]\s*(Male|Female|Other|M|F)",
                'physician': r"Referring MD\s*[:\-]\s*(.*?)\n"
            },
            'diagnosis': {
                'primary': r"DIAGNOSIS\s*[:\-]\s*(.*?)\n",
                'icd_code': r"ICD-10\s*[:\-]\s*([A-Z]\d{2}\.?\d*)"
            },
            'treatment': {
                'plan': r"PLAN\s*[:\-]\s*(.*?)(?=\n\n|$)",
                'medications': r"Current Medications\s*[:\-]\s*(.*?)\n"
            },
            'progress': {
                'initial': r"Initial Condition.*?[:\-]\s*(.*?)\n",
                'current': r"Current Status.*?[:\-]\s*(.*?)\n",
                'goals': r"Goals.*?[:\-]\s*(.*?)(?=\n\n|$)"
            }
        }

    def _add_medical_patterns(self):
        # Patterns for pain symptoms
        pain_patterns = [
            [{"LOWER": "pain"}],
            [{"LOWER": "discomfort"}],
            [{"LOWER": "ache"}],
            [{"LOWER": "burning"}],
            [{"LOWER": "throbbing"}]
        ]
        self.matcher.add("PAIN", pain_patterns)

        # Simple patterns for medications (assuming drug names are capitalized entities or ending with 'ine', 'ol', etc.)
        med_patterns = [
            [{"ENT_TYPE": "DRUG"}],  # SpaCy builtin might detect some drugs
            [{"LOWER": {"REGEX": "(aspirin|ibuprofen|paracetamol|metformin|amoxicillin|atorvastatin)"} }],
            [{"LOWER": {"REGEX": ".*ine$|.*ol$|.*cin$"}}],  # basic drug suffixes
        ]
        self.matcher.add("MEDICATION", med_patterns)

        # Procedure keywords
        proc_patterns = [
            [{"LOWER": "surgery"}],
            [{"LOWER": "injection"}],
            [{"LOWER": "therapy"}],
            [{"LOWER": "operation"}],
            [{"LOWER": "procedure"}]
        ]
        self.matcher.add("PROCEDURE", proc_patterns)

    def extract_structured_data(self, text: str) -> Dict:
        """Extract structured data using regex patterns with flexible matching."""
        results = {}
        for section, fields in self.templates.items():
            results[section] = {}
            for field, pattern in fields.items():
                match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
                if match:
                    value = match.group(1).strip()
                    # Clean common trailing characters/spaces
                    value = re.sub(r'[\n\r]+', ' ', value)
                    results[section][field] = value
                else:
                    results[section][field] = None  # Explicitly mark missing fields
        return results

    def analyze_unstructured_text(self, text: str) -> Dict:
        """Analyze unstructured text with spaCy matcher and entities."""
        doc = self.nlp(text)
        matches = self.matcher(doc)

        entities = {
            "symptoms": set(),
            "medications": set(),
            "procedures": set(),
            "dates": set(),
        }

        # Collect matches
        for match_id, start, end in matches:
            label = self.nlp.vocab.strings[match_id]
            span = doc[start:end].text.strip()
            if label == "PAIN":
                entities["symptoms"].add(span)
            elif label == "MEDICATION":
                entities["medications"].add(span)
            elif label == "PROCEDURE":
                entities["procedures"].add(span)

        # Also extract spaCy's DATE entities
        for ent in doc.ents:
            if ent.label_ == "DATE":
                entities["dates"].add(ent.text.strip())

        # Extract top 5 key sentences containing medical terms or entities
        key_sentences = []
        for sent in doc.sents:
            sent_text = sent.text.strip()
            if any(token.ent_type_ in ["DRUG"] for token in sent):
                key_sentences.append(sent_text)
                continue
            # Or contains medical terms
            if any(word.lower_ in entities["symptoms"] or word.lower_ in entities["procedures"] for word in sent):
                key_sentences.append(sent_text)
            if len(key_sentences) >= 5:
                break

        return {
            "entities": {k: list(v) for k, v in entities.items()},
            "key_sentences": key_sentences
        }

    def generate_summary(self, structured_data: Dict, analysis: Dict) -> Dict:
        """Generate a clean, comprehensive summary dictionary."""
        def safe_get(d, key):
            return d.get(key) if d else None

        summary = {
            "patient_info": structured_data.get("demographics", {}),
            "medical_condition": {
                "diagnosis": safe_get(structured_data.get("diagnosis", {}), "primary") or "Not specified",
                "icd_code": safe_get(structured_data.get("diagnosis", {}), "icd_code") or "N/A",
                "symptoms": analysis["entities"].get("symptoms", [])
            },
            "treatment": {
                "plan": safe_get(structured_data.get("treatment", {}), "plan") or "Not specified",
                "medications": analysis["entities"].get("medications", []),
                "procedures": analysis["entities"].get("procedures", [])
            },
            "progress": {
                "initial_condition": safe_get(structured_data.get("progress", {}), "initial") or "Not specified",
                "current_status": safe_get(structured_data.get("progress", {}), "current") or "Not specified",
                "goals": safe_get(structured_data.get("progress", {}), "goals") or "Not specified",
                "key_findings": analysis.get("key_sentences", [])
            },
            "timeline": {
                "dates": analysis["entities"].get("dates", []),
                "age": safe_get(structured_data.get("demographics", {}), "age") or "N/A"
            }
        }
        return summary

    def process_report(self, text: str) -> Dict:
        """Complete end-to-end processing of medical report text."""
        structured_data = self.extract_structured_data(text)
        analysis = self.analyze_unstructured_text(text)
        summary = self.generate_summary(structured_data, analysis)
        return summary

    def generate_human_readable(self, summary: Dict) -> str:
        """Generate a readable markdown report."""
        output = f"# Medical Summary Report\n\n"
        pi = summary.get('patient_info', {})
        mc = summary.get('medical_condition', {})
        tr = summary.get('treatment', {})
        pr = summary.get('progress', {})
        tl = summary.get('timeline', {})

        output += f"## Patient Demographics\n"
        output += f"- **Name:** {pi.get('name') or 'N/A'}\n"
        output += f"- **Date of Birth:** {pi.get('dob') or 'N/A'}\n"
        output += f"- **Age:** {pi.get('age') or 'N/A'}\n"
        output += f"- **Gender:** {pi.get('gender') or 'N/A'}\n"
        output += f"- **Referring Physician:** {pi.get('physician') or 'N/A'}\n\n"

        output += f"## Medical Condition\n"
        output += f"- **Diagnosis:** {mc.get('diagnosis')}\n"
        output += f"- **ICD Code:** {mc.get('icd_code')}\n"
        output += f"- **Symptoms:** {', '.join(mc.get('symptoms') or ['N/A'])}\n\n"

        output += f"## Treatment Plan\n"
        output += f"- **Plan:** {tr.get('plan')}\n"
        output += f"- **Medications:** {', '.join(tr.get('medications') or ['N/A'])}\n"
        output += f"- **Procedures:** {', '.join(tr.get('procedures') or ['N/A'])}\n\n"

        output += f"## Patient Progress\n"
        output += f"- **Initial Condition:** {pr.get('initial_condition')}\n"
        output += f"- **Current Status:** {pr.get('current_status')}\n"
        output += f"- **Treatment Goals:** {pr.get('goals')}\n\n"

        output += f"## Key Clinical Findings\n"
        if pr.get('key_findings'):
            for i, finding in enumerate(pr['key_findings'], 1):
                output += f"{i}. {finding}\n"
        else:
            output += "No key findings detected.\n"

        output += f"\n## Timeline and Other Info\n"
        output += f"- **Dates mentioned:** {', '.join(tl.get('dates') or ['N/A'])}\n"
        output += f"- **Age:** {tl.get('age')}\n"

        return output

##pdf processor

In [None]:
class DocumentProcessor:
    @staticmethod
    def pdf_to_text(pdf_path: str) -> str:
        """
        Extract text from PDF using PyMuPDF.
        If text extraction is poor, fallback to OCR.
        """
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            page_text = page.get_text()
            text += page_text + "\n"

        # Heuristic: if extracted text is too short or empty, fallback to OCR
        if len(text.strip()) < 100:
            print("[INFO] Text extraction insufficient, using OCR fallback...")
            text = DocumentProcessor.ocr_pdf(pdf_path)
        return text

    @staticmethod
    def ocr_pdf(pdf_path: str) -> str:
        """Perform OCR on each page of PDF using pytesseract."""
        pages = convert_from_path(pdf_path, dpi=300)
        text = ""
        for page_num, page_image in enumerate(pages):
            page_text = pytesseract.image_to_string(page_image)
            text += page_text + "\n"
        return text

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean and normalize extracted text."""
        # Remove repeated blank lines
        text = re.sub(r'\n\s*\n', '\n\n', text)
        # Remove page headers/footers (common pattern)
        text = re.sub(r'Page\s*\d+\s*of\s*\d+', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\f', '', text)  # remove form feed chars
        return text.strip()

##Implementation

In [15]:


def main():
    # Upload PDF file interactively in Colab
    from google.colab import files
    uploaded = files.upload()
    pdf_file = next(iter(uploaded.keys()))

    # Initialize processors
    doc_processor = DocumentProcessor()
    medical_processor = MedicalReportProcessor()

    # Extract and clean text
    raw_text = doc_processor.pdf_to_text(pdf_file)
    clean_text = doc_processor.clean_text(raw_text)

    # Process the report
    summary = medical_processor.process_report(clean_text)

    # Display intermediate outputs
    display(Markdown("### Extracted Text Sample"))
    display(Markdown(f"```\n{clean_text[:800]}...\n```"))

    display(Markdown("### Structured Summary Data"))

    # Flatten nested summary dict for display
    def flatten_dict(d, parent_key='', sep='_'):
        items = []
        for k, v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            if isinstance(v, dict):
                items.extend(flatten_dict(v, new_key, sep=sep).items())
            elif isinstance(v, list):
                items.append((new_key, ', '.join(str(i) for i in v)))
            else:
                items.append((new_key, str(v)))
        return dict(items)

    flat_summary = flatten_dict(summary)
    df_summary = pd.DataFrame(list(flat_summary.items()), columns=["Field", "Value"])
    display(df_summary)

    # Display human-readable summary
    display(Markdown("### Human-Readable Medical Summary Report"))
    report_md = medical_processor.generate_human_readable(summary)
    display(Markdown(report_md))


if __name__ == "__main__":
    main()


Saving Medical_Report_Summary.pdf to Medical_Report_Summary.pdf


### Extracted Text Sample

```
Initial Evaluation   

Date of Visit: 01-11-24   
Therapist: Amit Das   
Patient Name: Anish De   
Referring MD: Maria DO   
Patient #: 123456   
Time In:   
Date of Birth: 07-10-1969   
Time Out:   
Age: 54   
Certification Period: 01-11-24 / 02-21-24   

Total Time based minutes: 25   
Total Treatment time: 41   

Date of Onset: 07-22-22   

Diagnosis:   
S83,242D Other tear of medical meniscus, currently injury, left knee, subsequent 
encounter   

SUBJECTIVE:   
Anish De is a 54-year-old male who presents therapy today for evaluation of L knee pain. 
The patient was referred by Maria, DO. Injured Worker reports an injury to left knee while 
trying to put a 100lbs box on a conveyor belt.   

Medical treatment to the point includes 2 meniscus surgeries. Had surgery September 5th.   

Dif...
```

### Structured Summary Data

Unnamed: 0,Field,Value
0,patient_info_name,Anish De
1,patient_info_dob,07-10-1969
2,patient_info_age,54
3,patient_info_gender,
4,patient_info_physician,Maria DO
5,medical_condition_diagnosis,"S83,242D Other tear of medical meniscus, curre..."
6,medical_condition_icd_code,
7,medical_condition_symptoms,"pain, discomfort, PAIN"
8,treatment_plan,The patient’s treatment will include Therapeut...
9,treatment_medications,control


### Human-Readable Medical Summary Report

# Medical Summary Report

## Patient Demographics
- **Name:** Anish De
- **Date of Birth:** 07-10-1969
- **Age:** 54
- **Gender:** N/A
- **Referring Physician:** Maria DO

## Medical Condition
- **Diagnosis:** S83,242D Other tear of medical meniscus, currently injury, left knee, subsequent
- **ICD Code:** N/A
- **Symptoms:** pain, discomfort, PAIN

## Treatment Plan
- **Plan:** The patient’s treatment will include Therapeutic Exercise, Therapeutic Activities,  Neuromuscular Re-Education, Manual Pack as needed for swelling and/or pain control and  PT Eval Low Complexity.
- **Medications:** control
- **Procedures:** surgery, Therapy, therapy

## Patient Progress
- **Initial Condition:** Not specified
- **Current Status:** Not specified
- **Treatment Goals:** Long | Time-Frame | Result | Comment |    | Percent of goals Met | Long Term | 6 weeks | Initial | 0% |    | Patient will be able to perform stair negotiation without having discomfort in the knee |  Long Term | 6 weeks | Initial | |    | Patient will be able to increase strength to 4/5 to be able to stand and walk for a prolonged  period of time | Long Term | 6 weeks | Initial | |    | Patient will be able to squat with adequate form to retrieve an object from the floor | Long  Term | 6 weeks | Initial | |

## Key Clinical Findings
1. In:   
Date of Birth: 07-10-1969   
Time Out:   
Age: 54   
Certification Period: 01-11-24 / 02-21-24   

Total Time based minutes: 25   
Total Treatment time: 41   

Date of Onset: 07-22-22   

Diagnosis:   
S83,242D Other tear of medical meniscus, currently injury, left knee, subsequent 
encounter   

SUBJECTIVE:   
Anish De is a 54-year-old male who presents therapy today for evaluation of L knee pain.
2. Had surgery September 5th.
3. Presenting Problems:   
- PAIN TYPE: achy, dull, sore   
- PAIN FREQUENCY:   
- LOCATION OF PAIN: left knee   
- AGGRAVATING FACTORS: difficulty stairs, walking, standing   
- EASING FACTORS: some meds, elevation
4. Hip abd 
X 15 

Yes 

Suman 

Ball squeeze 
X 30 

Yes 

15916(P
T) 

97140-
Manual 
Therapy-
[TP007] 

12 
Yes 

15916(P
T) 

Manual 
intervention 
STM to 
the 
quad 
and HS 

Yes 

15916(P
T) 

Service-Based 
Service 
Comment 
Status 
Time 
Done Today 
PT Evaluation 
Low 
Complexity 
Eval 
Active 
16 
Yes 

ASSESSMENT:   

DIAGNOSIS: acute medical meniscus tear, subsequent encounter per MD   

MUSCULOSKELETAL LIMITATIONS: weakness, pain, increased tissue tension, impaired gait 
mechanics   

WORK RELATED LIMITATIONS:
5. | Percent of goals Met | Long Term | 6 weeks | Initial | 0% |   
| Patient will be able to perform stair negotiation without having discomfort in the knee | 
Long Term | 6 weeks | Initial | |

## Timeline and Other Info
- **Dates mentioned:** 6 weeks, Today, 97110-Ther-, 01-11-24, 6 
weeks, 54-year-old, today, 3+, 3+ 
5, September 5th
- **Age:** 54
