<a href="https://colab.research.google.com/github/RajatKumawat17/Physician_Notetaker/blob/main/Emitrr_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import spacy
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import warnings
from collections import defaultdict

warnings.filterwarnings('ignore')

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [2]:
try:
    nlp = spacy.load("en_core_web_md")
    print("SpaCy model loaded successfully")
except OSError:
    print("SpaCy model not found. Installing en_core_web_md...")
    import os
    os.system("python -m spacy download en_core_web_md")
    nlp = spacy.load("en_core_web_md")
    print("SpaCy model installed and loaded successfully")

SpaCy model loaded successfully


# Preprocessing and Conversation Parser


In [3]:
class ConversationParser:
    """
    A flexible parser for medical conversations that can handle various formats
    """
    def __init__(self, speaker_patterns=None):
        # Default speaker patterns to look for
        self.speaker_patterns = speaker_patterns or {
            'physician': ['physician:', 'doctor:', 'dr.:', 'provider:', 'physician', 'doctor', 'dr.', 'provider'],
            'patient': ['patient:', 'client:', 'pt:', 'patient', 'client', 'pt']
        }

    def _identify_speakers(self, conversation):
        """Identify the actual speakers used in this conversation"""
        lines = conversation.strip().split('\n')
        detected_patterns = set()

        # Check for any speaker pattern at the beginning of lines
        for line in lines:
            line = line.strip()
            if not line:
                continue

            for role, patterns in self.speaker_patterns.items():
                for pattern in patterns:
                    # Check if line starts with pattern (case insensitive)
                    if line.lower().startswith(pattern.lower()):
                        detected_patterns.add((role, pattern))

        return detected_patterns

    def _extract_patient_name(self, conversation):
        """
        Extract patient name from the conversation (if possible).
        This is a basic example and may need further refinement.
        """
        # For now, assume there's a line like "Patient: [Patient Name]"
        for line in conversation.split('\n'):
            if line.strip().lower().startswith("patient:"):
                try:
                    # Attempt to extract name after "Patient:"
                    name = line.strip().split(":", 1)[1].strip()
                    # If name contains only letters and spaces, consider it valid
                    if re.match(r"^[a-zA-Z\s]+$", name):
                        return name
                except IndexError:
                    pass  # Handle cases where name is not found
        return "Unknown Patient"  # Default if name extraction fails

    def parse(self, conversation):
        """
        Parse the conversation into structured format
        """
        # Identify how speakers are labeled
        detected_speakers = self._identify_speakers(conversation)

        # If no speakers are detected, try a more flexible approach
        if not detected_speakers:
            return self._parse_unlabeled_conversation(conversation)

        lines = conversation.strip().split('\n')
        physician_statements = []
        patient_statements = []
        metadata = {}
        current_role = None
        current_statement = []

        # Extract patient name
        metadata['patient_name'] = self._extract_patient_name(conversation)

        # Parse the conversation line by line
        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Check if this line starts a new speaker
            new_speaker_found = False
            for role, pattern in detected_speakers:
                # Create a more flexible pattern match
                if line.lower().startswith(pattern.lower()):
                    # Save previous statement if there was one
                    if current_role and current_statement:
                        statement = ' '.join(current_statement)
                        if current_role == 'physician':
                            physician_statements.append(statement)
                        else:
                            patient_statements.append(statement)

                    # Start new statement
                    current_role = role
                    # Remove the pattern from the beginning of the line
                    # Handle both cases: with and without colon
                    if pattern.endswith(':'):
                        statement_text = line[len(pattern):].strip()
                    else:
                        # Check if there's a colon after the pattern
                        colon_index = line.find(':', len(pattern))
                        if colon_index != -1:
                            statement_text = line[colon_index + 1:].strip()
                        else:
                            statement_text = line[len(pattern):].strip()

                    current_statement = [statement_text] if statement_text else []
                    new_speaker_found = True
                    break

            # If no new speaker, continue with current statement
            if not new_speaker_found and current_role is not None:
                current_statement.append(line)

        # Add the last statement
        if current_role and current_statement:
            statement = ' '.join(current_statement)
            if current_role == 'physician':
                physician_statements.append(statement)
            else:
                patient_statements.append(statement)

        # Look for physical examination or other special sections
        physical_exam = re.search(r'\[([^]]+(?:examination|exam|assessed|assessment)[^]]*)\]',
                                 conversation, re.IGNORECASE)
        if physical_exam:
            metadata['physical_examination'] = physical_exam.group(1)

        # Debug: If no patient statements found, try fallback method
        if not patient_statements:
            fallback_result = self._parse_unlabeled_conversation(conversation)
            return fallback_result

        # Combine into final result
        return {
            "physician_statements": physician_statements,
            "patient_statements": patient_statements,
            "all_physician_text": " ".join(physician_statements),
            "all_patient_text": " ".join(patient_statements),
            "metadata": metadata
        }

# Medical Entity Recognition System

In [4]:
class MedicalEntityRecognizer:
    """
    Dynamic medical entity recognition system with expandable categories
    """
    def __init__(self, nlp_model=None):
        self.nlp = nlp_model or nlp

        # Define medical categories with initial terms that can be expanded
        self.medical_categories = {
            "SYMPTOM": [
                "pain", "discomfort", "ache", "hurt", "sore", "stiffness", "trouble", "sleeping",
                "backache", "shocked", "swelling", "rash", "fever", "cough", "fatigue", "nausea",
                "dizziness", "headache", "difficulty", "shortness of breath", "chest pain",
                "weakness", "numbness", "tingling", "blurry vision"
            ],
            "TREATMENT": [
                "physiotherapy", "therapy", "painkillers", "medication", "advised", "sessions",
                "x-rays", "emergency", "treatment", "surgery", "prescription", "antibiotics",
                "procedure", "exercise", "physical therapy", "counseling", "radiation",
                "chemotherapy", "medicine", "pill", "tablet", "injection", "vaccine", "rehabilitation"
            ],
            "DIAGNOSIS": [
                "whiplash", "injury", "damage", "condition", "accident", "impact", "infection",
                "disease", "syndrome", "disorder", "fracture", "break", "sprain", "strain",
                "concussion", "diabetes", "hypertension", "cancer", "arthritis", "depression",
                "anxiety", "heart disease", "inflammation"
            ],
            "PROGNOSIS": [
                "recovery", "improve", "better", "improving", "full recovery", "future", "sign",
                "progress", "long-term", "outlook", "prognosis", "chance", "likely", "risk",
                "survival", "recurrence", "complication", "probability", "outcome", "expectation",
                "timeline", "chronic", "permanent", "temporary", "healing"
            ]
        }

        # Optional: Load medical terminology from external sources
        # self._load_external_terminology()

    def _load_external_terminology(self):
        """
        Load additional terminology from external sources like UMLS or SNOMED-CT
        (Placeholder function for future expansion)
        """
        pass

    def add_terms(self, category, terms):
        """Allow dynamic addition of terms to categories"""
        if category in self.medical_categories:
            self.medical_categories[category].extend(terms)
        else:
            self.medical_categories[category] = terms

    def identify_entities(self, text):
        """
        Use a combination of spaCy NER and rule-based approaches to identify medical entities
        """
        doc = self.nlp(text)
        entities = defaultdict(list)

        # First pass: Use spaCy's built-in NER
        for ent in doc.ents:
            if ent.label_ in ["DISEASE", "CONDITION", "CHEMICAL", "MEDICINE"]:
                if ent.text not in entities["DIAGNOSIS"]:
                    entities["DIAGNOSIS"].append(ent.text)

        # Second pass: Use custom dictionary-based approach with context
        for sent in doc.sents:
            sent_text = sent.text.lower()
            for category, terms in self.medical_categories.items():
                for term in terms:
                    term_lower = term.lower()
                    if term_lower in sent_text:
                        # Extract the context around the term (neighboring words)
                        pattern = r'(?i)(?:\w+\s+){0,3}' + re.escape(term_lower) + r'(?:\s+\w+){0,3}'
                        matches = re.findall(pattern, sent_text)
                        if matches:
                            for match in matches:
                                # Skip if match is negated (e.g., "no pain", "denied fever")
                                negation_words = ["no ", "not ", "didn't have ", "doesn't have ", "deny ", "denied "]
                                if any(neg in (" " + match.lower() + " ") for neg in negation_words):
                                    continue

                                # Add if not duplicate
                                clean_match = match.strip()
                                if clean_match and clean_match not in entities[category]:
                                    entities[category].append(clean_match)

        # Process and clean the extracted entities
        for category in entities:
            # Remove duplicates and sort by length (prefer longer, more specific phrases)
            entities[category] = sorted(list(set(entities[category])), key=len, reverse=True)

            # Remove overlapping terms (e.g., if "severe headache" and "headache" are both present, keep only "severe headache")
            final_terms = []
            for term in entities[category]:
                if not any(term in other_term and term != other_term for other_term in entities[category]):
                    final_terms.append(term)
            entities[category] = final_terms

        return dict(entities)


# Medical Report Generator

In [5]:
class MedicalReportGenerator:
    """
    Dynamic medical report generator that can adapt to different conversation styles
    """
    def __init__(self, entity_recognizer=None):
        self.entity_recognizer = entity_recognizer or MedicalEntityRecognizer()

    def extract_entities(self, conversation_data):
        """Extract entities from the conversation data"""
        # Prioritize patient statements for entity extraction
        text = conversation_data.get("all_patient_text", "")

        # Also include physician statements for context
        physician_text = conversation_data.get("all_physician_text", "")

        # Extract entities from patient text
        patient_entities = self.entity_recognizer.identify_entities(text)

        # Extract additional context from physician text
        physician_entities = self.entity_recognizer.identify_entities(physician_text)

        # Combine entities, prioritizing patient-reported ones
        combined_entities = {}
        for category in set(list(patient_entities.keys()) + list(physician_entities.keys())):
            patient_category_entities = patient_entities.get(category, [])
            physician_category_entities = physician_entities.get(category, [])

            # Include unique entities from both sources, prioritizing patient ones
            combined_entities[category] = patient_category_entities + [
                entity for entity in physician_category_entities
                if not any(entity in patient_entity for patient_entity in patient_category_entities)
            ]

        return combined_entities

    def extract_current_status(self, conversation_data):
        """Extract patient's current status from conversation"""
        patient_text = conversation_data.get("all_patient_text", "")

        # Look for statements about current condition
        status_patterns = [
            r"(?:now|currently|these days|at this point).{1,50}(?:feel|pain|discomfort|symptom)",
            r"(?:feeling|pain|discomfort|symptom).{1,50}(?:now|currently|these days|at this point)",
            r"(?:still|occasional|sometimes|intermittent).{1,30}(?:feel|pain|discomfort|symptom)"
        ]

        for pattern in status_patterns:
            matches = re.findall(pattern, patient_text, re.IGNORECASE)
            if matches:
                return matches[0].strip()

        # Look at most recent patient statement as fallback
        statements = conversation_data.get("patient_statements", [])
        if statements:
            recent_statements = statements[-3:]  # Look at last few statements
            for stmt in reversed(recent_statements):
                if any(word in stmt.lower() for word in ["now", "current", "still", "today", "feeling"]):
                    return stmt

        return "Current status unclear from conversation"

    def generate_report(self, conversation_data):
        """
        Generate a structured medical report from conversation data
        """
        # Extract entities
        entities = self.extract_entities(conversation_data)

        # Get patient name
        patient_name = conversation_data.get("metadata", {}).get("patient_name", "Unknown Patient")

        # Extract current status
        current_status = self.extract_current_status(conversation_data)

        # Extract or infer prognosis
        prognosis = "Prognosis unclear from conversation"
        if "PROGNOSIS" in entities and entities["PROGNOSIS"]:
            prognosis_terms = entities["PROGNOSIS"]
            if any("full recovery" in term for term in prognosis_terms):
                prognosis = "Full recovery expected"
            elif any("improve" in term for term in prognosis_terms):
                prognosis = "Condition expected to improve"

        # Create structured report
        report = {
            "Patient_Name": patient_name,
            "Symptoms": entities.get("SYMPTOM", []),
            "Diagnosis": entities.get("DIAGNOSIS", []),
            "Treatment": entities.get("TREATMENT", []),
            "Current_Status": current_status,
            "Prognosis": prognosis
        }

        return report

# Sentiment and Intent Analysis

In [6]:
class MedicalSentimentAnalyzer:
    """
    Analyzes patient sentiment and intent in medical conversations
    """
    def __init__(self, sentiment_model=None):
        # Load pre-trained sentiment analysis model
        if sentiment_model:
            self.sentiment_analyzer = sentiment_model
        else:
            try:
                self.sentiment_analyzer = pipeline(
                    "sentiment-analysis",
                    model="distilbert-base-uncased-finetuned-sst-2-english"
                )
            except:
                print("Warning: Could not load transformer model. Using rule-based analysis only.")
                self.sentiment_analyzer = None

        # Dictionaries for sentiment and intent classification
        self.sentiment_indicators = {
            "Anxious": [
                "worried", "concern", "afraid", "scared", "fear", "anxious", "nervous",
                "uncertain", "unsure", "stress", "scary", "terrified", "panic", "doubt"
            ],
            "Neutral": [
                "okay", "fine", "alright", "same", "unchanged", "stable", "constant",
                "regular", "normal", "usual", "routine", "moderate", "medium"
            ],
            "Reassured": [
                "better", "good", "great", "improve", "relief", "confident", "happy",
                "glad", "pleased", "relieved", "hopeful", "optimistic", "positive"
            ]
        }

        self.intent_indicators = {
            "Seeking reassurance": [
                "will i", "hope", "worried", "concern", "?", "wonder", "curious",
                "possible", "chance", "likelihood", "probability", "risk", "fear"
            ],
            "Reporting symptoms": [
                "pain", "hurt", "ache", "feel", "symptom", "discomfort", "noticed",
                "experiencing", "having", "suffered", "dealing with", "struggling with"
            ],
            "Seeking information": [
                "what is", "how does", "when will", "why does", "could you explain",
                "tell me about", "what causes", "how long", "what should", "how can"
            ],
            "Expressing gratitude": [
                "thank", "appreciate", "grateful", "glad", "helped", "useful",
                "informative", "valuable", "insightful"
            ]
        }

    def analyze_text_segment(self, text):
        """Analyze a segment of text for sentiment and intent"""
        # Default values
        sentiment = "Neutral"
        intent = "Sharing information"
        confidence = 0.5

        # Use transformer-based model if available
        if self.sentiment_analyzer:
            try:
                result = self.sentiment_analyzer(text)
                model_sentiment = result[0]['label']
                confidence = result[0]['score']

                # Map model sentiment to medical categories
                if model_sentiment == 'NEGATIVE' and confidence > 0.7:
                    sentiment = "Anxious"
                elif model_sentiment == 'POSITIVE' and confidence > 0.7:
                    sentiment = "Reassured"
                else:
                    sentiment = "Neutral"
            except Exception as e:
                print(f"Transformer model error: {e}")

        # Rule-based enhancement (regardless of whether model is used)
        text_lower = text.lower()

        # Check for sentiment indicators
        for sent_type, indicators in self.sentiment_indicators.items():
            if any(indicator in text_lower for indicator in indicators):
                sentiment = sent_type
                break

        # Check for intent indicators
        for intent_type, indicators in self.intent_indicators.items():
            if any(indicator in text_lower for indicator in indicators):
                intent = intent_type
                break

        return {
            "Sentiment": sentiment,
            "Intent": intent,
            "Confidence": confidence
        }

    def analyze_conversation(self, conversation_data):
        """
        Analyze the entire conversation for sentiment and intent patterns
        """
        patient_statements = conversation_data.get("patient_statements", [])

        # If no statements, return default
        if not patient_statements:
            return {
                "Overall_Sentiment": "Neutral",
                "Primary_Intent": "Unknown",
                "Sentiment_Progression": "Insufficient data for progression analysis",
                "Statement_Analysis": []
            }

        # Analyze each statement
        statement_results = []
        for statement in patient_statements:
            if len(statement.strip()) > 5:  # Skip very short statements
                result = self.analyze_text_segment(statement)
                statement_results.append({
                    "Statement": statement,
                    "Analysis": result
                })

        # Calculate overall sentiment and primary intent
        sentiment_counts = {"Anxious": 0, "Neutral": 0, "Reassured": 0}
        intent_counts = defaultdict(int)

        for result in statement_results:
            sentiment = result["Analysis"]["Sentiment"]
            intent = result["Analysis"]["Intent"]

            sentiment_counts[sentiment] += 1
            intent_counts[intent] += 1

        # Get most common sentiment and intent
        overall_sentiment = max(sentiment_counts, key=sentiment_counts.get)
        primary_intent = max(intent_counts, key=intent_counts.get) if intent_counts else "Unknown"

        # Initialize sentiment_progression with a default value
        sentiment_progression = "Insufficient data for progression analysis"

        # Look at sentiment progression (beginning, middle, end)
        if len(statement_results) >= 3:
            beginning = statement_results[0]["Analysis"]["Sentiment"]
            middle_idx = len(statement_results) // 2
            middle = statement_results[middle_idx]["Analysis"]["Sentiment"]
            end = statement_results[-1]["Analysis"]["Sentiment"]

            # Check for improvement pattern (anxious to reassured)
            if beginning == "Anxious" and end == "Reassured":
                sentiment_progression = "Improving (Anxious → Reassured)"
            # Check for deterioration pattern
            elif beginning == "Reassured" and end == "Anxious":
                sentiment_progression = "Deteriorating (Reassured → Anxious)"
            else:
                sentiment_progression = f"Stable ({beginning})"

        return {
            "Overall_Sentiment": overall_sentiment,
            "Primary_Intent": primary_intent,
            "Sentiment_Progression": sentiment_progression,
            "Statement_Analysis": statement_results
        }

# SOAP Note Generator

In [7]:
class SOAPNoteGenerator:
    """
    Generates structured SOAP notes from medical conversations
    """
    def __init__(self, entity_recognizer=None):
        self.entity_recognizer = entity_recognizer or MedicalEntityRecognizer()

    def _extract_chief_complaint(self, entities, conversation_data):
        """Extract the chief complaint from entities and conversation"""
        # First try to get from symptoms
        symptoms = entities.get("SYMPTOM", [])
        if symptoms:
            return ", ".join(symptoms[:2])  # Top two symptoms

        # Fall back to scanning patient statements
        patient_statements = conversation_data.get("patient_statements", [])
        for statement in patient_statements[:3]:  # Check first few statements
            doc = nlp(statement)
            for ent in doc.ents:
                if ent.label_ in ["DISEASE", "CONDITION", "SYMPTOM"]:
                    return ent.text

        return "Unspecified complaint"

    def _extract_history(self, conversation_data, entities):
        """Extract history of present illness"""
        patient_text = conversation_data.get("all_patient_text", "")

        # Look for temporal indicators of illness
        time_patterns = [
            r"(?:started|began|happened|occurred).{1,50}(?:ago|since|last|past|previous)",
            r"(?:ago|since|last|past|previous).{1,50}(?:started|began|happened|occurred)",
            r"(?:for).{1,20}(?:days|weeks|months|years)",
            r"(?:on|at).{1,30}(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)",
            r"(?:january|february|march|april|may|june|july|august|september|october|november|december).{1,20}\d{1,2}"
        ]

        history_fragments = []

        # Extract time-related information
        for pattern in time_patterns:
            matches = re.findall(pattern, patient_text, re.IGNORECASE)
            if matches:
                history_fragments.extend(matches)

        # Extract symptom descriptions with surrounding context
        symptoms = entities.get("SYMPTOM", [])
        for symptom in symptoms:
            pattern = r"[^.!?]*(?:" + re.escape(symptom) + r")[^.!?]*[.!?]"
            symptom_contexts = re.findall(pattern, patient_text, re.IGNORECASE)
            if symptom_contexts:
                history_fragments.extend(symptom_contexts[:1])  # Just take the first mention

        # If we have enough fragments, create a coherent history
        if history_fragments:
            history = " ".join(history_fragments[:3])  # Limit to first 3 fragments
            return history

        # Fallback: create a basic history
        diagnoses = entities.get("DIAGNOSIS", [])
        symptoms = entities.get("SYMPTOM", [])

        if diagnoses or symptoms:
            history = f"Patient reports "
            if symptoms:
                history += f"{', '.join(symptoms[:3])}"
                if diagnoses:
                    history += " with "
            if diagnoses:
                history += f"{', '.join(diagnoses[:2])}"
            return history

        return "Limited history available from conversation"

    def _extract_physical_exam(self, conversation_data):
        """Extract physical examination details"""
        # Check if metadata contains examination info
        exam_text = conversation_data.get("metadata", {}).get("physical_examination", "")
        if exam_text:
            return exam_text

        # Look for examination description in physician statements
        physician_statements = conversation_data.get("physician_statements", [])
        exam_indicators = ["examine", "examination", "physical", "test", "testing",
                         "check", "checked", "assess", "evaluated", "normal", "abnormal"]

        for statement in physician_statements:
            if any(indicator in statement.lower() for indicator in exam_indicators):
                return statement

        return "No detailed physical examination noted"

    def _determine_severity(self, entities, conversation_data):
        """Determine the severity of the condition"""
        patient_text = conversation_data.get("all_patient_text", "").lower()

        # Look for severity indicators
        severe_indicators = ["severe", "extreme", "terrible", "worst", "unbearable", "excruciating"]
        moderate_indicators = ["moderate", "significant", "considerable", "noticeable"]
        mild_indicators = ["mild", "slight", "minor", "little", "improving", "better"]

        if any(indicator in patient_text for indicator in severe_indicators):
            return "Severe"
        elif any(indicator in patient_text for indicator in moderate_indicators):
            return "Moderate"
        elif any(indicator in patient_text for indicator in mild_indicators):
            return "Mild, improving"

        return "Moderate"

    def _formulate_treatment_plan(self, entities, conversation_data):
        """Formulate treatment plan based on entities and conversation"""
        treatments = entities.get("TREATMENT", [])
        physician_text = conversation_data.get("all_physician_text", "").lower()

        # Look for treatment recommendations
        treatment_plan = []
        if treatments:
            treatment_plan.extend(treatments)

        # Look for follow-up instructions
        follow_up = "Return for follow-up as needed"
        follow_up_patterns = [
            r"(?:come back|return|follow.up|see me).{1,50}(?:if|when|in case)",
            r"(?:schedule|make).{1,30}(?:appointment|visit|follow.up)"
        ]

        for pattern in follow_up_patterns:
            matches = re.findall(pattern, physician_text, re.IGNORECASE)
            if matches:
                follow_up = matches[0]
                break

        return {
            "Treatment": ", ".join(treatment_plan) if treatment_plan else "Continue current treatment regimen",
            "Follow_Up": follow_up
        }

    def generate_soap_note(self, conversation_data):
        """Generate a SOAP note from conversation data"""
        # Extract entities first
        entities = self.entity_recognizer.identify_entities(
            conversation_data.get("all_patient_text", "") + " " +
            conversation_data.get("all_physician_text", "")
        )

        # Extract components
        chief_complaint = self._extract_chief_complaint(entities, conversation_data)
        history = self._extract_history(conversation_data, entities)
        physical_exam = self._extract_physical_exam(conversation_data)
        severity = self._determine_severity(entities, conversation_data)
        treatment_plan = self._formulate_treatment_plan(entities, conversation_data)

        # Determine diagnosis
        diagnoses = entities.get("DIAGNOSIS", [])
        diagnosis = ", ".join(diagnoses) if diagnoses else "Assessment pending further evaluation"

        # Create structured SOAP note
        soap_note = {
            "Subjective": {
                "Chief_Complaint": chief_complaint,
                "History_of_Present_Illness": history
            },
            "Objective": {
                "Physical_Exam": physical_exam,
                "Observations": "Based on patient report and examination"
            },
            "Assessment": {
                "Diagnosis": diagnosis,
                "Severity": severity
            },
            "Plan": {
                "Treatment": treatment_plan["Treatment"],
                "Follow_Up": treatment_plan["Follow_Up"]
            }
        }

        return soap_note

# Complete Pipeline

In [8]:
class MedicalNLPPipeline:
    """
    Complete end-to-end pipeline for medical conversation analysis
    """
    def __init__(self):
        self.parser = ConversationParser()
        self.entity_recognizer = MedicalEntityRecognizer()
        self.report_generator = MedicalReportGenerator(self.entity_recognizer)
        self.sentiment_analyzer = MedicalSentimentAnalyzer()
        self.soap_generator = SOAPNoteGenerator(self.entity_recognizer)

    def process(self, conversation):
        """
        Process a conversation through the complete pipeline
        """
        # Step 1: Parse the conversation
        conversation_data = self.parser.parse(conversation)

        # Step 2: Extract medical entities
        entities = self.entity_recognizer.identify_entities(
            conversation_data.get("all_patient_text", "") + " " +
            conversation_data.get("all_physician_text", "")
        )

        # Step 3: Generate medical report
        report = self.report_generator.generate_report(conversation_data)

        # Step 4: Analyze sentiment and intent
        sentiment_analysis = self.sentiment_analyzer.analyze_conversation(conversation_data)

        # Step 5: Generate SOAP note
        soap_note = self.soap_generator.generate_soap_note(conversation_data)

        # Combine all results into a comprehensive analysis
        analysis = {
            "conversation_data": conversation_data,
            "entities": entities,
            "report": report,
            "sentiment_analysis": sentiment_analysis,
            "soap_note": soap_note
        }

        return analysis

    def generate_json_output(self, analysis):
        """
        Generate a JSON representation of the analysis
        """
        return json.dumps(analysis, indent=2)

In [9]:
def test_pipeline():
    """
    Test the pipeline with a sample conversation
    """
    # Sample conversation
    sample_conversation = """
    Physician: Good morning. How are you feeling today?

    Patient: Not so great, doctor. My back has been killing me for the past two weeks after that car accident.

    Physician: I'm sorry to hear that. Can you describe the pain?

    Patient: It's a sharp pain in my lower back, especially when I try to bend over or get up from sitting. It's been getting slightly better over the last few days but still bothers me a lot.

    Physician: And any other symptoms along with the back pain?

    Patient: Sometimes I feel a tingling sensation down my left leg, and I've been having trouble sleeping because I can't find a comfortable position.

    Physician: [Physician examines the patient's back, checking for tenderness and mobility] I notice you have some inflammation and tenderness in the lumbar region.

    Physician: Based on your symptoms and the examination, you're experiencing lumbar strain from the accident. The tingling suggests some nerve irritation as well.

    Patient: Is this serious? Will I need surgery?

    Physician: No, I don't think surgery is necessary. This type of injury typically responds well to conservative treatment. I'm going to prescribe you some anti-inflammatory medication and muscle relaxants for the pain.

    Patient: That's a relief. How long until I feel better?

    Physician: Most people see significant improvement within 2-4 weeks with proper treatment. I'd also recommend some gentle stretching exercises and applying heat to the area. Would you like me to refer you to a physical therapist?

    Patient: Yes, I think that would be helpful.

    Physician: Great. I'll make that referral today. Take the medication as prescribed, and let's schedule a follow-up appointment in two weeks to see how you're progressing.

    Patient: Thank you doctor, I'm feeling more hopeful now.
    """

    # Initialize and run the pipeline
    pipeline = MedicalNLPPipeline()
    analysis = pipeline.process(sample_conversation)

    # Print the results
    print("\n===== CONVERSATION PARSING =====")
    print(f"Patient name: {analysis['conversation_data']['metadata']['patient_name']}")
    print(f"Number of physician statements: {len(analysis['conversation_data']['physician_statements'])}")
    print(f"Number of patient statements: {len(analysis['conversation_data']['patient_statements'])}")

    print("\n===== EXTRACTED ENTITIES =====")
    for category, entities in analysis['entities'].items():
        print(f"{category}: {', '.join(entities)}")

    print("\n===== MEDICAL REPORT =====")
    for key, value in analysis['report'].items():
        if isinstance(value, list):
            print(f"{key}: {', '.join(value)}")
        else:
            print(f"{key}: {value}")

    print("\n===== SENTIMENT ANALYSIS =====")
    print(f"Overall Sentiment: {analysis['sentiment_analysis']['Overall_Sentiment']}")
    print(f"Primary Intent: {analysis['sentiment_analysis']['Primary_Intent']}")
    print(f"Sentiment Progression: {analysis['sentiment_analysis']['Sentiment_Progression']}")

    print("\n===== SOAP NOTE =====")
    print("Subjective:")
    for key, value in analysis['soap_note']['Subjective'].items():
        print(f"  {key}: {value}")

    print("Objective:")
    for key, value in analysis['soap_note']['Objective'].items():
        print(f"  {key}: {value}")

    print("Assessment:")
    for key, value in analysis['soap_note']['Assessment'].items():
        print(f"  {key}: {value}")

    print("Plan:")
    for key, value in analysis['soap_note']['Plan'].items():
        print(f"  {key}: {value}")

    return analysis


def main():
    """
    Main function to demonstrate pipeline usage
    """
    print("Medical Transcription NLP Pipeline")
    print("==================================")

    # Test the pipeline
    analysis = test_pipeline()

    with open('analysis_results.json', 'w') as f:
      json.dump(analysis, f, indent=2)


if __name__ == "__main__":
    main()

Medical Transcription NLP Pipeline


Device set to use cpu



===== CONVERSATION PARSING =====
Patient name: Unknown Patient
Number of physician statements: 8
Number of patient statements: 7

===== EXTRACTED ENTITIES =====
DIAGNOSIS: re experiencing lumbar strain from the accident, you have some inflammation and tenderness in, this type of injury typically responds well, after that car accident
SYMPTOM: been having trouble sleeping because i can, ve been having trouble sleeping because i, i feel a tingling sensation down my, the tingling suggests some nerve, s a sharp pain in my lower, relaxants for the pain, you describe the pain, with the back pain
PROGNOSIS: been getting slightly better over the last, people see significant improve, most people see sign, until i feel better, re progress
TREATMENT: inflammatory medication and muscle relaxants, take the medication as prescribed, some gentle stretching exercise, well to conservative treatment, t think surgery is necessary, weeks with proper treatment, will i need surgery

===== MEDICAL REPORT ==

In [10]:
#Some sample conversations to test the application with

#conversation 1:
"""
  Physician: Good afternoon, Mr. Patel. How have your blood sugar levels been recently?
  Patient: Good afternoon, doctor. They've been mostly stable, but I had a few spikes last week.
  Physician: I see. Do you remember what might have triggered those spikes?
  Patient: I think it was after a couple of big meals. I attended a family gathering, and I had some sweets.
  Physician: That makes sense. Managing portion sizes and carbohydrate intake is key. Are you still taking your medications as prescribed?
  Patient: Yes, I take my metformin daily. I also check my blood sugar every morning before breakfast.
  Physician: That’s great. Have you experienced any symptoms like dizziness, excessive thirst, or fatigue?
  Patient: No, not really. I feel fine most of the time.
  Physician: That’s good to hear. Let’s review your latest blood test results. Your HbA1c is at 6.8%, which is slightly above target. I’d recommend adjusting your diet slightly and incorporating a bit more physical activity.
  Patient: Okay, I’ll try to be more mindful of that.
  Physician: Excellent. Let’s schedule a follow-up in three months to reassess your progress. In the meantime, if you notice any significant changes, please reach out.
"""

#conversation 2:
"""
  Physician: Hello, Mrs. Lopez. How is little Daniel doing today?
  Parent: Hi, doctor. He’s doing well, but he’s had a bit of a cough the past few days.
  Physician: I see. Has he had any fever or difficulty breathing?
  Parent: No fever, but he’s been a little more tired than usual.
  Physician: That’s important to note. Is he eating and drinking normally?
  Parent: Yes, he’s eating fine, but he doesn’t have much of an appetite in the mornings.
  Physician: That’s common with mild respiratory infections. Let me listen to his lungs.
  [Physical Examination Conducted]
  Physician: His lungs sound clear, and there are no signs of anything serious. It looks like a mild viral infection. Make sure he gets plenty of fluids and rest.
  Parent: That’s a relief. Should I give him any medicine?
  Physician: If his cough worsens or he develops a fever, you can give him children's paracetamol. Otherwise, warm fluids and a humidifier at night can help.
  Parent: Got it. Thank you, doctor.
  Physician: You’re welcome! If the cough persists for more than ten days or gets worse, come back for a re-evaluation.
"""

#conversation 3:
"""
    Physician: Good morning, Mr. Carter. How are you feeling since your knee surgery?
    Patient: Good morning, doctor. It’s been a bit sore, but I think I’m healing well.
    Physician: That’s normal. Are you still using crutches?
    Patient: Yes, but only when I need to walk long distances. Around the house, I manage without them.
    Physician: That’s good progress. Have you noticed any swelling or unusual pain?
    Patient: The swelling has gone down a lot, and the pain is manageable with the medication.
    Physician: Excellent. Have you started physiotherapy?
    Patient: Yes, I had my first session last week. It was tough, but I can already feel an improvement in my movement.
    Physician: That’s great to hear. Keep up with the exercises—they’re key to your full recovery. I expect you’ll be able to walk normally without crutches in about three more weeks.
    Patient: That’s good news! Anything else I should watch for?
    Physician: Just be mindful of any redness, warmth, or increased pain around the incision site—those could be signs of infection. If that happens, let me know immediately.
    Patient: Will do. Thanks, doctor.
    Physician: You’re very welcome, Mr. Carter. Keep up the good work!
"""

#just paste these in place of the sample conversation to test the application

#also check out the json file created by the app

'\n    Physician: Good morning, Mr. Carter. How are you feeling since your knee surgery?\n    Patient: Good morning, doctor. It’s been a bit sore, but I think I’m healing well.\n    Physician: That’s normal. Are you still using crutches?\n    Patient: Yes, but only when I need to walk long distances. Around the house, I manage without them.\n    Physician: That’s good progress. Have you noticed any swelling or unusual pain?\n    Patient: The swelling has gone down a lot, and the pain is manageable with the medication.\n    Physician: Excellent. Have you started physiotherapy?\n    Patient: Yes, I had my first session last week. It was tough, but I can already feel an improvement in my movement.\n    Physician: That’s great to hear. Keep up with the exercises—they’re key to your full recovery. I expect you’ll be able to walk normally without crutches in about three more weeks.\n    Patient: That’s good news! Anything else I should watch for?\n    Physician: Just be mindful of any rednes