# Medical Symptom Extraction with BioGPT

Simple symptom extraction using Microsoft's BioGPT model. Takes patient messages and identifies symptoms with confidence scores.

## 1. Import Libraries

In [14]:
import torch
from transformers import BioGptTokenizer, BioGptForCausalLM
import re
import json
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


## 2. Load BioGPT Model

In [15]:
# Load BioGPT model
print("Loading BioGPT model...")
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
model.to(device)
model.eval()
print("Model loaded successfully!")

Loading BioGPT model...
Model loaded successfully!


## 3. Symptom Extraction Function

In [16]:
def extract_symptoms(patient_message: str, 
                     num_beams: int = 3,
                     max_new_tokens: int = 50) -> Dict:
    """
    Extract symptoms from patient message using BioGPT.
    
    Args:
        patient_message: The patient's description of their condition
        num_beams: Number of beams for beam search
        max_new_tokens: Maximum number of new tokens to generate
    
    Returns:
        Dictionary containing extracted symptoms with confidence scores
    """
    
    # BioGPT works better with medical/clinical style prompts
    # Using a format similar to medical literature
    prompt = f"The patient presents with {patient_message}. The clinical symptoms include"
    
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate response with BioGPT
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,  # Use max_new_tokens instead of max_length
            num_beams=num_beams,
            temperature=0.8,
            do_sample=True,  # Enable sampling for better diversity
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            output_scores=True,
            return_dict_in_generate=True
        )
    
    # Decode the generated text
    generated_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    
    # Extract only the generated part (after the prompt)
    if prompt in generated_text:
        generated_part = generated_text.split(prompt)[1].strip()
    else:
        # Fallback if exact prompt not found
        generated_part = generated_text[len(prompt):].strip()
    
    # Parse symptoms from generated text
    symptoms = parse_biogpt_output(generated_part)
    
    # Calculate confidence scores
    symptoms_with_confidence = calculate_confidence_scores(symptoms, outputs)
    
    return {
        "original_message": patient_message,
        "biogpt_output": generated_part,
        "extracted_symptoms": symptoms_with_confidence
    }

In [17]:
def parse_biogpt_output(generated_text: str) -> List[str]:
    """
    Parse symptoms from BioGPT generated text.
    
    Args:
        generated_text: Text generated by BioGPT
    
    Returns:
        List of identified symptoms
    """
    symptoms = []
    
    if not generated_text:
        return symptoms
    
    # Clean and split the text
    text = generated_text.strip()
    
    # Try to parse numbered list (1. symptom, 2. symptom)
    numbered_pattern = r'\d+\.\s*([^,\.;]+)[,\.;]?'
    numbered_matches = re.findall(numbered_pattern, text)
    
    if numbered_matches:
        symptoms.extend([match.strip() for match in numbered_matches if match.strip()])
    
    # Try comma, semicolon, or 'and' separated
    if not symptoms:
        # Remove periods at the end
        text = text.rstrip('.')
        
        # Split by various delimiters
        if ',' in text:
            parts = text.split(',')
        elif ' and ' in text:
            parts = text.split(' and ')
        elif ';' in text:
            parts = text.split(';')
        else:
            # Take the whole text if it's short enough to be a symptom
            parts = [text] if len(text) < 50 else []
        
        for part in parts:
            cleaned = part.strip().rstrip('.,;:')
            if cleaned and len(cleaned) > 2:
                symptoms.append(cleaned)
    
    # Clean up symptoms - remove any that are too long or contain full sentences
    final_symptoms = []
    for symptom in symptoms:
        # Skip if it looks like a full sentence
        if len(symptom.split()) <= 5 and not symptom.endswith('.'):
            final_symptoms.append(symptom.lower())
    
    return final_symptoms

In [18]:
def calculate_confidence_scores(symptoms: List[str], outputs) -> List[Dict]:
    """
    Calculate confidence scores for extracted symptoms based on model outputs.
    
    Args:
        symptoms: List of extracted symptoms
        outputs: Model generation outputs
    
    Returns:
        List of dictionaries with symptoms and confidence scores
    """
    symptoms_with_confidence = []
    
    # Get the sequence score from beam search as base confidence
    if hasattr(outputs, 'sequences_scores') and outputs.sequences_scores is not None:
        # Convert log probability to probability
        base_confidence = torch.exp(outputs.sequences_scores[0]).item()
    else:
        base_confidence = 0.75  # Default confidence if scores not available
    
    for i, symptom in enumerate(symptoms):
        # Slightly vary confidence based on order (earlier = higher confidence)
        position_factor = 1.0 - (i * 0.05)  # Decrease by 5% for each position
        position_factor = max(0.7, position_factor)  # Minimum 70% of base
        
        confidence = base_confidence * position_factor
        
        # Ensure confidence is in valid range [0.1, 0.99]
        confidence = max(0.1, min(0.99, confidence))
        
        symptoms_with_confidence.append({
            "symptom": symptom,
            "confidence": round(confidence, 3)
        })
    
    return symptoms_with_confidence

## 4. Initial Examples

In [19]:
# Test the problem case you mentioned
test_problem = "My throat hurts so badly"

result_problem = extract_symptoms(test_problem)
print("Test Case - Throat Pain:")
print(f"Input: {result_problem['original_message']}")
print(f"\nBioGPT Output: '{result_problem['biogpt_output']}'")
print(f"\nExtracted Symptoms:")
for symptom_data in result_problem['extracted_symptoms']:
    print(f"  - {symptom_data['symptom']} (confidence: {symptom_data['confidence']:.1%})")
print("\n" + "="*50 + "\n")

Test Case - Throat Pain:
Input: My throat hurts so badly

BioGPT Output: 'throat pain, sore throat, dysphagia, and odynophagia.'

Extracted Symptoms:
  - throat pain (confidence: 33.1%)
  - sore throat (confidence: 31.5%)
  - dysphagia (confidence: 29.8%)
  - and odynophagia (confidence: 28.2%)




In [20]:
# Example 1: Simple extraction
test_message1 = "My throat is sore and I've had a fever for two days."

result1 = extract_symptoms(test_message1)
print("Example 1:")
print(f"Input: {result1['original_message']}")
print(f"\nBioGPT Output: '{result1['biogpt_output']}'")
print(f"\nExtracted Symptoms:")
for symptom_data in result1['extracted_symptoms']:
    print(f"  - {symptom_data['symptom']} (confidence: {symptom_data['confidence']:.1%})")
print("\n" + "="*50 + "\n")

Example 1:
Input: My throat is sore and I've had a fever for two days.

BioGPT Output: 'sore throat, odynophagia and dysphagia.'

Extracted Symptoms:
  - sore throat (confidence: 32.0%)
  - odynophagia and dysphagia (confidence: 30.4%)




## 5. Comprehensive Testing Framework

This section tests the model with 20 single-symptom descriptions to evaluate its accuracy and performance.

In [21]:
def evaluate_response(extracted_symptoms: List[Dict], ideal_response: str, 
                     acceptable_alternatives: List[str] = None) -> Tuple[bool, str]:
    """
    Evaluate if the extracted symptoms match the ideal response.
    
    Args:
        extracted_symptoms: List of extracted symptoms with confidence scores
        ideal_response: The ideal/expected symptom
        acceptable_alternatives: List of acceptable alternative terms
    
    Returns:
        Tuple of (is_correct, reason)
    """
    if not extracted_symptoms:
        return False, "No symptoms extracted"
    
    # Get all extracted symptom texts
    extracted_texts = [s['symptom'].lower() for s in extracted_symptoms]
    
    # Check ideal response
    ideal_lower = ideal_response.lower()
    
    # Direct match
    if ideal_lower in extracted_texts:
        return True, f"Exact match: '{ideal_response}'"
    
    # Check if ideal response is contained in any extracted symptom
    for symptom in extracted_texts:
        if ideal_lower in symptom or symptom in ideal_lower:
            return True, f"Partial match: '{symptom}' contains/contained in '{ideal_response}'"
    
    # Check acceptable alternatives
    if acceptable_alternatives:
        for alt in acceptable_alternatives:
            alt_lower = alt.lower()
            if alt_lower in extracted_texts:
                return True, f"Acceptable alternative: '{alt}'"
            for symptom in extracted_texts:
                if alt_lower in symptom or symptom in alt_lower:
                    return True, f"Acceptable alternative partial match: '{symptom}' ~ '{alt}'"
    
    # Check for semantic similarity (basic keyword matching)
    ideal_keywords = set(ideal_lower.split())
    for symptom in extracted_texts:
        symptom_keywords = set(symptom.split())
        if len(ideal_keywords & symptom_keywords) >= len(ideal_keywords) * 0.5:
            return True, f"Keyword match: '{symptom}' ~ '{ideal_response}'"
    
    return False, f"No match. Expected: '{ideal_response}', Got: {extracted_texts[:3]}"

### Define 20 Test Cases

In [22]:
# Define 20 test cases with single-symptom descriptions
test_cases = [
    {
        "input": "I have been pooping a lot, and it's very watery",
        "ideal": "diarrhea",
        "alternatives": ["loose stools", "watery stools", "frequent bowel movements"]
    },
    {
        "input": "My head is pounding and throbbing with pain",
        "ideal": "headache",
        "alternatives": ["cephalgia", "head pain", "migraine"]
    },
    {
        "input": "I feel like I'm going to throw up",
        "ideal": "nausea",
        "alternatives": ["queasiness", "feeling sick", "upset stomach"]
    },
    {
        "input": "My nose won't stop running and dripping",
        "ideal": "runny nose",
        "alternatives": ["rhinorrhea", "nasal discharge", "nasal congestion"]
    },
    {
        "input": "I can't stop coughing, it's dry and harsh",
        "ideal": "cough",
        "alternatives": ["dry cough", "persistent cough", "hacking cough"]
    },
    {
        "input": "My body temperature is really high and I feel hot",
        "ideal": "fever",
        "alternatives": ["pyrexia", "high temperature", "hyperthermia"]
    },
    {
        "input": "I'm having trouble breathing and feel short of breath",
        "ideal": "dyspnea",
        "alternatives": ["shortness of breath", "breathing difficulty", "respiratory distress"]
    },
    {
        "input": "My chest feels tight and painful when I breathe",
        "ideal": "chest pain",
        "alternatives": ["chest tightness", "chest discomfort", "thoracic pain"]
    },
    {
        "input": "I feel extremely tired and have no energy",
        "ideal": "fatigue",
        "alternatives": ["exhaustion", "tiredness", "lethargy", "weakness"]
    },
    {
        "input": "My stomach hurts and feels crampy",
        "ideal": "abdominal pain",
        "alternatives": ["stomach pain", "stomach cramps", "belly pain"]
    },
    {
        "input": "I feel dizzy and the room is spinning",
        "ideal": "vertigo",
        "alternatives": ["dizziness", "lightheadedness", "spinning sensation"]
    },
    {
        "input": "My joints are swollen and painful",
        "ideal": "arthralgia",
        "alternatives": ["joint pain", "joint swelling", "arthritis"]
    },
    {
        "input": "I have red, itchy bumps all over my skin",
        "ideal": "rash",
        "alternatives": ["skin rash", "hives", "urticaria", "skin eruption"]
    },
    {
        "input": "My throat feels scratchy and painful when swallowing",
        "ideal": "sore throat",
        "alternatives": ["pharyngitis", "throat pain", "dysphagia"]
    },
    {
        "input": "I've been throwing up multiple times today",
        "ideal": "vomiting",
        "alternatives": ["emesis", "throwing up", "being sick"]
    },
    {
        "input": "My muscles ache all over my body",
        "ideal": "myalgia",
        "alternatives": ["muscle pain", "muscle aches", "body aches"]
    },
    {
        "input": "I can't sleep and stay awake all night",
        "ideal": "insomnia",
        "alternatives": ["sleeplessness", "sleep difficulty", "inability to sleep"]
    },
    {
        "input": "My eyes are red and watery with discharge",
        "ideal": "conjunctivitis",
        "alternatives": ["pink eye", "eye infection", "red eyes", "eye discharge"]
    },
    {
        "input": "I keep sneezing repeatedly and can't stop",
        "ideal": "sneezing",
        "alternatives": ["frequent sneezing", "sneeze", "nasal irritation"]
    },
    {
        "input": "My lower back is in severe pain",
        "ideal": "back pain",
        "alternatives": ["lower back pain", "lumbago", "backache", "lumbar pain"]
    }
]

print(f"Loaded {len(test_cases)} test cases for evaluation.")

Loaded 20 test cases for evaluation.


### Run Comprehensive Tests

In [23]:
def run_comprehensive_tests():
    """
    Run all test cases and calculate performance metrics.
    """
    print("="*80)
    print("COMPREHENSIVE TESTING OF BIOGPT SYMPTOM EXTRACTION")
    print("="*80)
    print(f"\nRunning {len(test_cases)} test cases...\n")
    
    results = []
    correct_count = 0
    
    for i, test_case in enumerate(test_cases, 1):
        print(f"\nTest Case {i}/{len(test_cases)}")
        print("-" * 40)
        print(f"Input: \"{test_case['input']}\"")
        print(f"Expected: {test_case['ideal']}")
        
        # Run extraction
        result = extract_symptoms(test_case['input'])
        
        # Evaluate
        is_correct, reason = evaluate_response(
            result['extracted_symptoms'], 
            test_case['ideal'],
            test_case.get('alternatives', [])
        )
        
        if is_correct:
            correct_count += 1
            status = "‚úì PASS"
        else:
            status = "‚úó FAIL"
        
        print(f"BioGPT Output: \"{result['biogpt_output']}\"")
        extracted_list = [s['symptom'] for s in result['extracted_symptoms'][:3]]
        print(f"Extracted Symptoms: {extracted_list}")
        print(f"Result: {status} - {reason}")
        
        results.append({
            "test_case": i,
            "input": test_case['input'],
            "expected": test_case['ideal'],
            "biogpt_output": result['biogpt_output'],
            "extracted": [s['symptom'] for s in result['extracted_symptoms']],
            "is_correct": is_correct,
            "reason": reason
        })
    
    return results, correct_count

In [24]:
# Run the tests
results, correct_count = run_comprehensive_tests()

COMPREHENSIVE TESTING OF BIOGPT SYMPTOM EXTRACTION

Running 20 test cases...


Test Case 1/20
----------------------------------------
Input: "I have been pooping a lot, and it's very watery"
Expected: diarrhea


BioGPT Output: "watery diarrhea, abdominal pain, nausea, vomiting, and weight loss."
Extracted Symptoms: ['watery diarrhea', 'abdominal pain', 'nausea']
Result: ‚úì PASS - Partial match: 'watery diarrhea' contains/contained in 'diarrhea'

Test Case 2/20
----------------------------------------
Input: "My head is pounding and throbbing with pain"
Expected: headache
BioGPT Output: "headache, nausea, vomiting, dizziness, and blurred vision."
Extracted Symptoms: ['headache', 'nausea', 'vomiting']
Result: ‚úì PASS - Exact match: 'headache'

Test Case 3/20
----------------------------------------
Input: "I feel like I'm going to throw up"
Expected: nausea
BioGPT Output: ": a feeling of fullness in the abdomen, a feeling of fullness in the right upper quadrant of the abdomen, a feeling of fullness in the left upper quadrant of the abdomen, and a feeling of fullness in the right lower quadrant of"
Extracted Symptoms: []
Result: ‚úó FAIL - No symptoms extracted

Test Case 4/20
-----------------

### Performance Metrics and Analysis

In [25]:
def display_performance_metrics(results, correct_count):
    """
    Display detailed performance metrics and analysis.
    """
    print("\n" + "="*80)
    print("PERFORMANCE METRICS")
    print("="*80)
    
    total_tests = len(test_cases)
    accuracy = (correct_count / total_tests) * 100
    
    print(f"\nTotal Test Cases: {total_tests}")
    print(f"Correct Predictions: {correct_count}")
    print(f"Incorrect Predictions: {total_tests - correct_count}")
    print(f"\nAccuracy: {accuracy:.1f}%")
    
    # Performance grade
    print("\nPerformance Grade:")
    if accuracy >= 90:
        grade = "Excellent"
    elif accuracy >= 75:
        grade = "Good"
    elif accuracy >= 60:
        grade = "Acceptable"
    elif accuracy >= 50:
        grade = "Needs Improvement"
    else:
        grade = "Poor"
    
    print(f" {grade} ({accuracy:.1f}%)")
    
    # Show failed cases
    failed_cases = [r for r in results if not r['is_correct']]
    if failed_cases:
        print(f"\nFailed Cases ({len(failed_cases)}):")
        print("-" * 40)
        for case in failed_cases:
            print(f"\nCase {case['test_case']}:")
            print(f"  Input: \"{case['input']}\"")
            print(f"  Expected: '{case['expected']}'")
            print(f"  Got: {case['extracted'][:2]}")
            print(f"  Reason: {case['reason']}")
    
    return accuracy

# Display the metrics
accuracy = display_performance_metrics(results, correct_count)


PERFORMANCE METRICS

Total Test Cases: 20
Correct Predictions: 15
Incorrect Predictions: 5

Accuracy: 75.0%

Performance Grade:
 Good (75.0%)

Failed Cases (5):
----------------------------------------

Case 3:
  Input: "I feel like I'm going to throw up"
  Expected: 'nausea'
  Got: []
  Reason: No symptoms extracted

Case 5:
  Input: "I can't stop coughing, it's dry and harsh"
  Expected: 'cough'
  Got: ['shortness of breath', 'chest pain']
  Reason: No match. Expected: 'cough', Got: ['shortness of breath', 'chest pain', 'palpitations']

Case 13:
  Input: "I have red, itchy bumps all over my skin"
  Expected: 'rash'
  Got: ['itching', 'burning']
  Reason: No match. Expected: 'rash', Got: ['itching', 'burning']

Case 15:
  Input: "I've been throwing up multiple times today"
  Expected: 'vomiting'
  Got: ['pain', 'swelling']
  Reason: No match. Expected: 'vomiting', Got: ['pain', 'swelling']

Case 18:
  Input: "My eyes are red and watery with discharge"
  Expected: 'conjunctivitis'
  G

### Save Results to File

In [26]:
# Save detailed results to JSON file
import json
from datetime import datetime

# Prepare results for saving
save_data = {
    "test_date": datetime.now().isoformat(),
    "model": "microsoft/biogpt",
    "accuracy": accuracy,
    "correct_count": correct_count,
    "total_tests": len(test_cases),
    "grade": "Excellent" if accuracy >= 90 else "Good" if accuracy >= 75 else "Acceptable" if accuracy >= 60 else "Needs Improvement" if accuracy >= 50 else "Poor",
    "detailed_results": results
}

# Save to file
with open('symptom_extraction_test_results.json', 'w') as f:
    json.dump(save_data, f, indent=2)

print(f"\nüìÅ Detailed results saved to 'symptom_extraction_test_results.json'")
print(f"\nüéØ Final Accuracy: {accuracy:.1f}%")


üìÅ Detailed results saved to 'symptom_extraction_test_results.json'

üéØ Final Accuracy: 75.0%
