# Test Your Fine-Tuned Model - Single Symptom Evaluation

This notebook evaluates your fine-tuned model by comparing only its top-ranked (#1) predicted symptom against the expected symptom for each test input.

---

## 1. Load Your Trained Model

In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Load your fine-tuned model
# `model` is the global encoder that the embedding cell and
# extract_top_symptom() rely on, so this cell must run first.
# NOTE(review): 'models/medical_symptom_matcher' is presumably a local
# directory relative to the kernel's working directory — confirm it exists.
print("üîÑ Loading your fine-tuned model...")
model = SentenceTransformer('models/medical_symptom_matcher')
print("‚úÖ Model loaded successfully!")

  from .autonotebook import tqdm as notebook_tqdm



üîÑ Loading your fine-tuned model...
‚úÖ Model loaded successfully!


## 2. Load Symptoms Database

In [2]:
# Read the canonical symptom vocabulary from the 'symptoms' column of the CSV.
# `canonical_symptoms` is the label list every prediction is drawn from.
print("üìã Loading symptoms database...")
symptoms_df = pd.read_csv('data/symptoms.csv')
canonical_symptoms = symptoms_df['symptoms'].tolist()
print(f"‚úÖ Loaded {len(canonical_symptoms)} symptoms")

# Preview the first ten labels, numbered from 1
print("\nFirst 10 symptoms:")
preview = canonical_symptoms[:10]
for position, label in enumerate(preview, start=1):
    print(f"  {position}. {label}")

üìã Loading symptoms database...
‚úÖ Loaded 380 symptoms

First 10 symptoms:
  1. anxiety and nervousness
  2. depression
  3. shortness of breath
  4. depressive or psychotic symptoms
  5. sharp chest pain
  6. dizziness
  7. insomnia
  8. abnormal involuntary movements
  9. chest tightness
  10. palpitations


## 3. Pre-compute Symptom Embeddings

In [3]:
# Compute embeddings once (saves time when testing multiple inputs)
# Row i of `symptom_embeddings` is the encoding of canonical_symptoms[i];
# extract_top_symptom() relies on that ordering to map the best-scoring row
# index back to a symptom label, so re-run this cell whenever
# `canonical_symptoms` or `model` changes.
print("üßÆ Computing symptom embeddings...")
symptom_embeddings = model.encode(canonical_symptoms, convert_to_tensor=True)
print(f"‚úÖ Created embeddings with shape: {symptom_embeddings.shape}")

üßÆ Computing symptom embeddings...
‚úÖ Created embeddings with shape: torch.Size([380, 384])


## 4. Define Single Symptom Extraction Function

In [4]:
def extract_top_symptom(patient_input):
    """
    Return the single canonical symptom most similar to the patient's text.

    The text is encoded with the global `model` and compared, by cosine
    similarity, against the pre-computed global `symptom_embeddings`; the
    best-scoring row is mapped back to its label in `canonical_symptoms`.

    Args:
        patient_input (str): Free-text description from the patient.

    Returns:
        dict: {'symptom': label, 'confidence': similarity as a float,
               'percentage': the same score formatted like '98.7%'}
    """
    query_vector = model.encode(patient_input, convert_to_tensor=True)

    # cos_sim returns one row per query; a single query means row 0 holds
    # the score against every canonical symptom.
    scores = util.cos_sim(query_vector, symptom_embeddings)[0]

    best_index = scores.argmax().item()
    best_score = float(scores[best_index])

    top_match = {}
    top_match['symptom'] = canonical_symptoms[best_index]
    top_match['confidence'] = best_score
    top_match['percentage'] = f"{best_score*100:.1f}%"
    return top_match

# Labels that count as interchangeable when scoring a prediction. Each key and
# its list form one group; a prediction and an expectation match when both
# fall in the same group.
_SYNONYM_MAP = {
    "diarrhea": ["watery diarrhea", "loose stools", "watery stools", "diarrhoea"],
    "headache": ["head pain", "cephalgia", "migraine", "severe headache"],
    "nausea": ["vomiting", "emesis", "feeling sick", "queasiness"],
    "fever": ["pyrexia", "high temperature", "elevated temperature", "hyperthermia"],
    "cough": ["coughing", "tussis", "persistent cough"],
    "abdominal pain": ["stomach pain", "belly pain", "gastric pain", "stomachache", "sharp abdominal pain"],
    "dizziness": ["vertigo", "lightheadedness", "giddiness"],
    "rash": ["skin rash", "eruption", "exanthem", "skin eruption"],
    "fatigue": ["tiredness", "exhaustion", "weariness", "lethargy"],
    "sore throat": ["pharyngitis", "throat pain", "painful throat"],
    "shortness of breath": ["dyspnea", "breathing difficulty", "respiratory distress", "difficulty breathing"],
    "insomnia": ["sleeplessness", "sleep disturbance", "difficulty sleeping"],
    "joint pain": ["arthralgia", "joint aches", "painful joints"],
    "rhinorrhea": ["runny nose", "nasal discharge", "nasal congestion"],
    "vomiting": ["regurgitation", "throwing up", "emesis", "nausea"],
    "myalgia": ["muscle pain", "muscle aches", "ache all over"],
    "blurred vision": ["vision changes", "visual disturbance", "cloudy vision"],
    "back pain": ["backache", "lumbar pain", "lower back pain"],
    "anxiety": ["anxiety and nervousness", "nervousness", "panic", "worry"]
}

# Lower-cased membership sets, built once instead of on every call.
_SYNONYM_GROUPS = [
    frozenset([key.lower(), *(term.lower() for term in terms)])
    for key, terms in _SYNONYM_MAP.items()
]


def check_match(predicted, expected):
    """
    Check if predicted symptom matches expected, allowing for variations.

    Comparison is case-insensitive and ignores surrounding whitespace. The
    checks run in order: exact equality, substring containment in either
    direction, then membership of both labels in the same synonym group.

    Args:
        predicted (str): Symptom label produced by the model.
        expected (str): Symptom label the test case expects.

    Returns:
        Tuple of (is_match, reason)
    """
    predicted_lower = predicted.lower().strip()
    expected_lower = expected.lower().strip()

    # Exact match
    if predicted_lower == expected_lower:
        return True, f"Exact match: '{predicted}'"

    # Partial match (one contains the other). The empty string is a substring
    # of every string, so a blank prediction or expectation must not be
    # allowed to count as a match here.
    if predicted_lower and expected_lower:
        if expected_lower in predicted_lower or predicted_lower in expected_lower:
            return True, f"Partial match: '{predicted}' contains/contained in '{expected}'"

    # Check synonyms: both labels must belong to the same group
    for group in _SYNONYM_GROUPS:
        if expected_lower in group and predicted_lower in group:
            return True, f"Synonym match: '{predicted}' ‚âà '{expected}'"

    return False, f"No match. Expected: '{expected}', Got: '{predicted}'"

# Visible confirmation that the definitions above were executed.
print("‚úÖ Functions defined!")

‚úÖ Functions defined!


In [22]:
# Spot-check one free-text complaint; as the last expression in the cell,
# the returned dict is displayed as the cell output.
extract_top_symptom("My skin is itching like crazy")

{'symptom': 'itching of skin',
 'confidence': 0.9866976737976074,
 'percentage': '98.7%'}

## 5. Define Test Cases (Same as BioGPT Tests)

In [45]:
# Same test cases from the BioGPT model.
# Written as (patient utterance, expected symptom) pairs and expanded into the
# {"input": ..., "expected": ...} records the test runner reads.
test_cases = [
    {"input": utterance, "expected": label}
    for utterance, label in (
        ("I have been feeling really tired recently", "fatigue"),
        ("My head is pounding and throbbing with pain", "headache"),
        ("I feel like I'm going to throw up", "nausea"),
        ("My ankle feels swollen", "ankle swelling"),
        ("i have been coughing since last night", "cough"),
        ("My stomach hurts really bad", "abdominal pain"),
        ("I feel dizzy and the room is spinning", "dizziness"),
        ("I have red bumps all over my skin", "rash"),
        ("I feel extremely tired and have no energy", "fatigue"),
        ("My throat is really sore when I swallow", "sore throat"),
        ("My chest feels tight and it's hard to breathe", "shortness of breath"),
        ("I've been having trouble sleeping", "insomnia"),
        ("My joints are swollen and painful", "joint pain"),
        ("i feel so weak", "weakness"),
        ("I've been throwing up multiple times today", "vomiting"),
        ("My muscles ache all over", "myalgia"),
        ("I can't sleep and stay awake all night", "insomnia"),
        ("i have lost 20kgs in the past few months", "recent weight loss"),
        ("I have sharp pain in my lower back", "back pain"),
        ("my skin is dry and peeling from the weather", "skin peeling"),
    )
]

print(f"Loaded {len(test_cases)} test cases")

Loaded 20 test cases


In [None]:
# Spot-check the "weakness" test input. The argument belongs inside the call
# parentheses; the previous form was a SyntaxError and could not be re-run.
extract_top_symptom("i feel so weak")

{'symptom': 'weakness',
 'confidence': 0.9621926546096802,
 'percentage': '96.2%'}

## 6. Run Single Symptom Tests

In [46]:
def run_single_symptom_tests():
    """
    Evaluate every entry of the global `test_cases` on the #1 prediction only.

    For each case the top symptom is extracted, compared with the expected
    label via check_match(), and a per-case report is printed.

    Returns:
        tuple: (records, passed) where `records` is a list with one dict per
        case (keys: test_case, input, expected, predicted, confidence,
        is_correct, reason) and `passed` is the number of matching cases.
    """
    banner = "=" * 80
    total = len(test_cases)

    print(banner)
    print("TESTING SINGLE SYMPTOM EXTRACTION (#1 Prediction Only)")
    print(banner)
    print(f"\nRunning {total} test cases...\n")

    records = []

    for number, case in enumerate(test_cases, start=1):
        print(f"\nTest Case {number}/{total}")
        print("-" * 40)
        print(f"Input: \"{case['input']}\"")
        print(f"Expected: {case['expected']}")

        # Only the single best-scoring symptom is considered
        top = extract_top_symptom(case['input'])

        print(f"#1 Prediction: {top['symptom']}")
        print(f"Confidence: {top['percentage']}")

        matched, why = check_match(top['symptom'], case['expected'])
        print(f"Result: ‚úì PASS - {why}" if matched else f"Result: ‚úó FAIL - {why}")

        records.append(dict(
            test_case=number,
            input=case['input'],
            expected=case['expected'],
            predicted=top['symptom'],
            confidence=top['confidence'],
            is_correct=matched,
            reason=why,
        ))

    # Tally passes from the collected records rather than a running counter
    passed = sum(1 for record in records if record['is_correct'])
    return records, passed

# Run the tests. `results` (one record per case) and `correct_count` are the
# globals consumed by the summary, analysis and save cells that follow.
results, correct_count = run_single_symptom_tests()

TESTING SINGLE SYMPTOM EXTRACTION (#1 Prediction Only)

Running 20 test cases...


Test Case 1/20
----------------------------------------
Input: "I have been feeling really tired recently"
Expected: fatigue
#1 Prediction: fatigue
Confidence: 95.9%
Result: ‚úì PASS - Exact match: 'fatigue'

Test Case 2/20
----------------------------------------
Input: "My head is pounding and throbbing with pain"
Expected: headache
#1 Prediction: neck pain
Confidence: 90.3%
Result: ‚úó FAIL - No match. Expected: 'headache', Got: 'neck pain'

Test Case 3/20
----------------------------------------
Input: "I feel like I'm going to throw up"
Expected: nausea
#1 Prediction: nausea
Confidence: 85.3%
Result: ‚úì PASS - Exact match: 'nausea'

Test Case 4/20
----------------------------------------
Input: "My ankle feels swollen"
Expected: ankle swelling
#1 Prediction: ankle swelling
Confidence: 98.1%
Result: ‚úì PASS - Exact match: 'ankle swelling'

Test Case 5/20
----------------------------------------
Inp

## 7. Performance Summary

In [47]:
def display_performance_summary(results, correct_count):
    """
    Print an accuracy report for a finished test run and return the accuracy.

    Args:
        results (list[dict]): One record per test case. The keys read here
            are 'test_case', 'input', 'expected', 'predicted', 'confidence'
            and 'is_correct'.
        correct_count (int): Number of cases counted as correct.

    Returns:
        float: Accuracy as a percentage (0-100); 0.0 when `results` is empty.
    """
    print("\n" + "="*80)
    print("PERFORMANCE SUMMARY (Single Symptom Prediction)")
    print("="*80)

    # Size the run from the records actually passed in. Using the global
    # test_cases list here would report a wrong accuracy whenever the two
    # differ, and would make the function unusable outside this notebook.
    total_tests = len(results)
    if total_tests == 0:
        # Nothing to divide by; report it instead of raising ZeroDivisionError.
        print("\nNo test results to summarize.")
        return 0.0

    accuracy = (correct_count / total_tests) * 100

    print(f"\nTotal Test Cases: {total_tests}")
    print(f"Correct Predictions: {correct_count}")
    print(f"Incorrect Predictions: {total_tests - correct_count}")
    print(f"\nüéØ Accuracy: {accuracy:.1f}%")

    # Performance grade: first band whose lower bound the accuracy reaches
    print("\nPerformance Grade:")
    grade_bands = (
        (90, "Excellent", "üåü"),
        (75, "Good", "‚ú®"),
        (60, "Acceptable", "üëç"),
        (50, "Needs Improvement", "üìà"),
    )
    grade, emoji = "Poor", "‚ö†Ô∏è"
    for lower_bound, band_grade, band_emoji in grade_bands:
        if accuracy >= lower_bound:
            grade, emoji = band_grade, band_emoji
            break

    print(f"{emoji} {grade} ({accuracy:.1f}%)")

    # Show failed cases
    failed_cases = [r for r in results if not r['is_correct']]
    if failed_cases:
        print(f"\n‚ùå Failed Cases ({len(failed_cases)}):")
        print("-" * 40)
        for case in failed_cases:
            print(f"\nCase {case['test_case']}:")
            print(f"  Input: \"{case['input']}\"")
            print(f"  Expected: '{case['expected']}'")
            print(f"  Got: '{case['predicted']}'")
            print(f"  Confidence: {case['confidence']:.3f}")

    # Show successful cases summary
    successful_cases = [r for r in results if r['is_correct']]
    if successful_cases:
        print(f"\n‚úÖ Successful Predictions: {len(successful_cases)}/{total_tests}")
        avg_confidence = sum(r['confidence'] for r in successful_cases) / len(successful_cases)
        print(f"   Average confidence for correct predictions: {avg_confidence:.1%}")

    return accuracy

# Display the summary. The returned percentage is kept in the global
# `accuracy`, which the save-results cell writes to the JSON report.
accuracy = display_performance_summary(results, correct_count)


PERFORMANCE SUMMARY (Single Symptom Prediction)

Total Test Cases: 20
Correct Predictions: 12
Incorrect Predictions: 8

üéØ Accuracy: 60.0%

Performance Grade:
üëç Acceptable (60.0%)

‚ùå Failed Cases (8):
----------------------------------------

Case 2:
  Input: "My head is pounding and throbbing with pain"
  Expected: 'headache'
  Got: 'neck pain'
  Confidence: 0.903

Case 8:
  Input: "I have red bumps all over my skin"
  Expected: 'rash'
  Got: 'skin irritation'
  Confidence: 0.949

Case 10:
  Input: "My throat is really sore when I swallow"
  Expected: 'sore throat'
  Got: 'throat irritation'
  Confidence: 0.946

Case 12:
  Input: "I've been having trouble sleeping"
  Expected: 'insomnia'
  Got: 'sleepiness'
  Confidence: 0.929

Case 13:
  Input: "My joints are swollen and painful"
  Expected: 'joint pain'
  Got: 'joint swelling'
  Confidence: 0.976

Case 15:
  Input: "I've been throwing up multiple times today"
  Expected: 'vomiting'
  Got: 'diarrhea'
  Confidence: 0.721

Case

## 8. Detailed Results Analysis

In [8]:
# Tabulate the per-case records for easier inspection
import pandas as pd

results_df = pd.DataFrame(results)

print("\nDETAILED RESULTS TABLE:")
print("="*80)

# Compact view: confidence rendered as a percentage string and the boolean
# outcome replaced by a tick/cross `match` column.
summary_df = (
    results_df[['test_case', 'expected', 'predicted', 'confidence', 'is_correct']]
    .assign(
        confidence=lambda frame: frame['confidence'].map(lambda score: f"{score:.1%}"),
        match=lambda frame: frame['is_correct'].map(lambda passed: '‚úì' if passed else '‚úó'),
    )
    .drop(columns='is_correct')
)

print(summary_df.to_string(index=False))


DETAILED RESULTS TABLE:
 test_case            expected            predicted confidence match
         1            diarrhea         constipation      80.7%     ‚úó
         2            headache            neck pain      90.3%     ‚úó
         3              nausea               nausea      85.3%     ‚úì
         4               fever          feeling hot      88.5%     ‚úó
         5               cough             wheezing      87.0%     ‚úó
         6      abdominal pain lower abdominal pain      86.9%     ‚úì
         7           dizziness            dizziness      90.3%     ‚úì
         8                rash      skin irritation      94.9%     ‚úó
         9             fatigue              fatigue      92.9%     ‚úì
        10         sore throat    throat irritation      94.6%     ‚úó
        11 shortness of breath difficulty breathing      93.4%     ‚úì
        12            insomnia           sleepiness      92.9%     ‚úó
        13          joint pain       joint swelling   

## 9. Confidence Distribution Analysis

In [9]:
# Analyze confidence scores
print("\nCONFIDENCE SCORE ANALYSIS:")
print("="*80)


def _print_confidence_stats(title, scores):
    """Print mean/min/max of `scores` under `title`; print nothing if empty.

    np.min and np.max raise ValueError on an empty sequence, so an empty
    group (for example a run with no failures) is skipped rather than
    allowed to crash the cell.
    """
    if not scores:
        return
    print(f"\n{title}:")
    print(f"  Mean confidence: {np.mean(scores):.1%}")
    print(f"  Min confidence: {np.min(scores):.1%}")
    print(f"  Max confidence: {np.max(scores):.1%}")


all_confidences = [r['confidence'] for r in results]
correct_confidences = [r['confidence'] for r in results if r['is_correct']]
incorrect_confidences = [r['confidence'] for r in results if not r['is_correct']]

if not all_confidences:
    # Make the empty case explicit instead of printing only the banner.
    print("\nNo results to analyze.")

_print_confidence_stats("Overall Statistics", all_confidences)
_print_confidence_stats("Correct Predictions", correct_confidences)
_print_confidence_stats("Incorrect Predictions", incorrect_confidences)


CONFIDENCE SCORE ANALYSIS:

Overall Statistics:
  Mean confidence: 90.6%
  Min confidence: 72.1%
  Max confidence: 97.6%

Correct Predictions:
  Mean confidence: 91.0%
  Min confidence: 85.3%
  Max confidence: 97.5%

Incorrect Predictions:
  Mean confidence: 90.4%
  Min confidence: 72.1%
  Max confidence: 97.6%


## 10. Save Results

In [10]:
# Persist the run to a JSON report in the current working directory
import json
from datetime import datetime

# Run metadata first, followed by the full per-case records
save_data = dict(
    test_date=datetime.now().isoformat(),
    model="medical_symptom_matcher",
    mode="single_symptom_prediction",
    accuracy=accuracy,
    correct_count=correct_count,
    total_tests=len(test_cases),
    detailed_results=results,
)

# Serialize to text, then write it out in one go
with open('single_symptom_model_test_results.json', 'w') as f:
    f.write(json.dumps(save_data, indent=2))

print(f"\nüìÅ Results saved to 'single_symptom_model_test_results.json'")
print(f"\nüéØ Final Accuracy: {accuracy:.1f}%")


üìÅ Results saved to 'single_symptom_model_test_results.json'

üéØ Final Accuracy: 30.0%


## 11. Interactive Testing (Single Symptom)

In [11]:
def interactive_single_test():
    """
    Prompt for patient descriptions in a loop and print the #1 symptom.

    Blank input is ignored. Entering 'quit', 'exit' or 'q' (any casing)
    prints a farewell and ends the session. Returns None.
    """
    exit_words = ('quit', 'exit', 'q')
    divider = "-" * 40

    print("\n" + "="*80)
    print("INTERACTIVE SINGLE SYMPTOM TESTING")
    print("="*80)
    print("Enter patient descriptions to see the #1 predicted symptom.")
    print("Type 'quit' to exit.\n")

    while True:
        utterance = input("\nPatient says: ")

        if utterance.lower() in exit_words:
            print("Goodbye!")
            return

        # Whitespace-only input falls through to the next prompt
        if utterance.strip():
            top = extract_top_symptom(utterance)

            print("\n" + divider)
            print(f"üéØ Primary Symptom: {top['symptom']}")
            print(f"üìä Confidence: {top['percentage']}")
            print(divider)

# Uncomment to run interactive mode
# interactive_single_test()

## 12. Quick Test Examples

In [12]:
# One-off check using the example input from the BioGPT tests
test_input = "I have been pooping a lot, and it's very watery"
expected = "diarrhea"

print("Quick Test Example:", "="*60, sep="\n")

print(f"Input: \"{test_input}\"")
print(f"Expected: {expected}\n")

# Top-1 prediction for the example input
result = extract_top_symptom(test_input)
print(f"Model Prediction: {result['symptom']}")
print(f"Confidence: {result['percentage']}")

# Score it with the same matcher the test suite uses
is_match, reason = check_match(result['symptom'], expected)
print(f"\n‚úÖ SUCCESS - {reason}" if is_match else f"\n‚ùå FAILED - {reason}")

Quick Test Example:
Input: "I have been pooping a lot, and it's very watery"
Expected: diarrhea

Model Prediction: constipation
Confidence: 80.7%

‚ùå FAILED - No match. Expected: 'diarrhea', Got: 'constipation'
