Design Rapid Experimentation Notebook for Generative AI Model Prototyping

In [None]:
# --------------------------------------------------------------
# 1. Notebook Setup and Imports
# --------------------------------------------------------------

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from datetime import datetime
import random
import os

# The following import brings 'display' and 'Markdown' into scope for later cell compatibility
from IPython.display import display, Markdown  # Fix: Needed so display() does not throw NameError

# Reproducibility: one shared seed is pushed into every random number
# generator this notebook relies on (NumPy, PyTorch, Python's stdlib).
seed = 42
for _apply_seed in (np.random.seed, torch.manual_seed, random.seed):
    _apply_seed(seed)

# Remove Jupyter/IPython-specific magic (%matplotlib inline) to ensure compatibility
# Fix: Do not use '%matplotlib inline' as it will fail outside of Jupyter. Instead, rely on plt.show() for display.

# INSTRUCTIONS:
# 1. Ensure all the above imports are present at the top of your notebook to access all required libraries for prototyping generative AI models.
# 2. The "seed" variable is set for reproducibility; you can reuse this variable to seed other frameworks or operations requiring deterministic behavior.
# 3. Use 'display' and 'Markdown' from IPython.display whenever you need to elegantly format notebook outputs or results.
# 4. Use 'plt.show()' instead of '%matplotlib inline' for displaying plots, especially if using this notebook outside of Jupyter environments.
# 5. You can now proceed to prototype your generative AI experiments by adding further code blocks in this notebook after this setup cell.

Exploratory Data Analysis of Ingested Healthcare Data using Visualization Tools

In [None]:
# --------------------------------------------------------------
# 2. Data Loading (Assumed present per instructions)
# --------------------------------------------------------------

# For demonstration purposes, we simulate the presence of both real healthcare data and a synthetic dataset.
# Replace these loading steps with your actual data ingestion code as appropriate.

# Below is a function intended to generate a synthetic healthcare dataset.
def generate_synthetic_healthcare_data(n=1000):
    """Build a simulated healthcare DataFrame with ``n`` rows.

    Columns produced:
      - 'age': normally distributed, rounded and clipped to 18-95 (int).
      - 'gender': 'Male' / 'Female', drawn uniformly.
      - 'bmi': normally distributed, clipped to 15-50, one decimal.
      - 'blood_pressure': normally distributed, clipped to 80-200.
      - 'cholesterol': normally distributed, clipped to 100-350.
      - 'has_diabetes': 0/1 drawn from a binomial with p=0.15.
      - 'smoking_status': 'never' / 'former' / 'current'.
      - 'hospital_visits_last_year': Poisson distributed counts.

    About 5% of the rows in 'bmi' and (independently) in 'blood_pressure'
    are replaced by NaN to mimic missing measurements.

    Draws come from NumPy's global random state, so a prior
    ``np.random.seed(...)`` call makes the output reproducible.

    Args:
      n (int): Number of rows to generate.
    Returns:
      pandas.DataFrame with the eight columns listed above.
    """
    data = pd.DataFrame({
        'age': np.clip(np.random.normal(50, 18, n).round(), 18, 95).astype(int),
        'gender': np.random.choice(['Male', 'Female'], size=n),
        'bmi': np.clip(np.random.normal(27, 5, n), 15, 50).round(1),
        'blood_pressure': np.clip(np.random.normal(125, 15, n), 80, 200).round(0),
        'cholesterol': np.clip(np.random.normal(200, 35, n), 100, 350).round(0),
        'has_diabetes': np.random.binomial(1, 0.15, n),
        'smoking_status': np.random.choice(
            ['never', 'former', 'current'], size=n, p=[0.6, 0.25, 0.15]
        ),
        'hospital_visits_last_year': np.random.poisson(1.5, n),
    })

    # Blank out a fixed share of rows per column; sampling without replacement
    # keeps the number of missing cells exact instead of merely expected.
    n_missing = int(round(n * 0.05))
    if n_missing:
        for col in ('bmi', 'blood_pressure'):
            missing_rows = np.random.choice(n, size=n_missing, replace=False)
            data.loc[missing_rows, col] = np.nan

    return data

# This function should create a modified version of real_data, simulating a synthetic variant.
def generate_synthetic_version(real_data):
    """Return a noisy, fully-imputed copy of ``real_data``.

    For each of 'bmi' and 'blood_pressure' that is present, missing values
    are filled with the column mean and zero-mean Gaussian noise (standard
    deviation = 5% of the column's own standard deviation) is added. All
    other columns are copied unchanged. The input frame is not modified.

    Args:
      real_data (pandas.DataFrame): Source data.
    Returns:
      pandas.DataFrame of the same shape as ``real_data``.
    """
    synthetic = real_data.copy()
    for col in ('bmi', 'blood_pressure'):
        if col not in synthetic.columns:
            continue
        values = synthetic[col].astype(float)
        values = values.fillna(values.mean())
        spread = values.std()
        # std() is NaN for fewer than two rows; use zero noise in that case
        # so the column is not turned into NaN.
        scale = 0.05 * spread if np.isfinite(spread) else 0.0
        synthetic[col] = values + np.random.normal(0.0, scale, size=len(values))
    return synthetic

# For actual deployments, replace the below lines with routines that load your own data files, e.g.:
# healthcare_data = pd.read_csv('healthcare_data.csv')
# synthetic_data = pd.read_csv('synthetic_dataset.csv')

# INSTRUCTIONS:
# - Call your implemented functions above to generate or load the real and synthetic datasets.
# - Example variable names: healthcare_data, synthetic_data
# - Use print statements (as shown) to display the shape of each dataset.
# - Display the head (first few rows) of the healthcare_data DataFrame to inspect the sample data.
# Replace the following with your own loading/invocation logic.
# healthcare_data = generate_synthetic_healthcare_data()
# synthetic_data = generate_synthetic_version(healthcare_data)
# print('Real healthcare data shape:', healthcare_data.shape)
# print('Synthetic data shape:', synthetic_data.shape)
# healthcare_data.head()


Visualizing Distributions: Demographics and Health Metrics

In [None]:
# --------------------------------------------------------------
# 3. Age Distribution & Gender Balance (Matplotlib & Seaborn)
# --------------------------------------------------------------

import matplotlib.pyplot as plt
import seaborn as sns

# Setup for visualization aesthetics
def set_plot_style():
    """Apply the notebook-wide seaborn/matplotlib styling.

    Only global configuration is changed; no figure is created.
      - seaborn theme: 'whitegrid' style, 'muted' palette, with color codes
        enabled so shorthand colors such as 'b' and 'r' follow the palette.
      - matplotlib rcParams: default figure size plus axis label and title
        font sizes.
    """
    sns.set_theme(style='whitegrid', palette='muted', color_codes=True)
    # Applied after set_theme, which itself rewrites rcParams.
    plt.rcParams.update({
        'figure.figsize': (10, 6),
        'axes.labelsize': 12,
        'axes.titlesize': 14,
    })

set_plot_style()

# -- Age Distribution --
# INSTRUCTION:
# 1. Create a new figure for the age distribution plot.
# 2. Use seaborn's histplot to plot a histogram for the 'age' column of the healthcare_data DataFrame.
#    - Set the number of bins (e.g., 30), enable kernel density estimation (kde=True).
#    - Choose appropriate color settings (e.g., blue tones) and set edgecolor for the bars.
# 3. Set plot title, x- and y-axis labels, and adjust layout for tightness.
# 4. Show the plot.
# (Do not include any analysis or Markdown display here.)

# Example variable (already implemented, do not change):
# healthcare_data: DataFrame containing at least the 'age' and 'gender' columns

# -- Gender Balance Pie Chart --
# INSTRUCTION:
# 1. Calculate gender_counts from the 'gender' column of healthcare_data using value_counts().
# 2. Create a new figure.
# 3. Plot a pie chart using plt.pie() with:
#     - 'gender_counts' for values.
#     - Labels corresponding to gender types (gender_counts.index).
#     - Display percentage ('autopct'), set a start angle, and specify colors if desired.
# 4. Set plot title, ensure the pie is drawn as a circle (axis('equal')), and adjust layout.
# 5. Show the plot.
# (Do not include any analysis or Markdown display here.)


Visualizing Feature Relationships and Missingness

In [None]:
# --------------------------------------------------------------
# 4. BMI vs Blood Pressure Scatter (Seaborn), Missing Data Heatmap (Matplotlib)
# --------------------------------------------------------------

# -- Relationship between BMI and Blood Pressure --
# Instructions:
# - Create a scatter plot to visualize the relationship between BMI and blood pressure.
# - Use Seaborn's scatterplot function, with 'bmi' on the x-axis and 'blood_pressure' on the y-axis.
# - Color the points by the 'has_diabetes' column, assigning appropriate palettes for diabetic and non-diabetic cases.
# - Set point transparency with alpha for better visibility.
# - Title the plot and label the axes ('BMI', 'Blood Pressure (mmHg)').
# - Add a legend identifying diabetic status ('No', 'Yes').
# - Use plt.tight_layout() to adjust spacing, then plt.show() to display.
# - Optionally, display a Markdown insight summarizing relationships and outliers you observe from the plot.
#
# Variables provided:
# - healthcare_data (DataFrame): Data source containing 'bmi', 'blood_pressure', and 'has_diabetes' columns.

# -- Missing Data Pattern Visualization --
# Instructions:
# - Visualize patterns of missing data using a heatmap.
# - Use the .isnull() method on healthcare_data to generate a boolean DataFrame indicating missing values.
# - Use Seaborn's heatmap to plot the missing values; adjust figure size as needed.
# - Remove color bar (cbar=False), and set yticklabels to False to hide row labels for clarity.
# - Use a color map such as 'mako_r', and set column labels (xticklabels) to show feature names.
# - Title the plot ('Missing Data Pattern in Healthcare Dataset'), label the x-axis ('Features').
# - Use plt.tight_layout() and plt.show() to render the plot.
# - Optionally, display a Markdown block summarizing which columns have missing data and implications for modeling.
#
# Variables provided:
# - healthcare_data (DataFrame): The dataset to analyze for missing data.


Comparing Real vs Synthetic Data: Feature Distributions and Correlations

In [None]:
# --------------------------------------------------------------
# 5. Comparing Real and Synthetic Data: Distribution Overlays & Correlation Matrix
# --------------------------------------------------------------

# Variables already present:
# continuous_cols = ['age', 'bmi', 'blood_pressure', 'hospital_visits_last_year']
# healthcare_data (DataFrame with real data)
# synthetic_data (DataFrame with synthetic data)

# Import necessary libraries (ensure these are imported at the start of your notebook/script):
# import matplotlib.pyplot as plt
# import seaborn as sns
# from IPython.display import display, Markdown

# ------------------- Distribution Overlays for Continuous Features -------------------
# INSTRUCTIONS:
# 1. Create a figure with an appropriate size for multiple subplots (e.g., 2 rows x 2 columns).
# 2. Loop over each column name in 'continuous_cols'. For each iteration:
#    a. Add a subplot for the current feature.
#    b. Plot the kernel density estimate (KDE) of the real data (from healthcare_data[col]), labeling it 'Real' and coloring it blue ('b').
#    c. Plot the KDE of the synthetic data (from synthetic_data[col]), labeling it 'Synthetic' and coloring it red ('r') with a dashed linestyle ('--').
#    d. Set the title of the subplot to indicate which feature is being shown.
#    e. Label the x-axis accordingly and add a legend to distinguish real vs synthetic.
# 3. Adjust subplot layout for a clean appearance.
# 4. Display the final overlay plot.

# (After plotting) In a markdown cell or using display(Markdown()), add insights/summary on how the distributions compare. Highlight any visible similarities or differences in the feature distributions between real and synthetic data and explain why this matters for data validation.

# ------------------- Correlation Heatmap: Real vs Synthetic -------------------
# INSTRUCTIONS:
# 1. Create a figure with an appropriate size and two side-by-side subplots (1 row, 2 columns).
# 2. In the first subplot:
#    a. Compute and display the correlation matrix (using .corr()) for the columns in 'continuous_cols' from 'healthcare_data'.
#    b. Plot the heatmap using seaborn with annotations, a suitable color map (e.g., 'Blues'), and a value range from -1 to 1.
#    c. Set the title to 'Correlation Matrix - Real Data'.
# 3. In the second subplot:
#    a. Repeat the above steps for 'synthetic_data', using a different color map (e.g., 'Reds'), and set the title accordingly.
# 4. Adjust layout and display the combined heatmaps.

# (After plotting) In a markdown cell or using display(Markdown()), add insights on the similarity or differences in the correlation structure of continuous features between real and synthetic datasets. Discuss the implications for downstream modeling and data use if correlations differ.


Summary of EDA Findings & Recommendations

In [None]:
# --------------------------------------------------------------
# 6. Summary and Recommendations (Markdown)
# --------------------------------------------------------------

# INSTRUCTIONS:
# In the cell below, provide a comprehensive Markdown summary of your findings from the Exploratory Data Analysis (EDA) process.
# You should include:
#  - Key findings from the data analysis, such as demographic breakdowns, feature associations, missing data, and notable patterns or anomalies.
#  - Areas where data quality could be improved (e.g., handling of missing values, detection/treatment of outliers).
#  - Any insights regarding the differences (if any) between synthetic and real data distributions or relationships.
#  - Clear recommendations for the next steps before modeling (such as imputation, further analysis, or cleaning actions), and considerations for using synthetic data if applicable.
#  - Make use of bullet points or subheadings for clarity.

# To render Markdown in a Jupyter notebook, use:
# from IPython.display import Markdown, display
# display(Markdown('''
# Your structured summary goes here.
# '''))

# Replace the multiline comment above with your detailed, actionable EDA summary and recommendations using Markdown.

Prototyping Clinical NLP Pipeline Using a Reusable Notebook

In [None]:
# --------------------------------------------------------------
# 7. Prototyping Clinical NLP Pipeline Using a Reusable Notebook
# --------------------------------------------------------------

# --- Imports for Clinical NLP ---
import os
import sys
import torch
import pandas as pd
from transformers import (AutoTokenizer, AutoModelForTokenClassification, pipeline, AutoConfig)
from collections import defaultdict
from IPython.display import display, Markdown, HTML
import seaborn as sns
import matplotlib.pyplot as plt
import random

# --- Reproducibility: a single seed for PyTorch and Python's stdlib RNG ---
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)

# --------------------------------------------------------------
# 1. Sample Clinical Notes (embedded so the notebook is self-contained)
# --------------------------------------------------------------

# Miniature corpus of clinical note snippets used by the NER prototyping below
clinical_notes = [
    "The patient is a 63-year-old male with a history of hypertension, admitted for chest pain. Started on aspirin, atorvastatin, and lisinopril.",
    "This 44-year-old female with type 2 diabetes and obesity presents for evaluation. Metformin continued, blood pressure controlled.",
    "Patient admitted with shortness of breath and has chronic kidney disease stage 3b. Advised salt restriction and spironolactone.",
    "Discharge summary: 68-year-old with atrial fibrillation treated with rivaroxaban. Next visit in 3 months.",
]

# One row per note; note_id is a 1-based running number.
clinical_notes_df = pd.DataFrame(
    {
        'note_id': range(1, len(clinical_notes) + 1),
        'note_text': clinical_notes,
    }
)

# Show the corpus as a table
print('Sample Clinical Notes:')
display(clinical_notes_df)

# --------------------------------------------------------------
# 2. Define Utility for NER Display with Highlighting
# --------------------------------------------------------------
def highlight_entities(note, entities):
    """
    Highlights extracted entities directly within the clinical note text for in-notebook visualization.

    Entities are processed in order of their 'start' offset. Text outside the
    entities and the entity text itself are HTML-escaped. Entities that lack
    'start'/'end', are empty, or overlap an already highlighted span are skipped.

    Args:
      note (str): Original note text.
      entities (list): List of dicts, each containing 'start', 'end' and a label
        under 'entity_group' or 'entity' (either key is accepted). May be
        None or empty.
    Returns:
      IPython.display.HTML object wrapping the note with highlighted entities.
    """
    # Local import: the stdlib escaper is only needed by this helper.
    from html import escape

    # Background color per entity label; labels not listed use the default.
    colored = {
        'Disease_disorder': '#ffb3ba',
        'Sign_symptom': '#ffdfba',
        'Medication': '#baffc9',
        'Diagnostic_procedure': '#bae1ff',
        'Biological_structure': '#e0bbe4',
        'Age': '#fff5ba',
        'Sex': '#d5f4e6',
    }
    default_color = '#a3d2ca'

    chunks = []
    idx = 0  # position in `note` up to which output has been emitted

    located = [
        ent for ent in (entities or [])
        if ent.get('start') is not None and ent.get('end') is not None
    ]
    for ent in sorted(located, key=lambda e: e['start']):
        start, end = int(ent['start']), int(ent['end'])
        # Skip empty spans and spans overlapping text that was already emitted;
        # nesting spans would otherwise duplicate parts of the note.
        if end <= start or start < idx:
            continue

        chunks.append(escape(note[idx:start]))

        label = str(ent.get('entity_group') or ent.get('entity') or '')
        color = colored.get(label, default_color)
        # Slice the note rather than using ent['word'] so the displayed text
        # always matches the original characters.
        surface = note[start:end]
        chunks.append(
            f'<span style="background-color:{color}; padding:1px 3px; border-radius:3px;">'
            f'{escape(surface)}<sub>{escape(label)}</sub></span>'
        )
        idx = end

    # Remaining text after the last entity
    chunks.append(escape(note[idx:]))
    return HTML(''.join(chunks))

# --------------------------------------------------------------
# 3. Prototyping: Transformer-Based NER Models for Clinical Text
# --------------------------------------------------------------
# We'll use at least two different transformer models/configs to compare performance.
# 1. 'emilyalsentzer/Bio_ClinicalBERT' (Clinical BERT, medical domain)
# 2. 'd4data/biomedical-ner-all' (Biomedical/Drug/Diagnosis NER, pre-finetuned)

# Each entry describes one NER model to try: a display label, the model
# identifier, keyword arguments for the NER pipeline, and free-text notes.
model_configs = [
    dict(
        label='ClinicalBERT (emilyalsentzer/Bio_ClinicalBERT)',
        model_name='emilyalsentzer/Bio_ClinicalBERT',
        ner_pipe_kwargs=dict(aggregation_strategy='simple'),
        notes='A generic clinical BERT. May require extra fine-tuning or mapping, but is widely used for medical notes.',
    ),
    dict(
        label='BioMed NER (d4data/biomedical-ner-all)',
        model_name='d4data/biomedical-ner-all',
        ner_pipe_kwargs=dict(aggregation_strategy='simple'),
        notes='Pre-finetuned model for BioNLP NER (disease, drug, chemical, gene, anatomy, etc). Fast rapid prototyping.',
    ),
]

# Accumulates extracted entity lists; unseen keys start as an empty list.
extracted_entities_rounds = defaultdict(list)

# Instructions for the next block:
# 1. For each config (in model_configs), display its label and indicate it's loading the model.
# 2. Load the tokenizer and model using the model name from the config.
# 3. Create an NER pipeline with these and any kwargs provided.
# 4. If loading fails, catch and display exception and move to the next config.
# 5. For each note in clinical_notes_df:
#    a. Extract the note text.
#    b. Extract entities for the note using the NER pipeline.
#    c. Prepare a summary DataFrame (with NER results, note_id, and note).
#    d. Append summary to results_summary, and save entity extraction to extracted_entities_rounds.
# 6. After processing all notes, combine result summaries into all_summary.
# 7. Display a Markdown heading and the resulting summary table (if it's not empty), otherwise indicate no entities were extracted.
# 8. For the first two sample notes, display the note and show highlighted HTML using the highlight_entities utility.
# 9. Plot a bar chart of entity label frequency if extracted entities are found.
#10. Finally, display model notes in Markdown for the current config.

# --------------------------------------------------------------
# 4. Comparative Analysis of Extracted Entities
# --------------------------------------------------------------
# Instructions:
# 1. Initialize lists to store comparison information (compare_table, entity_set_by_model).
# 2. For each model config:
#    a. Get model label and list of extracted entity rounds.
#    b. Collect all entity mentions (as strings) for that model.
#    c. Store set of entity identifiers and collect unique labels and count.
# 3. Combine comparison information into a DataFrame (compare_df).
# 4. Display Markdown heading and the compare_df table.
# 5. If using two models, compute and display overlap and unique entity counts between them in Markdown.

# --------------------------------------------------------------
# 5. Documentation: Model Choices, Effectiveness & Limitations
# --------------------------------------------------------------
# Instructions:
# 1. Display Markdown cell documenting:
#    - The differences between model choices made earlier
#    - Which model was most effective and why
#    - Limitations of the approach (as per guidelines)
#    - A summary of the reusable workflow and suggestions for how to extend it


Synthesis and Interpretation: Integrated Data Visualization for Prototyping Results

In [None]:
# --------------------------------------------------------------
# 8. Integrated Visual Dashboard: Synthesizing EDA, Generative & NLP Prototyping Results
# --------------------------------------------------------------

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from IPython.display import display, Markdown
from matplotlib.gridspec import GridSpec

# Assumes the following already exist (previous cells):
# - healthcare_data: pandas.DataFrame of real data
# - synthetic_data: pandas.DataFrame of synthetic data
# - clinical_notes_df: pandas.DataFrame of clinical texts
# - extracted_entities_rounds: dict of model_label -> list of NER entity spans per note

# -- 1. Demographics Fidelity Overlay --
# INSTRUCTION: Overlay the age distributions from the real (healthcare_data['age'])
# and synthetic (synthetic_data['age']) datasets on a histogram or KDE plot for visual comparison.
# Do the same for blood pressure (healthcare_data['blood_pressure'], synthetic_data['blood_pressure']).
# Annotate axes, legends, and provide titles to indicate the visualization purpose.
# After plotting, display a markdown cell summarizing interpretation or observed alignment/discrepancies.

# -- 2. Feature Correlation Matrix with Fidelity Delta --
# INSTRUCTION: Compute correlation matrices for selected columns (
# e.g., ['age', 'bmi', 'blood_pressure', 'hospital_visits_last_year']) from both
# healthcare_data and synthetic_data. Calculate and plot the difference (delta) between
# the two matrices. Use heatmaps with proper titles and colorbars to visually compare.
# Add markdown output summarizing insights and areas for further tuning.

# -- 3. Model Iterations: Tracking Synthetic Data Quality Over Rounds
# INSTRUCTION: If you have multiple synthetic datasets from different generative model versions,
# overlay the distributions of a feature (e.g., 'bmi') from real and the different synthetic datasets for comparison.
# Define a function (e.g., generate_v2_synth) to simulate/model the generation
# of a new synthetic dataset based on the real data.
# Plot and compare the resulting distributions to track improvements. Annotate the plot and add a markdown cell with interpretation.

# Example function definition stub for generating synthetic data:
def generate_v2_synth(real_data):
    """Create a synthetic variant of ``real_data`` from its column statistics.

    Every floating-point column has its missing values replaced by the column
    mean and receives zero-mean Gaussian noise whose standard deviation is 2%
    of the column's own standard deviation. Integer, boolean and non-numeric
    columns are copied unchanged so counts, flags and categories stay valid.
    The input frame is not modified.

    Args:
      real_data (pandas.DataFrame): Real dataset to imitate.
    Returns:
      pandas.DataFrame with the same shape and columns as ``real_data``.
    """
    synthetic = real_data.copy()
    for col in synthetic.select_dtypes(include=[np.floating]).columns:
        filled = synthetic[col].fillna(synthetic[col].mean())
        spread = filled.std()
        # std() is NaN for fewer than two rows (or an all-NaN column);
        # use zero noise then instead of propagating NaN.
        scale = 0.02 * spread if np.isfinite(spread) else 0.0
        synthetic[col] = filled + np.random.normal(0.0, scale, size=len(filled))
    return synthetic

# -- 4. Composite Dashboard: Clinical NLP Results & EDA Join --
# INSTRUCTION: Aggregate entity mentions from the extracted_entities_rounds dictionary for a selected NER model.
# Flatten the list of entity records into a pandas DataFrame with columns ['note_ix', 'entity', 'word'].
# Visualize the most frequent entity types with a bar plot. Annotate the plot and output markdown interpretation.
# Then, join structured EHR data (e.g., diabetes diagnosis count in healthcare_data) with NLP-extracted disease mentions for comparison.
# Plot the counts side by side and provide descriptive interpretation.

# -- 5. Integrated Table: Prototyping Journey and Reusability Summary --
# INSTRUCTION: Create a pandas DataFrame tracking key stages, outputs, and reusable blocks of the prototyping process.
# Display this as a Markdown section and a table to guide notebook usability and explain modular design.
# Summarize in markdown how each section can be adapted or reused in new projects or for reporting.

# -- 6. Exporting Visualizations for Reporting/Reproducibility --
# INSTRUCTION: Capture and export the last (or key) Matplotlib figure to an image file (e.g., PNG) for inclusion in external documentation.
# Use try/except to catch possible errors, and print a status message. Document/recommend that all displayed visualizations can be exported
# as reusable assets for regulatory or hand-off reporting, and summarize this in a final markdown cell.


Perform Exploratory Data Analysis (EDA) Using Matplotlib and Seaborn

In [None]:
# Exploratory Data Analysis on Healthcare Datasets

# In this notebook, you will conduct exploratory data analysis (EDA) on structured healthcare data using matplotlib and seaborn.
# Focus on uncovering trends, distributions, and potential anomalies in the data.

# --- 1. Imports & Setup ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Probe for an IPython/Jupyter session: calling get_ipython() raises
# NameError when the name is not defined (plain Python interpreter).
try:
    get_ipython()
except NameError:
    # Not running inside IPython/Jupyter -- nothing to configure.
    pass
# Inside a Jupyter Notebook, inline plotting can be enabled by uncommenting:
# %matplotlib inline

# Use seaborn's "whitegrid" theme for all subsequent plots
sns.set_theme(style="whitegrid")

# --- 2. Data Loading ---
# Create a synthetic healthcare dataset with relevant columns for EDA:
# - 'patient_id': Unique identifier for each patient
# - 'age': Age of the patient
# - 'gender': Gender of the patient ('Male' or 'Female')
# - 'admission_type': Type of admission ('Emergency', 'Elective', or 'Urgent')
# - 'systolic_bp': Systolic blood pressure
# - 'diastolic_bp': Diastolic blood pressure
# - 'glucose': Blood glucose level
# - 'length_of_stay': Length of hospital stay (days)
# - 'discharge_status': Status at discharge ('Home', 'Transferred', 'Deceased')
#
# Use numpy random functions to generate plausible synthetic data for each column.
# Store the result as a dictionary and convert to a pandas DataFrame named 'health_df'.
#
# Example:
# data = {
#     'patient_id': range(...),
#     'age': np.random.normal(...),
#     ...
# }
# health_df = pd.DataFrame(data)

# After creating the DataFrame, implement:
# - Clipping and rounding on numerical columns to keep values plausible (e.g., age between 0 and 100, rounding for lab values, making sure length_of_stay is positive and rounded)
# Example:
# health_df['age'] = health_df['age'].clip(lower=0, upper=100)
# health_df['glucose'] = np.round(health_df['glucose'], 1)
# ...

# --- 3. Preview & Summary ---
#
# Import the 'display' function from IPython.display to display pandas DataFrames in a readable format.
#
# Show a preview of the health_df DataFrame using display(health_df.head()).
# Print the shape of the dataset to understand dimensionality.
# Use display(health_df.describe(include='all')) to output the dataset summary, describing statistics for both numerical and categorical columns.
#
# Example steps:
# from IPython.display import display
# display(health_df.head())    # View the first few rows
# print('Shape:', health_df.shape)    # Output the number of rows and columns
# display(health_df.describe(include='all'))    # Get summary statistics for all columns

# Continue to further steps by visualizing distributions, relationships, and looking for anomalies or interesting patterns in the data using matplotlib and seaborn.

Experiment with Reusable NLP Pipeline Notebook for Clinical Notes

In [None]:
# --- Reusable NLP Pipeline for Clinical Text Understanding ---
#
# This notebook demonstrates rapid experimentation with transformer-based NLP models for
# clinical named entity recognition (NER) and information extraction. You can select different models, adjust pipeline parameters,
# and observe effects on medical entity extraction (diagnoses, symptoms, medications, etc).
#
# --- 1. Setup & Imports ---

import os
import random
import numpy as np
import pandas as pd
from typing import List, Dict
from IPython.display import display, Markdown

import warnings
warnings.filterwarnings('ignore')

# Hugging Face transformers for modern NER
from transformers import pipeline

# --- 2. Load Sample, De-identified Clinical Notes ---
# For demonstration, we'll use synthetic, de-identified clinical text snippets.
# In practice, replace 'clinical_notes_df' with your real data (ensuring all PHI is properly handled).

# Synthetic, de-identified note snippets used as the demo corpus
sample_notes = [
    "Patient presents with 2-day history of chest pain. Past history of hypertension, diabetes. Medications include metformin and lisinopril. EKG shows normal sinus rhythm.",
    "Admitted for acute shortness of breath. Started on intravenous furosemide. Oxygen saturation 91%. Diagnosed with heart failure exacerbation. Discharged on spironolactone.",
    "Complaints of worsening cough. Prescribed azithromycin for suspected pneumonia. No prior COPD or asthma noted.",
    "Severe headache, vision changes. Brain MRI scheduled. No evidence of infection. Monitored for possible migraine or vascular event.",
    "Reports chronic back pain, managed on acetaminophen. MRI lumbar shows mild spondylosis.",
]

# One row per note, numbered from 1
clinical_notes_df = pd.DataFrame(
    {
        'note_id': range(1, len(sample_notes) + 1),
        'note_text': sample_notes,
    }
)

# Render a heading followed by the notes table
display(Markdown('### Sample De-identified Clinical Notes'))
display(clinical_notes_df)

# --- 3. NLP Model Selection ---
# Define several transformer-based NER pipelines (from HuggingFace Hub), suitable for clinical/biomedical entity extraction.
# You may experiment with different models for comparison.

# Catalogue of candidate NER models, keyed by model identifier.
# Each entry carries a short description, the model name handed to the
# pipeline, and a human-readable summary of the entity types it targets.
# NOTE(review): confirm each identifier still resolves on the Hugging Face Hub.
available_models = {
    'distilbert-base-uncased-finetuned-ner': {
        'description': 'General-purpose NER (baseline)',
        'model_name': 'distilbert-base-uncased-finetuned-ner',
        'entity_types': 'person, org, loc, misc (general NER; use as control)'},
    'dslim/bert-base-NER': {
        'description': 'BERT for general NER (benchmark)',
        'model_name': 'dslim/bert-base-NER',
        'entity_types': 'person, org, loc, misc (general NER; control)'},
    'emilyalsentzer/Bio_ClinicalBERT': {
        'description': 'Bio_ClinicalBERT: Clinical/biomedical text',
        'model_name': 'emilyalsentzer/Bio_ClinicalBERT',
        'entity_types': 'biomedical: diseases, symptoms, medications (may require fine-tuning, demo only)'},
    'kamalkraj/BioBERT-NER': {
        'description': 'BioBERT NER (biomedical baseline)',
        'model_name': 'kamalkraj/BioBERT-NER',
        'entity_types': 'biomedical: diseases, chemicals, genes, symptoms'},
    # Add more domain-specific models as desired
}

# List the catalogue with the index used for selection below
print('Available models:')
for idx, (k, v) in enumerate(available_models.items()):
    print(f"[{idx}] {k} - {v['description']} ({v['entity_types']})")

# You may modify here to experiment
model_keys = list(available_models.keys())
model_index = 2  # Default: use 'emilyalsentzer/Bio_ClinicalBERT' (or change to try others)
model_choice = model_keys[model_index]
model_info = available_models[model_choice]

# The line breaks are written as \n escapes: a quoted (non-triple) string
# literal cannot span physical lines.
print(
    f"\nSelected model: {model_info['model_name']}"
    f"\n  Description: {model_info['description']}"
    f"\n  Entity Types: {model_info['entity_types']}"
)

# --- 4. Build the Pipeline ---

# Parameters to experiment with. They are defined before the pipeline is
# built so that editing aggregation_strategy here actually changes ner_pipe.
aggregation_strategy = 'simple'  # Try 'none', 'first', 'average', 'simple' (see documentation)
max_length = 256  # Adjust as needed for context window (esp. for long clinical notes)

# NER pipeline for the selected model; the same identifier is used for the
# model weights and the tokenizer.
ner_pipe = pipeline(
    'ner',
    model=model_info['model_name'],
    tokenizer=model_info['model_name'],
    aggregation_strategy=aggregation_strategy,
)

# --- 5. Run Entity Extraction Pipeline ---
def extract_entities(texts: List[str],
                    nlp_pipe,
                    aggregation_strategy: str = 'simple',
                    max_length: int = 256) -> List[List[Dict]]:
    """Apply NER pipeline to a list of texts, return extracted entities for each.

    Args:
      texts: Input texts, processed one at a time and in order.
      nlp_pipe: Callable taking a text (plus optional keyword arguments) and
        returning an iterable of entity dicts, e.g. a Hugging Face NER pipeline.
      aggregation_strategy: Forwarded to ``nlp_pipe`` as a keyword argument
        when the callable accepts it.
      max_length: Forwarded to ``nlp_pipe`` as a keyword argument when the
        callable accepts it.
    Returns:
      A list with one entry per input text: the list of entity dicts for that
      text, or an empty list if prediction failed for it.
    """
    # Keyword sets tried from most to least specific. A callable that rejects
    # a keyword raises TypeError, in which case the next, smaller set is used.
    # NOTE(review): Hugging Face token-classification pipelines may not accept
    # max_length at call time -- confirm for the installed version.
    kwarg_attempts = (
        {'aggregation_strategy': aggregation_strategy, 'max_length': max_length},
        {'aggregation_strategy': aggregation_strategy},
        {},
    )

    results = []
    for text in texts:
        entities = []  # remains empty if every attempt fails
        for kwargs in kwarg_attempts:
            try:
                entities = list(nlp_pipe(text, **kwargs))
            except TypeError:
                continue  # keyword not supported: retry with fewer keywords
            except Exception:
                # Best effort by design: one bad note must not abort the batch.
                entities = []
            break
        results.append(entities)
    return results

# Run entity extraction over every note text and store the per-note result
# in a new 'entities' column.
clinical_notes_df['entities'] = extract_entities(
    texts=clinical_notes_df['note_text'].tolist(),
    nlp_pipe=ner_pipe,
    aggregation_strategy=aggregation_strategy,
    max_length=max_length,
)

def display_entities(df: pd.DataFrame, limit: int = 5):
    """Render the first ``limit`` notes and their recognized entities as Markdown.

    Args:
        df: Frame with a 'note_text' column and an 'entities' column holding a
            list of entity dicts per row; 'note_id' is used in the heading when
            present.
        limit: Number of leading rows to render.

    Each entity dict is read with ``.get``: the recognized text comes from
    'word' (falling back to 'entity_group'), the label from 'entity_group'
    (falling back to 'entity'), and 'score' is shown when present.
    """
    for position, (_, row) in enumerate(df.head(limit).iterrows(), start=1):
        raw_entities = row.get('entities')
        # A missing or unpopulated cell (None/NaN) is treated as "no entities".
        entities = raw_entities if isinstance(raw_entities, (list, tuple)) else []

        entity_lines = []
        for ent in entities:
            recognized = ent.get('word') or ent.get('entity_group')
            label = ent.get('entity_group') or ent.get('entity')
            score = ent.get('score')
            line = f"- **{label}**: {recognized}"
            if score is not None:
                line += f" (score: {float(score):.2f})"
            entity_lines.append(line)
        entities_md = '\n'.join(entity_lines) if entity_lines else '*No entities recognized.*'

        note_text = str(row.get('note_text', ''))
        # Fall back to the raw text when wrapping yields nothing.
        wrapped = text_wrap(note_text) or note_text
        quoted_note = '\n'.join(f"> {line}" for line in wrapped.splitlines())

        display(Markdown(
            f"**Note {row.get('note_id', position)}**\n\n"
            f"{quoted_note}\n\n"
            f"**Entities:**\n\n{entities_md}"
        ))

def text_wrap(text: str, width: int = 80) -> str:
    """Word-wrap ``text`` so that no line exceeds ``width`` characters.

    Args:
        text: The string to wrap.
        width: Maximum line length passed to ``textwrap.wrap``.

    Returns:
        The wrapped lines joined with newline characters; an empty string when
        ``text`` contains nothing to wrap.
    """
    import textwrap  # local import keeps this utility self-contained

    return '\n'.join(textwrap.wrap(text, width=width))

# The header uses an explicit "\n" escape; a quoted string may not span
# physical lines.
display(Markdown('---\n#### Entity Recognition Results (First 5 Notes):'))
display_entities(clinical_notes_df, limit=5)

# --- 6. Experimentation: Try Swapping Models or Parameters ---
# You can rerun the pipeline with different 'available_models', 'aggregation_strategy', or 'max_length'.
# For demonstration, let's run with a general-domain model for comparison:

other_model_key = 'distilbert-base-uncased-finetuned-ner'
other_model_info = available_models[other_model_key]
print(f"\nComparison: Running with control model: {other_model_info['model_name']}")
other_ner_pipe = pipeline(
    'ner',
    model=other_model_info['model_name'],
    tokenizer=other_model_info['model_name'],
    aggregation_strategy=aggregation_strategy,
)
# Same notes, same parameters, different model: results go to a separate column.
clinical_notes_df['entities_control'] = extract_entities(
    clinical_notes_df['note_text'].tolist(),
    other_ner_pipe,
    aggregation_strategy=aggregation_strategy,
    max_length=max_length
)

def compare_entities(df: pd.DataFrame, limit: int = 5):
    """Render, per note, the entities from the domain model and the control model.

    Args:
        df: Frame with 'note_text', 'entities' (domain-specific model output)
            and 'entities_control' (general model output) columns; 'note_id'
            is used in the heading when present.
        limit: Number of leading rows to render.

    An entity list that is missing or empty is shown as ``*None*``.
    """
    def _as_markdown(entities) -> str:
        # Missing cells (None/NaN) and empty lists both render as '*None*'.
        if not isinstance(entities, (list, tuple)) or not entities:
            return '*None*'
        return '\n'.join(
            f"- **{ent.get('entity_group') or ent.get('entity')}**: {ent.get('word')}"
            for ent in entities
        )

    for position, (_, row) in enumerate(df.head(limit).iterrows(), start=1):
        note_text = str(row.get('note_text', ''))
        # Fall back to the raw text when wrapping yields nothing.
        wrapped = text_wrap(note_text) or note_text
        quoted_note = '\n'.join(f"> {line}" for line in wrapped.splitlines())

        display(Markdown(
            f"**Note {row.get('note_id', position)}**\n\n"
            f"{quoted_note}\n\n"
            f"**Domain-specific model:**\n\n{_as_markdown(row.get('entities'))}\n\n"
            f"**Control (general) model:**\n\n{_as_markdown(row.get('entities_control'))}"
        ))

# Render the side-by-side comparison for the first five notes.
display(Markdown('### Model Comparison: Clinical vs General NER Results'))
compare_entities(clinical_notes_df, limit=5)

# --- 7. Documentation: Experimentation & Model Performance Notes ---

# Static Markdown write-up shown at the end of the cell; it is fixed text and
# is not derived from the results computed above.
experiment_notes = '''
## Experiment Notes: Rapid NLP Pipeline Prototyping

- **Model Selection Impact:**
    - Domain-specific models (e.g., Bio_ClinicalBERT, BioBERT-NER) tend to extract medical concepts (diagnoses, symptoms, medications) more accurately and with appropriate labeling, compared to general NER baselines.
    - General models (like distilbert-base-uncased-finetuned-ner) primarily label entities as PERSON/ORG/LOC/MISC; clinical details may be missed or mis-labeled.

- **Parameter Choices:**
    - `aggregation_strategy` controls entity grouping and can affect granularity. For short input, 'simple' works well; 'none' returns more verbose outputs.
    - `max_length` impacts handling of long clinical notes. For very long notes, split into sentences or adjust max_length accordingly.
    - Some biomedical NER models do not support aggregation_strategy or may have different output formats (check documentation as models evolve).

- **Observations:**
    - The reusable, parameterized pipeline structure allows rapid swapping of models and tuning for best entity extraction quality.
    - Differences in recognized entities and label types are evident when changing from a domain-specific to a general-purpose model.
    - For production, prefer clinical-domain models when available, and always evaluate model performance using a labeled test set.
'''
display(Markdown(experiment_notes))


Analyze and Visualize Synthetic Data Generated via Reusable Notebook

In [None]:
# Synthetic Data Generation, Visualization, and Privacy/Fidelity Analysis for Healthcare

# --- 1. Setup: Imports & Environment ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown
import random

sns.set_theme(style="whitegrid")

# --- 2. Select Data Type to Generate (Tabular or Textual) ---

def select_data_type():
    """Ask which kind of synthetic data to generate.

    Returns:
        '1' for tabular synthetic patient data or '2' for synthetic clinical
        notes. Falls back to '1' when no input can be read (non-interactive
        execution) or when the answer is neither '1' nor '2'.
    """
    print('Select the type of synthetic data to generate:')
    print('  [1] Tabular synthetic patient data')
    print('  [2] Synthetic clinical notes')
    try:
        answer = input('Enter 1 or 2 [default: 1]: ').strip()
    except (EOFError, OSError, RuntimeError):
        # No usable stdin (batch run, captured or closed input stream).
        return '1'
    return answer if answer in ('1', '2') else '1'

# In notebooks, we can gracefully default to tabular data for reproducibility
try:
    data_type_choice = select_data_type()
except Exception:
    # Any failure to obtain a choice (e.g. non-interactive run) means tabular.
    data_type_choice = '1'

# Anything other than a recognised option also falls back to tabular, so the
# branches below never see None or an unexpected value.
if data_type_choice not in ('1', '2'):
    data_type_choice = '1'

# --- 3A. Generate Synthetic Tabular Healthcare Data ---
def generate_synthetic_tabular(n_samples: int = 300, random_seed: int = 42) -> pd.DataFrame:
    """Generate a reproducible synthetic tabular healthcare dataset.

    Args:
        n_samples: Number of patient rows to generate.
        random_seed: Seed applied to both the ``random`` module and NumPy's
            global generator before sampling.

    Returns:
        DataFrame with columns patient_id, age, sex, admission_type,
        systolic_bp, diastolic_bp, glucose, length_of_stay, discharge_status.

    Note:
        Seeding mutates the global ``random`` / ``numpy.random`` state.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    data = {
        'patient_id': np.arange(1, n_samples + 1),
        # Clip before the integer cast so ages stay within 0-100.
        'age': np.clip(np.random.normal(loc=57, scale=17, size=n_samples), 0, 100).astype(int),
        'sex': np.random.choice(['Male', 'Female'], size=n_samples),
        'admission_type': np.random.choice(
            ['Emergency', 'Elective', 'Urgent'],
            size=n_samples, p=[0.5, 0.3, 0.2]
        ),
        'systolic_bp': np.random.normal(loc=128, scale=16, size=n_samples),
        'diastolic_bp': np.random.normal(loc=77, scale=9, size=n_samples),
        'glucose': np.random.normal(loc=105, scale=21, size=n_samples),
        # Absolute value keeps the length of stay non-negative.
        'length_of_stay': np.abs(np.random.normal(loc=5.5, scale=3.1, size=n_samples)),
        'discharge_status': np.random.choice(
            ['Home', 'Transferred', 'Deceased'],
            size=n_samples, p=[0.82, 0.14, 0.04]
        ),
    }
    df = pd.DataFrame(data)

    # One decimal place for the continuous measurements.
    return df.round({'systolic_bp': 1, 'diastolic_bp': 1, 'glucose': 1, 'length_of_stay': 1})

# --- 3B. Generate Synthetic Clinical Notes (Textual) ---
def generate_synthetic_notes(n_samples: int = 10, random_seed: int = 42) -> pd.DataFrame:
    """Generate reproducible synthetic clinical notes from fixed templates.

    Args:
        n_samples: Number of notes to generate.
        random_seed: Seed applied to both the ``random`` module and NumPy's
            global generator before sampling.

    Returns:
        DataFrame with 'note_id' (1-based sequential integers) and 'note_text'.

    Note:
        Seeding mutates the global ``random`` / ``numpy.random`` state.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    diseases = ['hypertension', 'type 2 diabetes mellitus', 'asthma',
                'chronic kidney disease', 'atrial fibrillation', 'COPD']
    medications = ['metformin', 'lisinopril', 'atorvastatin',
                   'albuterol', 'metoprolol', 'aspirin']
    findings = ['chest pain', 'shortness of breath', 'persistent cough',
                'fatigue', 'lower extremity edema', 'dizziness']
    diagnostics = ['chest X-ray', 'ECG', 'complete blood count',
                   'basic metabolic panel', 'echocardiogram']
    templates = [
        "Patient presents with [finding] and [finding2]. History of [disease1] and [disease2]. "
        "Currently taking [med1] and [med2]. Plan: [diagnostic].",
        "Follow-up visit for [disease1]. Patient reports [finding]. "
        "Continue [med1]; results of {diagnostic} pending.",
        "Admitted with [finding]. Past medical history includes [disease1] and [disease2]. "
        "Started on [med1]. Ordered [diagnostic].",
        "Patient denies [finding] but notes [finding2]. Known [disease1], managed with [med1] and [med2]. "
        "Recent {diagnostic} reviewed.",
    ]

    notes = []
    for _ in range(n_samples):
        note = random.choice(templates)
        # Draw pairs without replacement so one note never repeats an item.
        disease1, disease2 = random.sample(diseases, 2)
        med1, med2 = random.sample(medications, 2)
        finding1, finding2 = random.sample(findings, 2)
        diagnostic = random.choice(diagnostics)
        substitutions = {
            '[finding]': finding1,
            '[finding2]': finding2,
            '[disease1]': disease1,
            '[disease2]': disease2,
            '[med1]': med1,
            '[med2]': med2,
            '[diagnostic]': diagnostic,
            '{diagnostic}': diagnostic,
        }
        for placeholder, value in substitutions.items():
            note = note.replace(placeholder, value)
        notes.append(note)

    return pd.DataFrame({'note_id': list(range(1, len(notes) + 1)), 'note_text': notes})

# --- 4. Load or Generate (Optionally Also Load 'real' Data for Comparison) ---
# Generate the synthetic data selected by 'data_type_choice' and preview it.
# For tabular data, 'health_df' (if it exists in this session) serves as the
# 'real' reference; otherwise a second synthetic set with a different seed does.
if data_type_choice == '1':
    # Tabular synthetic data
    synthetic_df = generate_synthetic_tabular()
    try:
        real_df = health_df.copy()
    except (NameError, AttributeError):
        # No usable dataset from a previous activity: use a differently seeded set.
        real_df = generate_synthetic_tabular(random_seed=7)
    preview_heading = '### Tabular Synthetic Healthcare Data (*first 6 rows*)'
    preview_rows = 6
else:
    # Synthetic textual data
    synthetic_df = generate_synthetic_notes()
    preview_heading = '### Synthetic Clinical Notes (*first 5*)'
    preview_rows = 5

# Skip the preview when the generator produced nothing.
if synthetic_df is not None:
    display(Markdown(preview_heading))
    display(synthetic_df.head(preview_rows))

# --- 5. Visualize and Compare Distributions (Tabular Data) ---

def plot_distribution_compare(syn: pd.DataFrame, real: pd.DataFrame, column: str, bins: int = 20, title: str = None):
    """Overlay density histograms (with KDE curves) of one column for real vs synthetic data.

    Args:
        syn: Synthetic dataset containing ``column``.
        real: Reference ('real') dataset containing ``column``.
        column: Name of the numeric column to compare.
        bins: Number of histogram bins used for both datasets.
        title: Plot title; a default naming the column is used when omitted.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # stat='density' puts both datasets on a common scale regardless of size.
    sns.histplot(real[column].dropna(), bins=bins, stat='density', kde=True,
                 color='steelblue', alpha=0.5, label='Real', ax=ax)
    sns.histplot(syn[column].dropna(), bins=bins, stat='density', kde=True,
                 color='darkorange', alpha=0.5, label='Synthetic', ax=ax)
    ax.set_title(title or f'Distribution of {column}: Real vs Synthetic')
    ax.set_xlabel(column)
    ax.set_ylabel('Density')
    ax.legend()
    fig.tight_layout()
    plt.show()

def plot_categorical_compare(syn: pd.DataFrame, real: pd.DataFrame, column: str, title: str = None):
    """Plot side-by-side bars of category proportions for real vs synthetic data.

    Args:
        syn: Synthetic dataset containing ``column``.
        real: Reference ('real') dataset containing ``column``.
        column: Name of the categorical column to compare.
        title: Plot title; a default naming the column is used when omitted.
    """
    real_share = real[column].value_counts(normalize=True)
    syn_share = syn[column].value_counts(normalize=True)
    # Use the union of categories so a level missing from one side shows as 0.
    categories = sorted(set(real_share.index) | set(syn_share.index), key=str)
    positions = np.arange(len(categories))
    bar_width = 0.4

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(positions - bar_width / 2,
           real_share.reindex(categories, fill_value=0.0).to_numpy(),
           width=bar_width, color='steelblue', label='Real')
    ax.bar(positions + bar_width / 2,
           syn_share.reindex(categories, fill_value=0.0).to_numpy(),
           width=bar_width, color='darkorange', label='Synthetic')
    ax.set_xticks(positions)
    ax.set_xticklabels([str(category) for category in categories])
    ax.set_xlabel(column)
    ax.set_ylabel('Proportion')
    ax.set_title(title or f'Category proportions of {column}: Real vs Synthetic')
    ax.legend()
    fig.tight_layout()
    plt.show()

# Placeholder: this branch currently does nothing when tabular data is selected.
if data_type_choice == '1':
    # INSTRUCTION (not yet implemented):
    # 1. For each continuous column (e.g., 'age', 'systolic_bp', 'diastolic_bp', 'glucose', 'length_of_stay'),
    #    call plot_distribution_compare on synthetic_df and real_df.
    # 2. For each categorical column (e.g., 'sex', 'admission_type', 'discharge_status'),
    #    call plot_categorical_compare.
    # NOTE(review): when real_df is a copy of the EDA 'health_df', its column is
    # named 'gender', not 'sex' -- reconcile the names before comparing.
    pass

# --- 6. NLP and Statistics for Synthetic Textual Data ---
# Placeholder: this branch currently does nothing when textual data is selected.
if data_type_choice == '2':
    # INSTRUCTION (not yet implemented):
    # 1. Add columns to 'synthetic_df' for the number of words and the number of characters in each note.
    # 2. Display basic descriptive statistics for these columns using pandas .describe().
    # 3. To measure text diversity:
    #    a. Concatenate all note texts into a single string and split it into tokens (words).
    #    b. Calculate the unique vocabulary size (set size) and the most common n words using collections.Counter.
    #    c. Display these statistics via Markdown display.
    # 4. Optionally, plot a histogram of note lengths (number of words) using seaborn.
    pass

# --- 7. Privacy & Utility Discussion (Markdown cell) ---
# Static Markdown discussion rendered at the end of the cell; it is fixed text
# and is not derived from the data generated above.
report_md = '''
## Utility and Privacy of Generated Synthetic Data

- **Utility:**
    - Synthetic tabular data imitates real patient distributions and enables rapid prototyping for research, visualization, or algorithmic testing without patient privacy risk.
    - The synthetic clinical notes reflect plausible combinations of medical concepts, supporting NLP model testing and iterative prompt engineering.
    - Visual comparison demonstrates good alignment in univariate statistics; for deeper fidelity, advanced methods (e.g. GANs, copulas, or language models) may be considered.

- **Privacy:**
    - All data is programmatically generated, guaranteeing that no individually-identifying patient information is present.
    - Distributional or summary-level attacks are not meaningful (there is no one-to-one mapping with actual patients).
    - For deployment in sensitive settings, privacy-preserving approaches (differential privacy, noise injection, output audits) can further strengthen guarantees.

**Conclusion:**

This notebook supports safe, rapid synthetic healthcare data prototyping, visualization, and model pipeline validation, with explicit separation from true patient data. Researchers should always validate downstream analysis pipelines for generalizability beyond synthetic benchmarks.
'''
display(Markdown(report_md))


Perform Exploratory Data Analysis (EDA) Using Matplotlib and Seaborn

In [None]:
# Exploratory Data Analysis on Healthcare Datasets

# In this notebook, you will perform exploratory data analysis (EDA) on a structured healthcare dataset.
# You should use matplotlib and seaborn to visualize and interpret key aspects, focusing on trends and potential anomalies.

# --- 1. Imports & Setup ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Inline plotting for Jupyter Notebooks
# (If running in Jupyter, uncomment the next line)
# %matplotlib inline  # noqa: E402,F821

# Set the seaborn theme for plots
sns.set_theme(style="whitegrid")

# --- 2. Data Loading ---
# For demonstration purposes, this code block creates a synthetic healthcare dataset with common columns.
# You should create or replace this data loading section with your own dataset loading logic (using pd.read_csv or similar) if available.
# Column generators for a 300-row demo dataset. The dict is evaluated top to
# bottom, so the random draws happen in the order the columns are listed.
data = {
    'patient_id': range(1, 301),
    'age': np.random.normal(55, 18, 300).astype(int),
    'gender': np.random.choice(['Male', 'Female'], size=300),
    'admission_type': np.random.choice(
        ['Emergency', 'Elective', 'Urgent'], size=300, p=[0.5, 0.3, 0.2]),
    'systolic_bp': np.random.normal(130, 15, 300),
    'diastolic_bp': np.random.normal(80, 8, 300),
    'glucose': np.random.normal(100, 25, 300),
    'length_of_stay': np.abs(np.random.normal(6, 3, 300)),
    'discharge_status': np.random.choice(
        ['Home', 'Transferred', 'Deceased'], size=300, p=[0.8, 0.16, 0.04]),
}

# Build the frame, keep ages within 0-100, and round the continuous
# measurements to one decimal place.
health_df = (
    pd.DataFrame(data)
    .assign(age=lambda frame: frame['age'].clip(lower=0, upper=100))
    .round({'glucose': 1, 'systolic_bp': 1, 'diastolic_bp': 1, 'length_of_stay': 1})
)

# --- 3. Preview & Summary ---
# You should preview the first few rows and summary of the dataframe to get an overview of the data.
# 1. Display the first 5 rows of health_df to get a sense of its structure.
#    Use: health_df.head()
# 2. Print the shape of the dataframe to know the number of records and features.
#    Use: health_df.shape
# 3. Show descriptive statistics (count, mean, std, min, max, etc.) for all columns, including categorical ones.
#    Use: health_df.describe(include='all')
# 4. (If using Jupyter) You may use display() from IPython.display for nicely formatted output.
#    Example: from IPython.display import display
#             display(health_df.head())
#
# Implement the above steps in this section to gain basic insights into the dataset before proceeding with deeper EDA.


Experiment with Reusable NLP Pipeline Notebook for Clinical Notes

In [None]:
# --- Reusable NLP Pipeline for Clinical Text Understanding ---
#
# This notebook demonstrates rapid experimentation with transformer-based NLP models for
# clinical named entity recognition (NER) and information extraction. You can select different models, adjust pipeline parameters,
# and observe effects on medical entity extraction (diagnoses, symptoms, medications, etc).
#
# --- 1. Setup & Imports ---

import os
import random
import numpy as np
import pandas as pd
from typing import List, Dict
from IPython.display import display, Markdown

import warnings
warnings.filterwarnings('ignore')

# Hugging Face transformers for modern NER
from transformers import pipeline

# --- 2. Load Sample, De-identified Clinical Notes ---
# For demonstration, we'll use synthetic, de-identified clinical text snippets.
# In practice, replace 'clinical_notes_df' with your real data (ensuring all PHI is properly handled).

# Synthetic, de-identified notes (no real patient data). An empty list here
# would leave every downstream display and comparison with nothing to show.
sample_notes = [
    "Patient presents with chest pain and shortness of breath. History of hypertension "
    "and type 2 diabetes mellitus. Started on aspirin and metoprolol; ECG ordered.",
    "Follow-up visit for asthma. Reports intermittent wheezing and nocturnal cough. "
    "Continue albuterol inhaler as needed and add inhaled fluticasone.",
    "Admitted with fever, productive cough, and pleuritic pain. Chest X-ray shows right "
    "lower lobe pneumonia. Treated with ceftriaxone and azithromycin.",
    "Patient with chronic kidney disease reports fatigue and lower extremity edema. "
    "Furosemide dose increased; basic metabolic panel to be repeated in one week.",
    "Seen for migraine with nausea and photophobia. No focal neurological deficits. "
    "Prescribed sumatriptan and advised to keep a headache diary.",
]
clinical_notes_df = pd.DataFrame({'note_id': range(1, len(sample_notes)+1), 'note_text': sample_notes})

display(Markdown('### Sample De-identified Clinical Notes'))
display(clinical_notes_df)

# --- 3. NLP Model Selection ---
# Define several transformer-based NER pipelines (from HuggingFace Hub), suitable for clinical/biomedical entity extraction.
# You may experiment with different models for comparison.

# Registry of candidate models, keyed by model identifier. Each entry carries:
#   'description'  - short human-readable summary printed in the listing below
#   'model_name'   - identifier passed to transformers.pipeline(model=..., tokenizer=...)
#   'entity_types' - free-text note on the labels the model is expected to emit
# NOTE(review): the identifiers are presumably Hugging Face Hub ids -- verify
# that each one exists and ships a token-classification head before relying on it.
available_models = {
    'distilbert-base-uncased-finetuned-ner': {
        'description': 'General-purpose NER (baseline)',
        'model_name': 'distilbert-base-uncased-finetuned-ner',
        'entity_types': 'person, org, loc, misc (general NER; use as control)'},
    'dslim/bert-base-NER': {
        'description': 'BERT for general NER (benchmark)',
        'model_name': 'dslim/bert-base-NER',
        'entity_types': 'person, org, loc, misc (general NER; control)'},
    'emilyalsentzer/Bio_ClinicalBERT': {
        'description': 'Bio_ClinicalBERT: Clinical/biomedical text',
        'model_name': 'emilyalsentzer/Bio_ClinicalBERT',
        'entity_types': 'biomedical: diseases, symptoms, medications (may require fine-tuning, demo only)'},
    'kamalkraj/BioBERT-NER': {
        'description': 'BioBERT NER (biomedical baseline)',
        'model_name': 'kamalkraj/BioBERT-NER',
        'entity_types': 'biomedical: diseases, chemicals, genes, symptoms'},
    # Add more domain-specific models as desired
}

# List every registered model with the index used to select it below.
print('Available models:')
for idx, (k, v) in enumerate(available_models.items()):
    print(f"[{idx}] {k} - {v['description']} ({v['entity_types']})")
# You may modify here to experiment
model_keys = list(available_models.keys())
model_index = 2  # Default: use 'emilyalsentzer/Bio_ClinicalBERT' (or change to try others)
model_choice = model_keys[model_index]
model_info = available_models[model_choice]

# The message is built from adjacent f-strings with explicit "\n" escapes;
# a quoted string may not span physical lines.
print(
    f"\nSelected model: {model_info['model_name']}\n"
    f"  Description: {model_info['description']}\n"
    f"  Entity Types: {model_info['entity_types']}"
)

# --- 4. Build the Pipeline ---

# Parameters to experiment with (defined before the pipeline so the pipeline
# is built with the same strategy that is later passed to extract_entities).
aggregation_strategy = 'simple'  # Try 'none', 'first', 'average', 'simple' (see documentation)
max_length = 256  # Adjust as needed for context window (esp. for long clinical notes)

# Instantiate the NER pipeline with chosen model and tokenizer.
ner_pipe = pipeline(
    'ner',
    model=model_info['model_name'],
    tokenizer=model_info['model_name'],
    aggregation_strategy=aggregation_strategy,
)

# --- 5. Run Entity Extraction Pipeline ---
def extract_entities(texts: List[str],
                    nlp_pipe,
                    aggregation_strategy: str = 'simple',
                    max_length: int = 256) -> List[List[Dict]]:
    """Apply an NER pipeline to each text and collect the entities it returns.

    Args:
        texts: Input strings, processed one at a time in the given order.
        nlp_pipe: Callable invoked as ``nlp_pipe(text, aggregation_strategy=...,
            truncation=True, max_length=...)``. Presumably a Hugging Face NER
            pipeline; confirm that it accepts these keyword arguments at call time.
        aggregation_strategy: Forwarded unchanged to ``nlp_pipe``.
        max_length: Forwarded unchanged to ``nlp_pipe``.

    Returns:
        One entry per input text, in the same order. A text whose prediction
        raises (or yields ``None``) contributes an empty list.
    """
    results: List[List[Dict]] = []
    for text in texts:
        try:
            entities = nlp_pipe(
                text,
                aggregation_strategy=aggregation_strategy,
                truncation=True,
                max_length=max_length,
            )
        except Exception:
            # Deliberate best-effort: one failing note must not abort the batch.
            entities = []
        results.append(entities if entities is not None else [])
    return results

# Run the selected pipeline over every note and store the per-note result in
# the 'entities' column (same row order as 'note_text').
clinical_notes_df['entities'] = extract_entities(
    clinical_notes_df['note_text'].tolist(),
    ner_pipe,
    aggregation_strategy=aggregation_strategy,
    max_length=max_length,
)


def display_entities(df: pd.DataFrame, limit: int = 5):
    """Render the first ``limit`` notes and their recognized entities as Markdown.

    Args:
        df: Frame with a 'note_text' column and an 'entities' column holding a
            list of entity dicts per row; 'note_id' is used in the heading when
            present.
        limit: Number of leading rows to render.

    Each entity dict is read with ``.get``: the recognized text comes from
    'word' (falling back to 'entity_group'), the label from 'entity_group'
    (falling back to 'entity'), and 'score' is shown when present.
    """
    for position, (_, row) in enumerate(df.head(limit).iterrows(), start=1):
        raw_entities = row.get('entities')
        # A missing or unpopulated cell (None/NaN) is treated as "no entities".
        entities = raw_entities if isinstance(raw_entities, (list, tuple)) else []

        entity_lines = []
        for ent in entities:
            recognized = ent.get('word') or ent.get('entity_group')
            label = ent.get('entity_group') or ent.get('entity')
            score = ent.get('score')
            line = f"- **{label}**: {recognized}"
            if score is not None:
                line += f" (score: {float(score):.2f})"
            entity_lines.append(line)
        entities_md = '\n'.join(entity_lines) if entity_lines else '*No entities recognized.*'

        note_text = str(row.get('note_text', ''))
        # Fall back to the raw text when wrapping yields nothing.
        wrapped = text_wrap(note_text) or note_text
        quoted_note = '\n'.join(f"> {line}" for line in wrapped.splitlines())

        display(Markdown(
            f"**Note {row.get('note_id', position)}**\n\n"
            f"{quoted_note}\n\n"
            f"**Entities:**\n\n{entities_md}"
        ))


def text_wrap(text: str, width: int = 80) -> str:
    """Word-wrap ``text`` so that no line exceeds ``width`` characters.

    Args:
        text: The string to wrap.
        width: Maximum line length passed to ``textwrap.wrap``.

    Returns:
        The wrapped lines joined with newline characters; an empty string when
        ``text`` contains nothing to wrap.
    """
    import textwrap  # local import keeps this utility self-contained

    return '\n'.join(textwrap.wrap(text, width=width))

# Display entity recognition results (first 5 notes).
# The header uses an explicit "\n" escape; a quoted string may not span
# physical lines.
display(Markdown('---\n#### Entity Recognition Results (First 5 Notes):'))
display_entities(clinical_notes_df, limit=5)

# --- 6. Experimentation: Try Swapping Models or Parameters ---
# You can rerun the pipeline with different 'available_models', 'aggregation_strategy', or 'max_length'.
# For demonstration, let's run with a general-domain model for comparison:

other_model_key = 'distilbert-base-uncased-finetuned-ner'
other_model_info = available_models[other_model_key]
print(f"\nComparison: Running with control model: {other_model_info['model_name']}")
# Instantiate the comparison (general) NER pipeline
other_ner_pipe = pipeline(
    'ner',
    model=other_model_info['model_name'],
    tokenizer=other_model_info['model_name'],
    aggregation_strategy=aggregation_strategy,
)
# Same notes, same parameters, different model: results go to a separate column.
clinical_notes_df['entities_control'] = extract_entities(
    clinical_notes_df['note_text'].tolist(),
    other_ner_pipe,
    aggregation_strategy=aggregation_strategy,
    max_length=max_length,
)


def compare_entities(df: pd.DataFrame, limit: int = 5):
    """Render, per note, the entities from the domain model and the control model.

    Args:
        df: Frame with 'note_text', 'entities' (domain-specific model output)
            and 'entities_control' (general model output) columns; 'note_id'
            is used in the heading when present.
        limit: Number of leading rows to render.

    An entity list that is missing or empty is shown as ``*None*``.
    """
    def _as_markdown(entities) -> str:
        # Missing cells (None/NaN) and empty lists both render as '*None*'.
        if not isinstance(entities, (list, tuple)) or not entities:
            return '*None*'
        return '\n'.join(
            f"- **{ent.get('entity_group') or ent.get('entity')}**: {ent.get('word')}"
            for ent in entities
        )

    for position, (_, row) in enumerate(df.head(limit).iterrows(), start=1):
        note_text = str(row.get('note_text', ''))
        # Fall back to the raw text when wrapping yields nothing.
        wrapped = text_wrap(note_text) or note_text
        quoted_note = '\n'.join(f"> {line}" for line in wrapped.splitlines())

        display(Markdown(
            f"**Note {row.get('note_id', position)}**\n\n"
            f"{quoted_note}\n\n"
            f"**Domain-specific model:**\n\n{_as_markdown(row.get('entities'))}\n\n"
            f"**Control (general) model:**\n\n{_as_markdown(row.get('entities_control'))}"
        ))

# Display comparison results: render the side-by-side view for the first five notes.
display(Markdown('### Model Comparison: Clinical vs General NER Results'))
compare_entities(clinical_notes_df, limit=5)

# --- 7. Documentation: Experimentation & Model Performance Notes ---

# Static Markdown write-up shown at the end of the cell; it is fixed text and
# is not derived from the results computed above.
experiment_notes = '''
## Experiment Notes: Rapid NLP Pipeline Prototyping

- **Model Selection Impact:**
    - Domain-specific models (e.g., Bio_ClinicalBERT, BioBERT-NER) tend to extract medical concepts (diagnoses, symptoms, medications) more accurately and with appropriate labeling, compared to general NER baselines.
    - General models (like distilbert-base-uncased-finetuned-ner) primarily label entities as PERSON/ORG/LOC/MISC; clinical details may be missed or mis-labeled.

- **Parameter Choices:**
    - `aggregation_strategy` controls entity grouping and can affect granularity. For short input, 'simple' works well; 'none' returns more verbose outputs.
    - `max_length` impacts handling of long clinical notes. For very long notes, split into sentences or adjust max_length accordingly.
    - Some biomedical NER models do not support aggregation_strategy or may have different output formats (check documentation as models evolve).

- **Observations:**
    - The reusable, parameterized pipeline structure allows rapid swapping of models and tuning for best entity extraction quality.
    - Differences in recognized entities and label types are evident when changing from a domain-specific to a general-purpose model.
    - For production, prefer clinical-domain models when available, and always evaluate model performance using a labeled test set.
'''
display(Markdown(experiment_notes))


Analyze and Visualize Synthetic Data Generated via Reusable Notebook

In [None]:
# Synthetic Data Generation, Visualization, and Privacy/Fidelity Analysis for Healthcare

# --- 1. Setup: Imports & Environment ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown
import random

sns.set_theme(style="whitegrid")

# --- 2. Select Data Type to Generate (Tabular or Textual) ---

def select_data_type():
    """Ask which kind of synthetic data to generate.

    Returns:
        '1' for tabular synthetic patient data or '2' for synthetic clinical
        notes. Falls back to '1' when no input can be read (non-interactive
        execution) or when the answer is neither '1' nor '2'.
    """
    print('Select the type of synthetic data to generate:')
    print('  [1] Tabular synthetic patient data')
    print('  [2] Synthetic clinical notes')
    try:
        answer = input('Enter 1 or 2 [default: 1]: ').strip()
    except (EOFError, OSError, RuntimeError):
        # No usable stdin (batch run, captured or closed input stream).
        return '1'
    return answer if answer in ('1', '2') else '1'

# In notebooks, we can gracefully default to tabular data for reproducibility
try:
    data_type_choice = select_data_type()
except Exception:
    # Any failure to obtain a choice (e.g. non-interactive run) means tabular.
    data_type_choice = '1'

# Anything other than a recognised option also falls back to tabular, so the
# branches below never see None or an unexpected value.
if data_type_choice not in ('1', '2'):
    data_type_choice = '1'

# --- 3A. Generate Synthetic Tabular Healthcare Data ---
def generate_synthetic_tabular(n_samples: int = 300, random_seed: int = 42) -> pd.DataFrame:
    """Generate a reproducible synthetic tabular healthcare dataset.

    Args:
        n_samples: Number of patient rows to generate.
        random_seed: Seed applied to both the ``random`` module and NumPy's
            global generator before sampling.

    Returns:
        DataFrame with columns patient_id, age, sex, admission_type,
        systolic_bp, diastolic_bp, glucose, length_of_stay, discharge_status.

    Note:
        Seeding mutates the global ``random`` / ``numpy.random`` state.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    data = {
        'patient_id': np.arange(1, n_samples + 1),
        # Clip before the integer cast so ages stay within 0-100.
        'age': np.clip(np.random.normal(loc=57, scale=17, size=n_samples), 0, 100).astype(int),
        'sex': np.random.choice(['Male', 'Female'], size=n_samples),
        'admission_type': np.random.choice(
            ['Emergency', 'Elective', 'Urgent'],
            size=n_samples, p=[0.5, 0.3, 0.2]
        ),
        'systolic_bp': np.random.normal(loc=128, scale=16, size=n_samples),
        'diastolic_bp': np.random.normal(loc=77, scale=9, size=n_samples),
        'glucose': np.random.normal(loc=105, scale=21, size=n_samples),
        # Absolute value keeps the length of stay non-negative.
        'length_of_stay': np.abs(np.random.normal(loc=5.5, scale=3.1, size=n_samples)),
        'discharge_status': np.random.choice(
            ['Home', 'Transferred', 'Deceased'],
            size=n_samples, p=[0.82, 0.14, 0.04]
        ),
    }
    df = pd.DataFrame(data)

    # One decimal place for the continuous measurements.
    return df.round({'systolic_bp': 1, 'diastolic_bp': 1, 'glucose': 1, 'length_of_stay': 1})

# --- 3B. Generate Synthetic Clinical Notes (Textual) ---
def generate_synthetic_notes(n_samples: int = 10, random_seed: int = 42) -> pd.DataFrame:
    """Generate reproducible synthetic clinical notes from fixed templates.

    Args:
        n_samples: Number of notes to generate.
        random_seed: Seed applied to both the ``random`` module and NumPy's
            global generator before sampling.

    Returns:
        DataFrame with 'note_id' (1-based sequential integers) and 'note_text'.

    Note:
        Seeding mutates the global ``random`` / ``numpy.random`` state.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)

    diseases = ['hypertension', 'type 2 diabetes mellitus', 'asthma',
                'chronic kidney disease', 'atrial fibrillation', 'COPD']
    medications = ['metformin', 'lisinopril', 'atorvastatin',
                   'albuterol', 'metoprolol', 'aspirin']
    findings = ['chest pain', 'shortness of breath', 'persistent cough',
                'fatigue', 'lower extremity edema', 'dizziness']
    diagnostics = ['chest X-ray', 'ECG', 'complete blood count',
                   'basic metabolic panel', 'echocardiogram']
    templates = [
        "Patient presents with [finding] and [finding2]. History of [disease1] and [disease2]. "
        "Currently taking [med1] and [med2]. Plan: [diagnostic].",
        "Follow-up visit for [disease1]. Patient reports [finding]. "
        "Continue [med1]; results of {diagnostic} pending.",
        "Admitted with [finding]. Past medical history includes [disease1] and [disease2]. "
        "Started on [med1]. Ordered [diagnostic].",
        "Patient denies [finding] but notes [finding2]. Known [disease1], managed with [med1] and [med2]. "
        "Recent {diagnostic} reviewed.",
    ]

    notes = []
    for _ in range(n_samples):
        note = random.choice(templates)
        # Draw pairs without replacement so one note never repeats an item.
        disease1, disease2 = random.sample(diseases, 2)
        med1, med2 = random.sample(medications, 2)
        finding1, finding2 = random.sample(findings, 2)
        diagnostic = random.choice(diagnostics)
        substitutions = {
            '[finding]': finding1,
            '[finding2]': finding2,
            '[disease1]': disease1,
            '[disease2]': disease2,
            '[med1]': med1,
            '[med2]': med2,
            '[diagnostic]': diagnostic,
            '{diagnostic}': diagnostic,
        }
        for placeholder, value in substitutions.items():
            note = note.replace(placeholder, value)
        notes.append(note)

    return pd.DataFrame({'note_id': list(range(1, len(notes) + 1)), 'note_text': notes})

# --- 4. Load or Generate (Optionally Also Load 'real' Data for Comparison) ---
# For this demonstration, we'll treat the previous EDA dataset as 'real' data for tabular comparison.

if data_type_choice == '1':
    # Tabular synthetic data
    synthetic_df = generate_synthetic_tabular(n_samples=300)
    try:
        real_df = health_df.copy()
    except (NameError, AttributeError):
        # No usable dataset from a previous activity: use a differently seeded set.
        real_df = generate_synthetic_tabular(n_samples=300, random_seed=7)
    preview_heading = '### Tabular Synthetic Healthcare Data (*first 6 rows*)'
    preview_rows = 6
else:
    # Synthetic textual data
    synthetic_df = generate_synthetic_notes(n_samples=8)
    preview_heading = '### Synthetic Clinical Notes (*first 5*)'
    preview_rows = 5

# Skip the preview when the generator produced nothing.
if synthetic_df is not None:
    display(Markdown(preview_heading))
    display(synthetic_df.head(preview_rows))

# --- 5. Visualize and Compare Distributions (Tabular Data) ---

def plot_distribution_compare(syn: pd.DataFrame, real: pd.DataFrame, column: str, bins: int = 20, title: str = None):
    """Overlay density histograms (with KDE curves) of one column for real vs synthetic data.

    Args:
        syn: Synthetic dataset containing ``column``.
        real: Reference ('real') dataset containing ``column``.
        column: Name of the numeric column to compare.
        bins: Number of histogram bins used for both datasets.
        title: Plot title; a default naming the column is used when omitted.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    # stat='density' puts both datasets on a common scale regardless of size.
    sns.histplot(real[column].dropna(), bins=bins, stat='density', kde=True,
                 color='steelblue', alpha=0.5, label='Real', ax=ax)
    sns.histplot(syn[column].dropna(), bins=bins, stat='density', kde=True,
                 color='darkorange', alpha=0.5, label='Synthetic', ax=ax)
    ax.set_title(title or f'Distribution of {column}: Real vs Synthetic')
    ax.set_xlabel(column)
    ax.set_ylabel('Density')
    ax.legend()
    fig.tight_layout()
    plt.show()

def plot_categorical_compare(syn: pd.DataFrame, real: pd.DataFrame, column: str, title: str = None):
    """Plot side-by-side bars of category proportions for real vs synthetic data.

    Args:
        syn: Synthetic dataset containing ``column``.
        real: Reference ('real') dataset containing ``column``.
        column: Name of the categorical column to compare.
        title: Plot title; a default naming the column is used when omitted.
    """
    real_share = real[column].value_counts(normalize=True)
    syn_share = syn[column].value_counts(normalize=True)
    # Use the union of categories so a level missing from one side shows as 0.
    categories = sorted(set(real_share.index) | set(syn_share.index), key=str)
    positions = np.arange(len(categories))
    bar_width = 0.4

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(positions - bar_width / 2,
           real_share.reindex(categories, fill_value=0.0).to_numpy(),
           width=bar_width, color='steelblue', label='Real')
    ax.bar(positions + bar_width / 2,
           syn_share.reindex(categories, fill_value=0.0).to_numpy(),
           width=bar_width, color='darkorange', label='Synthetic')
    ax.set_xticks(positions)
    ax.set_xticklabels([str(category) for category in categories])
    ax.set_xlabel(column)
    ax.set_ylabel('Proportion')
    ax.set_title(title or f'Category proportions of {column}: Real vs Synthetic')
    ax.legend()
    fig.tight_layout()
    plt.show()

# Placeholder: this branch currently does nothing when tabular data is selected.
if data_type_choice == '1':
    # INSTRUCTION (not yet implemented):
    # - For each continuous column (e.g., 'age', 'systolic_bp', 'diastolic_bp', 'glucose', 'length_of_stay'):
    #   - Call plot_distribution_compare with the synthetic and real DataFrames.
    # - For each categorical column (e.g., 'sex', 'admission_type', 'discharge_status'):
    #   - Call plot_categorical_compare with the synthetic and real DataFrames.
    # NOTE(review): when the real data is a copy of the EDA 'health_df', its column
    # is named 'gender', not 'sex' -- reconcile the names before comparing.
    pass

# --- 6. NLP and Statistics for Synthetic Textual Data ---
# Placeholder: this branch currently does nothing when textual data is selected.
if data_type_choice == '2':
    # INSTRUCTION (not yet implemented):
    # - For synthetic_df, compute simple statistics on the text fields:
    #   - Add columns with word counts and character counts per note.
    #   - Display markdown headers and descriptive statistics tables for these fields.
    #   - Calculate overall vocabulary size and the most common words across all notes.
    #   - Display these statistics with markdown.
    #   - Plot a histogram of note lengths (words per note) using sns.histplot.
    pass

# --- 7. Privacy & Utility Discussion (Markdown cell) ---
# Static Markdown discussion rendered at the end of the cell; it is fixed text
# and is not derived from the data generated above.
report_md = '''
## Utility and Privacy of Generated Synthetic Data

- **Utility:**
    - Synthetic tabular data imitates real patient distributions and enables rapid prototyping for research, visualization, or algorithmic testing without patient privacy risk.
    - The synthetic clinical notes reflect plausible combinations of medical concepts, supporting NLP model testing and iterative prompt engineering.
    - Visual comparison demonstrates good alignment in univariate statistics; for deeper fidelity, advanced methods (e.g. GANs, copulas, or language models) may be considered.

- **Privacy:**
    - All data is programmatically generated, guaranteeing that no individually-identifying patient information is present.
    - Distributional or summary-level attacks are not meaningful (there is no one-to-one mapping with actual patients).
    - For deployment in sensitive settings, privacy-preserving approaches (differential privacy, noise injection, output audits) can further strengthen guarantees.

**Conclusion:**

This notebook supports safe, rapid synthetic healthcare data prototyping, visualization, and model pipeline validation, with explicit separation from true patient data. Researchers should always validate downstream analysis pipelines for generalizability beyond synthetic benchmarks.
'''
display(Markdown(report_md))


Integrated Workflow: Rapid Prototyping Chain of EDA → NLP → Synthetic Data in Healthcare

In [None]:
# # Rapid Prototyping: Chained Healthcare AI Workflow (EDA → NLP → Synthetic Data)
# 
# This notebook demonstrates chaining several reusable notebook components for rapid prototyping on healthcare data:
# - Structured EDA
# - Application of clinical NLP (named entity recognition, information extraction)
# - Generation and validation of synthetic data
#
# Your task is to re-implement the logic for each step, leveraging the existing variable names and imported libraries. Replace the logic in each code block below with your implementation following the provided instructions.

from IPython.display import Markdown, display, HTML

# --- 1. Recap: EDA Preview ---
# Render the section heading for step 1 as Markdown. Only the heading is
# emitted here; the preview logic described in the instructions below is
# still to be filled in.
# NOTE: the heading text previously contained mis-encoded bytes. The arrow
# has been restored; the leading emoji could not be recovered and is omitted.
display(Markdown("## Step 1: EDA → Real Healthcare Data Preview"))
# INSTRUCTIONS:
# 1. Display the first 5 rows of the `health_df` DataFrame to preview the data structure.
# 2. Present a descriptive statistical summary (e.g., using `.describe()` with `include='all'`) of `health_df`, showing basic stats and field types.
# 3. If `health_df` is not defined or there is an error, display a Markdown message informing the user to rerun the respective EDA cell.
# (Use `try`/`except` blocks as necessary.)

# --- 2. Recap: Clinical NLP on Example Notes ---
# Render the section heading for step 2 as Markdown. Only the heading is
# emitted here; the entity-listing logic described in the instructions below
# is still to be filled in.
# NOTE: the heading text previously contained mis-encoded bytes. The arrow
# has been restored; the leading emoji could not be recovered and is omitted.
display(Markdown("## Step 2: NLP → Information Extraction from Clinical Notes"))
# INSTRUCTIONS:
# 1. Display the first 3 rows of the `clinical_notes_df` DataFrame to show sample clinical notes.
# 2. For these 3 notes, print extracted named entities found in an 'entities' column:
#    - For each entity, show its label/group, the extracted word(s), and confidence score (if present).
#    - If no entities are found for a note, output a suitable message.
# 3. If `clinical_notes_df` is not available or any error occurs, display a Markdown message to rerun the respective NLP cell.
# (Use iteration, HTML/Markdown display, and proper error handling as shown.)

# --- 3. Recap: Synthetic Data & Distribution Comparison ---
# Render the section heading for step 3 as Markdown. Only the heading is
# emitted here; the preview and histogram logic described in the instructions
# below is still to be filled in.
# NOTE: the heading text previously contained mis-encoded bytes. The arrow
# has been restored; the leading emoji could not be recovered and is omitted.
display(Markdown("## Step 3: Synthetic Data → Generate & Visualize"))
# INSTRUCTIONS:
# 1. Preview the synthetic data by displaying the first 5 rows of `synthetic_df`.
# 2. Create a comparative histogram of a key numeric column (e.g., 'age') in both `health_df` and `synthetic_df` to visualize distribution similarity.
#    - Overlay both histograms with distinct colors and labels, showing densities (not raw counts), and include legend, axis labels, and appropriate title.
# 3. If `synthetic_df` or required features are missing, display a Markdown error message guiding to rerun the synthetic data cell.
# (Use matplotlib/seaborn and handle plot rendering.)

# --- 4. Workflow Summary & Prototype Value (Markdown) ---
# Closing Markdown report for the chained workflow. The text is static (no
# interpolation) and is rendered once by the display() call below.
# NOTE: this string previously contained mis-encoded bytes. The arrows have
# been restored, the quotes around plug-and-play are now plain ASCII quotes,
# and the unrecoverable emoji in the title has been omitted.
summary_md = '''
## Workflow Summary: Rapid AI Healthcare Experimentation

This notebook chains **EDA → Clinical NLP → Synthetic Data Generation** in modular, reusable steps:

- **Exploratory Data Analysis:**
    - Key dataset properties and plausible statistical distributions are surveyed first, identifying data issues and likely modeling features.
- **NLP Pipeline:**
    - Transformer-based entity recognition extracts clinical terms from de-identified text; model swaps and tuning are rapid for improved accuracy.
- **Synthetic Data Generation:**
    - Privacy-compliant, structurally-matched synthetic records are quickly produced and compared distributionally to the original data.

**Benefits:**
- Dramatically cuts iteration times by allowing quick "plug-and-play" experimentation.
- Supports safe algorithm development and sharing of notebooks.
- Every component supports independent reuse/extension for future, more advanced workflows (e.g., federated learning, multimodal fusion).
- Integrated visualizations and markdown provide immediate insight at each stage.

_This approach enabled chaining three standard healthcare pipelines into a single afternoon workflow, with integrated quality checks, privacy compliance, and easy model switching. Suitable for early-stage discovery, regulatory demos, or preliminary collaborative screening of new ideas._
'''
# Render the summary as formatted Markdown at the end of the cell output.
display(Markdown(summary_md))
# INSTRUCTIONS:
# 1. No custom logic is needed here. Ensure `summary_md` Markdown text is displayed at the end, as shown above, to summarize the workflow.
