In [1]:
"""
Healthcare Analytics - Comprehensive Patient Data Analysis
=========================================================

Author: [Your Name]
Date: [Date]
Dataset: 55,500 patient records
Domain: Healthcare Analytics

Modules:
1. Time Series Analysis - Admission patterns and seasonal trends
2. Machine Learning - Length of stay and cost predictions
3. Network Analysis - Healthcare ecosystem relationships
4. Dashboard Creation - Executive and operational dashboards
"""
# Healthcare Dataset Analytics - Advanced Modules (55,500 patients)
# Real Kaggle Dataset: healthcare_dataset.csv
# Modules: Time Series Analysis, Machine Learning, Network Analysis, Dashboard Creation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

print("🏥 HEALTHCARE DATASET ANALYTICS")
print("=" * 70)
print("📊 Dataset: healthcare_dataset.csv (55,500 patients)")
print("🎯 Modules: Time Series, ML, Network Analysis, Dashboards")
print("🔬 Domain: Real-world Hospital Management Analytics")

# ==========================================
# LOADING & EXPLORING REAL HEALTHCARE DATA
# ==========================================

print("\n📊 LOADING REAL HEALTHCARE DATASET")
print("-" * 50)

try:
    # Load the actual healthcare dataset
    df_healthcare = pd.read_csv('healthcare_dataset.csv')
    print("✅ Successfully loaded healthcare_dataset.csv!")
    print(f"📈 Dataset size: {df_healthcare.shape[0]:,} patients, {df_healthcare.shape[1]} columns")
    
except FileNotFoundError:
    print("❌ Error: healthcare_dataset.csv not found!")
    print("Please ensure healthcare_dataset.csv is in the same folder as this script")
    print("You can download it from Kaggle healthcare datasets")
    exit()

# Display basic information about the dataset
print(f"\n🔍 DATASET OVERVIEW:")
print(f"• Total patients: {len(df_healthcare):,}")
print(f"• Columns: {list(df_healthcare.columns)}")

# Clean column names (remove spaces, standardize)
df_healthcare.columns = df_healthcare.columns.str.strip().str.replace(' ', '_').str.lower()
print(f"\n📋 CLEANED COLUMN NAMES:")
print(list(df_healthcare.columns))

# Display first few rows
print(f"\n👀 FIRST 5 PATIENTS:")
print(df_healthcare.head())

# Basic data quality check
print(f"\n🔍 DATA QUALITY OVERVIEW:")
print(f"• Dataset shape: {df_healthcare.shape}")
print(f"• Missing values:")
missing_data = df_healthcare.isnull().sum()
for col, missing in missing_data.items():
    if missing > 0:
        print(f"  - {col}: {missing:,} missing ({missing/len(df_healthcare)*100:.1f}%)")
    
if missing_data.sum() == 0:
    print("  ✅ No missing values found!")

# Basic statistics
print(f"\n📊 BASIC STATISTICS:")
print(f"• Age range: {df_healthcare['age'].min()} - {df_healthcare['age'].max()} years")
print(f"• Average age: {df_healthcare['age'].mean():.1f} years")
print(f"• Gender distribution:")
gender_dist = df_healthcare['gender'].value_counts()
for gender, count in gender_dist.items():
    print(f"  - {gender}: {count:,} ({count/len(df_healthcare)*100:.1f}%)")

print(f"• Medical conditions:")
condition_counts = df_healthcare['medical_condition'].value_counts()
print(f"  - Unique conditions: {len(condition_counts)}")
print(f"  - Most common: {condition_counts.index[0]} ({condition_counts.iloc[0]:,} cases)")

# ==========================================
# MODULE 1: TIME SERIES ANALYSIS
# ==========================================

print(f"\n\n📈 MODULE 1: TIME SERIES ANALYSIS")
print("-" * 50)

print("🕐 ANALYZING HOSPITAL ADMISSIONS OVER TIME")

# Convert date columns to datetime
df_healthcare['date_of_admission'] = pd.to_datetime(df_healthcare['date_of_admission'])
df_healthcare['discharge_date'] = pd.to_datetime(df_healthcare['discharge_date'])

# Calculate length of stay
df_healthcare['length_of_stay'] = (df_healthcare['discharge_date'] - df_healthcare['date_of_admission']).dt.days

# Extract time components
df_healthcare['admission_year'] = df_healthcare['date_of_admission'].dt.year
df_healthcare['admission_month'] = df_healthcare['date_of_admission'].dt.month
df_healthcare['admission_day_of_week'] = df_healthcare['date_of_admission'].dt.day_name()
df_healthcare['admission_quarter'] = df_healthcare['date_of_admission'].dt.quarter

print(f"✅ Processed temporal data")
print(f"📅 Date range: {df_healthcare['date_of_admission'].min().date()} to {df_healthcare['date_of_admission'].max().date()}")
print(f"⏱️ Average length of stay: {df_healthcare['length_of_stay'].mean():.1f} days")
print(f"⏱️ Maximum length of stay: {df_healthcare['length_of_stay'].max()} days")

# Daily admission trends
daily_admissions = df_healthcare.groupby(df_healthcare['date_of_admission'].dt.date).size()
print(f"\n📊 DAILY ADMISSION PATTERNS:")
print(f"• Average daily admissions: {daily_admissions.mean():.1f}")
print(f"• Peak daily admissions: {daily_admissions.max()}")
print(f"• Minimum daily admissions: {daily_admissions.min()}")

# Monthly trends
monthly_admissions = df_healthcare.groupby('admission_month').size()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
print(f"\n📅 MONTHLY ADMISSION PATTERNS:")
for month, count in monthly_admissions.items():
    print(f"• {month_names[month-1]}: {count:,} admissions")

# Day of week patterns
weekly_admissions = df_healthcare['admission_day_of_week'].value_counts()
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
print(f"\n📅 WEEKLY ADMISSION PATTERNS:")
for day in day_order:
    if day in weekly_admissions.index:
        count = weekly_admissions[day]
        print(f"• {day}: {count:,} admissions ({count/len(df_healthcare)*100:.1f}%)")

# Seasonal analysis by medical condition
print(f"\n🌡️ SEASONAL PATTERNS BY MEDICAL CONDITION:")
seasonal_conditions = df_healthcare.groupby(['admission_quarter', 'medical_condition']).size().unstack(fill_value=0)
for condition in seasonal_conditions.columns[:5]:  # Top 5 conditions
    quarter_pattern = seasonal_conditions[condition]
    peak_quarter = quarter_pattern.idxmax()
    print(f"• {condition}: Peak in Q{peak_quarter} ({quarter_pattern[peak_quarter]:,} cases)")


🏥 HEALTHCARE DATASET ANALYTICS - ADVANCED MODULES
📊 Dataset: healthcare_dataset.csv (55,500 patients)
🎯 Modules: Time Series, ML, Network Analysis, Dashboards
🔬 Domain: Real-world Hospital Management Analytics

📊 LOADING REAL HEALTHCARE DATASET
--------------------------------------------------
✅ Successfully loaded healthcare_dataset.csv!
📈 Dataset size: 55,500 patients, 15 columns

🔍 DATASET OVERVIEW:
• Total patients: 55,500
• Columns: ['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider', 'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date', 'Medication', 'Test Results']

📋 CLEANED COLUMN NAMES:
['name', 'age', 'gender', 'blood_type', 'medical_condition', 'date_of_admission', 'doctor', 'hospital', 'insurance_provider', 'billing_amount', 'room_number', 'admission_type', 'discharge_date', 'medication', 'test_results']

👀 FIRST 5 PATIENTS:
            name  age  gender blood_type medical_condition da

In [13]:
# ==========================================
# MODULE 2: MACHINE LEARNING
# ==========================================

print(f"🤖 MODULE 2: MACHINE LEARNING - PREDICTING HEALTHCARE OUTCOMES")
print("-" * 60)

print("🎯 BUILDING PREDICTIVE MODELS FOR HEALTHCARE MANAGEMENT")

# Prepare features for machine learning
print(f"\n🔧 FEATURE ENGINEERING:")

# Create a copy for ML operations
df_ml = df_healthcare.copy()

# Handle potential column name variations and clean data
print(f"Original columns: {list(df_ml.columns)}")

# Ensure we have the required columns
required_columns = ['age', 'gender', 'medical_condition', 'billing_amount', 'length_of_stay']
missing_columns = [col for col in required_columns if col not in df_ml.columns]

if missing_columns:
    print(f"❌ Missing required columns: {missing_columns}")
    print("Available columns:", list(df_ml.columns))
    # Skip ML module if critical columns are missing
    print("⚠️ Skipping ML module due to missing columns")
else:
    # Clean and prepare data
    # Remove any rows with missing critical values
    df_ml = df_ml.dropna(subset=['age', 'billing_amount', 'length_of_stay'])
    
    # Encode categorical variables safely
    label_encoders = {}
    
    # Gender encoding
    if 'gender' in df_ml.columns:
        le_gender = LabelEncoder()
        df_ml['gender_encoded'] = le_gender.fit_transform(df_ml['gender'].astype(str))
        label_encoders['gender'] = le_gender
        print(f"✅ Encoded gender: {len(le_gender.classes_)} unique values")
    
    # Medical condition encoding
    if 'medical_condition' in df_ml.columns:
        le_condition = LabelEncoder()
        df_ml['medical_condition_encoded'] = le_condition.fit_transform(df_ml['medical_condition'].astype(str))
        label_encoders['medical_condition'] = le_condition
        print(f"✅ Encoded medical_condition: {len(le_condition.classes_)} unique values")
    
    # Admission type encoding (if available)
    if 'admission_type' in df_ml.columns:
        le_admission = LabelEncoder()
        df_ml['admission_type_encoded'] = le_admission.fit_transform(df_ml['admission_type'].astype(str))
        label_encoders['admission_type'] = le_admission
        print(f"✅ Encoded admission_type: {len(le_admission.classes_)} unique values")
    
    # Create target variables for different prediction tasks
    print(f"\n📊 CREATING TARGET VARIABLES:")
    
    # 1. Length of Stay Prediction (Classification: Short/Medium/Long stay)
    # Handle edge cases in length of stay
    df_ml['length_of_stay'] = df_ml['length_of_stay'].clip(lower=0, upper=365)  # Cap at 1 year
    
    # Create stay categories with more robust binning
    df_ml['stay_category'] = pd.cut(df_ml['length_of_stay'], 
                                   bins=[-0.1, 3, 7, float('inf')], 
                                   labels=['Short', 'Medium', 'Long'],
                                   include_lowest=True)
    
    # 2. High Cost Prediction (above median billing amount)
    # Clean billing amount data
    df_ml['billing_amount'] = pd.to_numeric(df_ml['billing_amount'], errors='coerce')
    df_ml = df_ml.dropna(subset=['billing_amount'])
    
    median_cost = df_ml['billing_amount'].median()
    df_ml['high_cost'] = (df_ml['billing_amount'] > median_cost).astype(int)
    
    print(f"✅ Length of stay categories:")
    stay_dist = df_ml['stay_category'].value_counts()
    for category, count in stay_dist.items():
        print(f"  - {category}: {count:,} patients ({count/len(df_ml)*100:.1f}%)")
    
    print(f"✅ High cost patients: {df_ml['high_cost'].sum():,} ({df_ml['high_cost'].mean():.1%})")
    print(f"✅ Cost threshold: ${median_cost:,.2f}")
    
    # PREDICTION TASK 1: Length of Stay Prediction
    print(f"\n🎯 PREDICTION TASK 1: LENGTH OF STAY")
    print("-" * 40)
    
    # Select available features for length of stay prediction
    potential_los_features = ['age', 'gender_encoded', 'medical_condition_encoded', 'admission_type_encoded']
    los_features = [f for f in potential_los_features if f in df_ml.columns]
    
    if len(los_features) >= 2:  # Need at least 2 features
        # Prepare data
        X_los = df_ml[los_features].copy()
        y_los = df_ml['stay_category'].copy()
        
        # Remove any remaining NaN values
        mask = ~(X_los.isnull().any(axis=1) | y_los.isnull())
        X_los = X_los[mask]
        y_los = y_los[mask]
        
        print(f"Features used: {los_features}")
        print(f"Sample size: {len(X_los):,} patients")
        
        if len(X_los) > 100:  # Ensure we have enough data
            try:
                # Train-test split
                X_train_los, X_test_los, y_train_los, y_test_los = train_test_split(
                    X_los, y_los, test_size=0.2, random_state=42, stratify=y_los
                )
                
                # Train Random Forest model for length of stay
                rf_los = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=10)
                rf_los.fit(X_train_los, y_train_los)
                
                # Make predictions
                y_pred_los = rf_los.predict(X_test_los)
                accuracy_los = accuracy_score(y_test_los, y_pred_los)
                
                print(f"\n📊 LENGTH OF STAY PREDICTION RESULTS:")
                print(f"• Accuracy: {accuracy_los:.3f}")
                print(f"• Training samples: {len(X_train_los):,}")
                print(f"• Test samples: {len(X_test_los):,}")
                
                # Feature importance
                feature_importance_los = pd.DataFrame({
                    'feature': los_features,
                    'importance': rf_los.feature_importances_
                }).sort_values('importance', ascending=False)
                
                print(f"\n🔍 MOST IMPORTANT FACTORS FOR LENGTH OF STAY:")
                for _, row in feature_importance_los.iterrows():
                    print(f"   {row['feature']:<25}: {row['importance']:.3f}")
                    
            except Exception as e:
                print(f"❌ Error in Length of Stay prediction: {str(e)}")
                print("Continuing with next prediction task...")
        else:
            print("❌ Insufficient data for Length of Stay prediction")
    else:
        print("❌ Insufficient features for Length of Stay prediction")
    
    # PREDICTION TASK 2: High Cost Prediction
    print(f"\n🎯 PREDICTION TASK 2: HIGH COST PREDICTION")
    print("-" * 40)
    
    # Select available features for cost prediction
    potential_cost_features = ['age', 'gender_encoded', 'medical_condition_encoded', 'length_of_stay']
    cost_features = [f for f in potential_cost_features if f in df_ml.columns]
    
    if len(cost_features) >= 2:  # Need at least 2 features
        # Prepare data
        X_cost = df_ml[cost_features].copy()
        y_cost = df_ml['high_cost'].copy()
        
        # Remove NaN values
        mask = ~(X_cost.isnull().any(axis=1) | y_cost.isnull())
        X_cost = X_cost[mask]
        y_cost = y_cost[mask]
        
        print(f"Features used: {cost_features}")
        print(f"Sample size: {len(X_cost):,} patients")
        
        if len(X_cost) > 100:  # Ensure we have enough data
            try:
                # Train-test split
                X_train_cost, X_test_cost, y_train_cost, y_test_cost = train_test_split(
                    X_cost, y_cost, test_size=0.2, random_state=42, stratify=y_cost
                )
                
                # Train Logistic Regression for cost prediction
                scaler = StandardScaler()
                X_train_cost_scaled = scaler.fit_transform(X_train_cost)
                X_test_cost_scaled = scaler.transform(X_test_cost)
                
                lr_cost = LogisticRegression(random_state=42, max_iter=1000)
                lr_cost.fit(X_train_cost_scaled, y_train_cost)
                
                # Make predictions
                y_pred_cost = lr_cost.predict(X_test_cost_scaled)
                y_pred_cost_proba = lr_cost.predict_proba(X_test_cost_scaled)[:, 1]
                accuracy_cost = accuracy_score(y_test_cost, y_pred_cost)
                
                print(f"\n📊 HIGH COST PREDICTION RESULTS:")
                print(f"• Accuracy: {accuracy_cost:.3f}")
                print(f"• High-cost patients identified: {y_pred_cost.sum():,}")
                
                # Safe precision calculation
                true_positives = (y_pred_cost & y_test_cost).sum()
                predicted_positives = y_pred_cost.sum()
                
                if predicted_positives > 0:
                    precision = true_positives / predicted_positives
                    print(f"• Precision: {precision:.3f}")
                else:
                    print(f"• Precision: No positive predictions made")
                    
            except Exception as e:
                print(f"❌ Error in High Cost prediction: {str(e)}")
                print("Model training completed with warnings...")
        else:
            print("❌ Insufficient data for High Cost prediction")
    else:
        print("❌ Insufficient features for High Cost prediction")

🤖 MODULE 2: MACHINE LEARNING - PREDICTING HEALTHCARE OUTCOMES
------------------------------------------------------------
🎯 BUILDING PREDICTIVE MODELS FOR HEALTHCARE MANAGEMENT

🔧 FEATURE ENGINEERING:
Original columns: ['name', 'age', 'gender', 'blood_type', 'medical_condition', 'date_of_admission', 'doctor', 'hospital', 'insurance_provider', 'billing_amount', 'room_number', 'admission_type', 'discharge_date', 'medication', 'test_results', 'length_of_stay', 'admission_year', 'admission_month', 'admission_day_of_week', 'admission_quarter']
✅ Encoded gender: 2 unique values
✅ Encoded medical_condition: 6 unique values
✅ Encoded admission_type: 3 unique values

📊 CREATING TARGET VARIABLES:
✅ Length of stay categories:
  - Long: 42,514 patients (76.6%)
  - Medium: 7,484 patients (13.5%)
  - Short: 5,502 patients (9.9%)
✅ High cost patients: 27,750 (50.0%)
✅ Cost threshold: $25,538.07

🎯 PREDICTION TASK 1: LENGTH OF STAY
----------------------------------------
Features used: ['age', 'gend

In [11]:
# ==========================================
# MODULE 3: NETWORK ANALYSIS
# ==========================================

print(f"🕸️  MODULE 3: NETWORK ANALYSIS - HEALTHCARE RELATIONSHIPS")
print("-" * 60)

print("🔗 ANALYZING RELATIONSHIPS IN HEALTHCARE ECOSYSTEM")

# Doctor-Hospital Network Analysis
print(f"\n👨‍⚕️ DOCTOR-HOSPITAL NETWORK:")
doctor_hospital = df_healthcare.groupby(['doctor', 'hospital']).size().reset_index(name='patient_count')
print(f"• Total doctor-hospital relationships: {len(doctor_hospital):,}")
print(f"• Average patients per doctor-hospital pair: {doctor_hospital['patient_count'].mean():.1f}")

# Most collaborative doctors (work with multiple hospitals)
doctor_hospitals = df_healthcare.groupby('doctor')['hospital'].nunique().sort_values(ascending=False)
print(f"\n🔗 MOST COLLABORATIVE DOCTORS (multiple hospitals):")
for doctor, hospital_count in doctor_hospitals.head(5).items():
    patient_count = len(df_healthcare[df_healthcare['doctor'] == doctor])
    print(f"   {doctor}: {hospital_count} hospitals, {patient_count:,} patients")

# Hospital-Condition Network
print(f"\n🏥 HOSPITAL-CONDITION SPECIALIZATION:")
hospital_conditions = df_healthcare.groupby(['hospital', 'medical_condition']).size().unstack(fill_value=0)
hospital_specialization = hospital_conditions.div(hospital_conditions.sum(axis=1), axis=0)

# Find each hospital's top specialization
print(f"Top specializations by hospital:")
for hospital in hospital_specialization.index[:5]:  # First 5 hospitals
    top_condition = hospital_specialization.loc[hospital].idxmax()
    specialization_rate = hospital_specialization.loc[hospital, top_condition]
    patient_count = hospital_conditions.loc[hospital, top_condition]
    print(f"   {hospital[:30]:<30}: {top_condition} ({specialization_rate:.1%}, {patient_count} patients)")

# Insurance-Cost Network Analysis
print(f"\n💰 INSURANCE-COST ANALYSIS:")
insurance_stats = df_healthcare.groupby('insurance_provider').agg({
    'billing_amount': ['count', 'mean', 'median', 'std']
}).round(2)
insurance_stats.columns = ['Patient_Count', 'Avg_Cost', 'Median_Cost', 'Cost_StdDev']
insurance_stats = insurance_stats.sort_values('Patient_Count', ascending=False)

print(f"Top insurance providers by patient volume:")
for idx, (provider, row) in enumerate(insurance_stats.head(5).iterrows()):
    print(f"   {idx+1}. {provider[:25]:<25}: {row['Patient_Count']:,} patients, Avg: ${row['Avg_Cost']:,.2f}")

# Medication-Condition Network
print(f"\n💊 MEDICATION-CONDITION RELATIONSHIPS:")
med_condition = df_healthcare.groupby(['medication', 'medical_condition']).size().reset_index(name='frequency')
med_condition = med_condition.sort_values('frequency', ascending=False)

print(f"Most common medication-condition pairs:")
for _, row in med_condition.head(5).iterrows():
    print(f"   {row['medication']} → {row['medical_condition']}: {row['frequency']:,} cases")

# Network Density Metrics
print(f"\n📊 NETWORK METRICS:")
unique_doctors = df_healthcare['doctor'].nunique()
unique_hospitals = df_healthcare['hospital'].nunique()
unique_conditions = df_healthcare['medical_condition'].nunique()
unique_medications = df_healthcare['medication'].nunique()

print(f"• Network nodes:")
print(f"  - Doctors: {unique_doctors:,}")
print(f"  - Hospitals: {unique_hospitals:,}")
print(f"  - Conditions: {unique_conditions:,}")
print(f"  - Medications: {unique_medications:,}")

print(f"• Network density:")
print(f"  - Doctor-Hospital connections: {len(doctor_hospital):,}")
print(f"  - Possible connections: {unique_doctors * unique_hospitals:,}")
print(f"  - Network density: {len(doctor_hospital)/(unique_doctors * unique_hospitals):.3f}")


🕸️  MODULE 3: NETWORK ANALYSIS - HEALTHCARE RELATIONSHIPS
------------------------------------------------------------
🔗 ANALYZING RELATIONSHIPS IN HEALTHCARE ECOSYSTEM

👨‍⚕️ DOCTOR-HOSPITAL NETWORK:
• Total doctor-hospital relationships: 50,000
• Average patients per doctor-hospital pair: 1.1

🔗 MOST COLLABORATIVE DOCTORS (multiple hospitals):
   Michael Smith: 24 hospitals, 27 patients
   John Smith: 21 hospitals, 22 patients
   Robert Smith: 19 hospitals, 22 patients
   Michael Johnson: 19 hospitals, 20 patients
   David Smith: 18 hospitals, 19 patients

🏥 HOSPITAL-CONDITION SPECIALIZATION:
Top specializations by hospital:
   Abbott Inc                    : Arthritis (100.0%, 1 patients)
   Abbott Ltd                    : Arthritis (100.0%, 1 patients)
   Abbott Moore and Williams,    : Obesity (100.0%, 1 patients)
   Abbott and Thompson, Sullivan : Hypertension (100.0%, 1 patients)
   Abbott, Peters and Hoffman    : Diabetes (100.0%, 2 patients)

💰 INSURANCE-COST ANALYSIS:
Top insu

In [19]:
# ==========================================
# MODULE 4: DASHBOARD CREATION
# ==========================================

print(f"📊 MODULE 4: HEALTHCARE MANAGEMENT DASHBOARD")
print("-" * 60)

print("📈 COMPREHENSIVE HOSPITAL MANAGEMENT KPIs")

# Main Dashboard KPIs
total_patients = len(df_healthcare)
total_revenue = df_healthcare['billing_amount'].sum()
avg_revenue_per_patient = df_healthcare['billing_amount'].mean()
avg_length_of_stay = df_healthcare['length_of_stay'].mean()
bed_occupancy_days = df_healthcare['length_of_stay'].sum()

print(f"\n🏥 HOSPITAL DASHBOARD - EXECUTIVE SUMMARY")
print(f"{'='*60}")
print(f"👥 Total Patients Served:      {total_patients:,}")
print(f"💰 Total Revenue Generated:    ${total_revenue:,.2f}")
print(f"💳 Average Revenue per Patient: ${avg_revenue_per_patient:,.2f}")
print(f"⏱️  Average Length of Stay:     {avg_length_of_stay:.1f} days")
print(f"🛏️  Total Bed Occupancy Days:   {bed_occupancy_days:,}")
print(f"📅 Data Period:                {df_healthcare['date_of_admission'].min().date()} to {df_healthcare['date_of_admission'].max().date()}")

# Department Performance Dashboard
print(f"\n🏥 DEPARTMENT PERFORMANCE (by Medical Condition)")
print("-" * 50)
dept_performance = df_healthcare.groupby('medical_condition').agg({
    'name': 'count',  # Use the first column as patient count proxy
    'billing_amount': ['sum', 'mean'],
    'length_of_stay': 'mean'
}).round(2)

dept_performance.columns = ['Patient_Count', 'Total_Revenue', 'Avg_Revenue', 'Avg_Stay']
dept_performance = dept_performance.sort_values('Patient_Count', ascending=False)

print(f"{'Department':<15} | {'Patients':<8} | {'Total Revenue':<12} | {'Avg Revenue':<11} | {'Avg Stay':<8}")
print("-" * 70)
for condition, row in dept_performance.head(10).iterrows():
    print(f"{condition[:14]:<15} | {row['Patient_Count']:>8,} | ${row['Total_Revenue']:>11,.0f} | ${row['Avg_Revenue']:>10,.0f} | {row['Avg_Stay']:>7.1f}d")

# Financial Dashboard
print(f"\n💰 FINANCIAL PERFORMANCE DASHBOARD")
print("-" * 40)

# Revenue by insurance provider
insurance_revenue = df_healthcare.groupby('insurance_provider').agg({
    'billing_amount': ['sum', 'count', 'mean']
}).round(2)
insurance_revenue.columns = ['Total_Revenue', 'Patient_Count', 'Avg_Revenue']
insurance_revenue = insurance_revenue.sort_values('Total_Revenue', ascending=False)

print(f"Revenue by Insurance Provider (Top 5):")
for provider, row in insurance_revenue.head(5).iterrows():
    print(f"• {provider[:25]:<25}: ${row['Total_Revenue']:>12,.2f} ({row['Patient_Count']:,} patients)")

# Cost analysis by admission type
print(f"\n🚨 ADMISSION TYPE ANALYSIS:")
admission_analysis = df_healthcare.groupby('admission_type').agg({
    'billing_amount': ['count', 'mean', 'sum'],
    'length_of_stay': 'mean'
}).round(2)
admission_analysis.columns = ['Cases', 'Avg_Cost', 'Total_Revenue', 'Avg_Stay']

for admission_type, row in admission_analysis.iterrows():
    print(f"• {admission_type:<12}: {row['Cases']:>6,} cases, ${row['Avg_Cost']:>8,.2f} avg, {row['Avg_Stay']:>5.1f}d stay")

# Operational Dashboard
print(f"\n⚙️ OPERATIONAL METRICS DASHBOARD")
print("-" * 40)

# Bed utilization by hospital
hospital_utilization = df_healthcare.groupby('hospital').agg({
    'length_of_stay': 'sum',
    'name': 'count'
}).round(1)
hospital_utilization.columns = ['Total_Bed_Days', 'Total_Patients']
hospital_utilization['Avg_Stay'] = hospital_utilization['Total_Bed_Days'] / hospital_utilization['Total_Patients']
hospital_utilization = hospital_utilization.sort_values('Total_Bed_Days', ascending=False)

print(f"Hospital Utilization (Top 5):")
for hospital, row in hospital_utilization.head(5).iterrows():
    print(f"• {hospital[:25]:<25}: {row['Total_Bed_Days']:>6,.0f} bed-days, {row['Total_Patients']:>5,} patients")

# Quality Metrics Dashboard
print(f"\n📊 QUALITY & EFFICIENCY METRICS")
print("-" * 40)

# Readmission risk analysis (patients with multiple records)
patient_visits = df_healthcare['name'].value_counts()
multiple_visits = patient_visits[patient_visits > 1]

print(f"Patient Visit Patterns:")
print(f"• Unique patients: {len(patient_visits):,}")
print(f"• Patients with multiple visits: {len(multiple_visits):,} ({len(multiple_visits)/len(patient_visits):.1%})")
print(f"• Average visits per patient: {patient_visits.mean():.2f}")

# Test results analysis
if 'test_results' in df_healthcare.columns:
    test_results_dist = df_healthcare['test_results'].value_counts()
    print(f"\nTest Results Distribution:")
    for result, count in test_results_dist.items():
        print(f"• {result}: {count:,} ({count/len(df_healthcare):.1%})")

# Alert System
print(f"\n🚨 AUTOMATED ALERT SYSTEM")
print("-" * 30)

# High-cost patients alert
high_cost_threshold = df_healthcare['billing_amount'].quantile(0.95)
high_cost_patients = df_healthcare[df_healthcare['billing_amount'] > high_cost_threshold]
print(f"• High-cost patients (>95th percentile): {len(high_cost_patients):,}")
print(f"  Threshold: ${high_cost_threshold:,.2f}")

# Long-stay patients alert
long_stay_threshold = df_healthcare['length_of_stay'].quantile(0.90)
long_stay_patients = df_healthcare[df_healthcare['length_of_stay'] > long_stay_threshold]
print(f"• Long-stay patients (>90th percentile): {len(long_stay_patients):,}")
print(f"  Threshold: {long_stay_threshold:.0f} days")

# Resource planning recommendations
print(f"\n💡 RESOURCE PLANNING RECOMMENDATIONS")
print("-" * 40)
print("Based on the analysis:")

# Peak admission days
peak_day = weekly_admissions.idxmax()
print(f"• Highest admissions on {peak_day}: Plan extra staffing")

# Seasonal patterns
peak_month = monthly_admissions.idxmax()
print(f"• Peak admission month: {month_names[peak_month-1]} - Prepare for increased capacity")

# High-revenue conditions
top_revenue_condition = dept_performance.index[0]
print(f"• Focus on {top_revenue_condition} department - highest patient volume")

print(f"\n" + "="*70)
print("🎉 COMPREHENSIVE HEALTHCARE ANALYTICS COMPLETE!")
print("📊 Successfully analyzed 55,500 patient records")
print("="*70)

📊 MODULE 4: HEALTHCARE MANAGEMENT DASHBOARD
------------------------------------------------------------
📈 COMPREHENSIVE HOSPITAL MANAGEMENT KPIs

🏥 HOSPITAL DASHBOARD - EXECUTIVE SUMMARY
👥 Total Patients Served:      55,500
💰 Total Revenue Generated:    $1,417,432,043.40
💳 Average Revenue per Patient: $25,539.32
⏱️  Average Length of Stay:     15.5 days
🛏️  Total Bed Occupancy Days:   860,750
📅 Data Period:                2019-05-08 to 2024-05-07

🏥 DEPARTMENT PERFORMANCE (by Medical Condition)
--------------------------------------------------
Department      | Patients | Total Revenue | Avg Revenue | Avg Stay
----------------------------------------------------------------------
Arthritis       |  9,308.0 | $237,329,120 | $    25,497 |    15.5d
Diabetes        |  9,304.0 | $238,539,725 | $    25,638 |    15.4d
Hypertension    |  9,245.0 | $235,720,650 | $    25,497 |    15.5d
Obesity         |  9,231.0 | $238,214,921 | $    25,806 |    15.5d
Cancer          |  9,227.0 | $232,167,861