In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Create hospital records
# Seed NumPy's global RNG so the synthetic records below are identical after
# every Restart & Run All.
np.random.seed(42)

# Category pools the random records are drawn from. The hospital pool is named
# `hospital_names` because `hospitals` is rebound further down to the
# DataFrame read from hospitals.csv.
hospital_names = ['City General', 'Metro Health', 'Regional Medical', 'Community Care', 'University Hospital']
departments = ['Emergency', 'Cardiology', 'Orthopedics', 'Pediatrics', 'Surgery']
insurance_types = ['Private', 'Medicare', 'Medicaid', 'Uninsured']

# Create 1000 patient records
n_patients = 1000

# One array-like of length n_patients per column.
# np.random.randint excludes its upper bound: ages are 18-84, stays are
# 1-14 days, and costs are 1000-49999.
patient_data = {
    'patient_id': range(1, n_patients + 1),
    'hospital': np.random.choice(hospital_names, n_patients),
    'department': np.random.choice(departments, n_patients),
    'age': np.random.randint(18, 85, n_patients),
    'stay_duration': np.random.randint(1, 15, n_patients),
    'insurance_type': np.random.choice(insurance_types, n_patients),
    'admission_cost': np.random.randint(1000, 50000, n_patients)
}

# Lift pandas' display limits: show every column, let the output width be
# auto-detected, and never truncate cell contents. Row limits are left as-is.
for option_name in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# NOTE(review): `hospital_data` and `location_data` were referenced here but
# never defined anywhere in this notebook, so the cell failed with NameError.
# The sample tables below are placeholders that carry the columns the later
# cells use -- replace them with the real source data if it exists.
hospital_data = {
    'hospital_id': [1, 2, 3, 4, 5, 6, 7, 8],
    'hospital_name': [
        'City General', 'Metro Health', 'Regional Medical', 'Community Care',
        'University Hospital', 'St. Mary General', 'Sunrise Medical Center',
        'Lakeside Clinic',
    ],
    'bed_count': [450, 320, 510, 150, 600, 280, 380, 120],
    'specialty': [
        'General', 'Cardiac', 'Trauma', 'Pediatric',
        'Cardiac', 'General', 'Trauma', 'Orthopedic',
    ],
    'emergency_services': [True, True, True, False, True, False, True, False],
    'patient_satisfaction': [4.2, 4.6, 3.9, 4.7, 4.4, 4.1, 4.8, 3.7],
}

# One location row per hospital, joined to the table above on hospital_id.
location_data = {
    'hospital_id': [1, 2, 3, 4, 5, 6, 7, 8],
    'city': [
        'Los Angeles', 'New York', 'Houston', 'Chicago',
        'Boston', 'San Diego', 'Seattle', 'Miami',
    ],
    'state': ['CA', 'NY', 'TX', 'IL', 'MA', 'CA', 'WA', 'FL'],
    'region': [
        'West', 'Northeast', 'South', 'Midwest',
        'Northeast', 'West', 'West', 'South',
    ],
}

# Create DataFrames
hospitals_df = pd.DataFrame(hospital_data)
locations_df = pd.DataFrame(location_data)

# Save to CSV files (index=False keeps the row index out of the files)
hospitals_df.to_csv('hospitals.csv', index=False)
locations_df.to_csv('locations.csv', index=False)

# Load the CSV files; `hospitals` and `locations` are the frames the rest of
# the notebook works with.
hospitals = pd.read_csv('hospitals.csv')
locations = pd.read_csv('locations.csv')

EDA

In [None]:
# Create DataFrame
df = pd.DataFrame(patient_data)

# Quick structural checks. A notebook renders only the final expression of a
# cell, so the bare expressions below serve as a reference of handy calls;
# move one to the end of its own cell to see its output.
len(df)
list(df.columns)
df.columns.tolist()
for i, col in enumerate(df.columns):
    print(f"{i+1}. '{col}'")

df.dtypes
df.shape
df.head(3)
df.info()
df.describe()
df['hospital'].unique()

# Example filter: shape of the subset with stays longer than a week.
# Swap in any valid query expression.
df.query('stay_duration > 7').shape

# Per-hospital stay statistics, bound to names so they can be reused.
avg_stay_by_hospital = df.groupby('hospital')['stay_duration'].mean()
hospital_stats = df.groupby('hospital')['stay_duration'].agg([
    'count',    # Number of patients
    'mean',     # Average stay
    'median',   # Median stay
    'std',      # Standard deviation
    'min',      # Minimum stay
    'max'       # Maximum stay
]).round(2)
hospital_stats.columns = ['Patient_Count', 'Avg_Stay', 'Median_Stay', 'Std_Stay', 'Min_Stay', 'Max_Stay']

# Distribution of stay lengths: Short (<= 3 days), Medium (<= 7 days), Long.
# Computed on the fly because df has no 'stay_category' column at this point.
pd.cut(
    df['stay_duration'],
    bins=[-np.inf, 3, 7, np.inf],
    labels=['Short', 'Medium', 'Long'],
).value_counts()

Visualization

In [None]:
# Create visualization
# Both series are computed here so the cell runs on a fresh kernel without
# relying on variables left behind by other cells.
avg_stay_by_hospital = df.groupby('hospital')['stay_duration'].mean()
patient_counts = df['hospital'].value_counts()

# One figure, two side-by-side axes; plotting through explicit axes avoids
# depending on pyplot's "current axes" state.
fig, (ax_stay, ax_count) = plt.subplots(1, 2, figsize=(12, 6))

# Plot 1: Average stay by hospital
avg_stay_by_hospital.plot(kind='bar', color='skyblue', ax=ax_stay)
ax_stay.set_title('Average Stay Duration by Hospital')
ax_stay.set_xlabel('Hospital')
ax_stay.set_ylabel('Average Stay (Days)')
ax_stay.tick_params(axis='x', labelrotation=45)

# Plot 2: Patient count by hospital
patient_counts.plot(kind='bar', color='lightcoral', ax=ax_count)
ax_count.set_title('Patient Count by Hospital')
ax_count.set_xlabel('Hospital')
ax_count.set_ylabel('Number of Patients')
ax_count.tick_params(axis='x', labelrotation=45)

# A single layout pass once both panels are drawn.
fig.tight_layout()
plt.show()

Conditional Filtering

In [None]:
# Boolean masks shared by the filters below.
beds = hospitals['bed_count']
has_emergency = hospitals['emergency_services'].eq(True)

# --- Filters on bed count and emergency services ---
large_hospitals = hospitals[beds > 400]
# Combine conditions with & (and); each condition is its own boolean mask.
large_emergency = hospitals[(beds > 300) & has_emergency]
# between() is inclusive on both ends: 300 <= bed_count <= 500.
medium_hospitals = hospitals[beds.between(300, 500)]
emergency_hospitals = hospitals[has_emergency]

print(large_hospitals[['hospital_name', 'bed_count']])
# The large-emergency filter used to be computed and printed twice in this
# cell; it now runs once.
print(large_emergency[['hospital_name', 'bed_count', 'emergency_services']])
print(medium_hospitals[['hospital_name', 'bed_count']])
print(emergency_hospitals[['hospital_name', 'emergency_services']])

# --- Filters on patient satisfaction and specialty ---
high_satisfaction = hospitals[hospitals['patient_satisfaction'] > 4.5]
# | (or) keeps rows that satisfy either condition.
cardiac_or_high_satisfaction = hospitals[
    (hospitals['specialty'] == 'Cardiac') |
    (hospitals['patient_satisfaction'] > 4.5)
]

print(high_satisfaction[['hospital_name', 'patient_satisfaction']])
print(cardiac_or_high_satisfaction[['hospital_name', 'specialty', 'patient_satisfaction']])

String-Based Filtering

In [None]:
# Names containing the literal text 'General'. regex=False treats the pattern
# as plain text, and na=False makes missing names count as non-matches
# instead of producing an invalid (NaN) mask.
general_hospitals = hospitals[
    hospitals['hospital_name'].str.contains('General', regex=False, na=False)
]
print(general_hospitals[['hospital_name']])

# Names beginning with a capital 'S' (the match is case-sensitive).
s_hospitals = hospitals[hospitals['hospital_name'].str.startswith('S', na=False)]
print("Hospitals starting with 'S':")
print(s_hospitals[['hospital_name']])

# Rows whose specialty is any of the listed values.
cardiac_trauma = hospitals[hospitals['specialty'].isin(['Cardiac', 'Trauma'])]
print(cardiac_trauma[['hospital_name', 'specialty']])

Sorting

In [None]:
# Column subsets shown after each sort
name_and_beds = ['hospital_name', 'bed_count']
name_and_satisfaction = ['hospital_name', 'patient_satisfaction']

# Bed count, smallest first (sort_values sorts ascending unless told otherwise)
sorted_by_beds_asc = hospitals.sort_values(by='bed_count')
print(sorted_by_beds_asc[name_and_beds])

# Bed count, largest first
sorted_by_beds_desc = hospitals.sort_values(by='bed_count', ascending=False)
print(sorted_by_beds_desc[name_and_beds])

# Patient satisfaction, highest first
sorted_by_satisfaction = hospitals.sort_values(by='patient_satisfaction', ascending=False)
print(sorted_by_satisfaction[name_and_satisfaction])

In [None]:
# Sort by specialty first, then by bed count within each specialty
sorted_specialty_beds = hospitals.sort_values(['specialty', 'bed_count'])
print(sorted_specialty_beds[['hospital_name', 'specialty', 'bed_count']])

# Sort by specialty (ascending) and patient satisfaction (descending);
# `ascending` takes one flag per sort key, in the same order as the keys.
sorted_mixed = hospitals.sort_values(
    ['specialty', 'patient_satisfaction'],
    ascending=[True, False],
)
print(sorted_mixed[['hospital_name', 'specialty', 'patient_satisfaction']])

# Sort locations by state, then by city
sorted_locations = locations.sort_values(['state', 'city'])
print(sorted_locations[['city', 'state', 'region']])

# Sort and reset index (drop=True discards the old index)
sorted_reset = hospitals.sort_values('patient_satisfaction', ascending=False).reset_index(drop=True)
print(sorted_reset[['hospital_name', 'patient_satisfaction']])

# Sort and keep original index as a column: without drop=True, reset_index()
# moves the old (unnamed) index into a column called 'index'.
sorted_keep_index = hospitals.sort_values('bed_count').reset_index()
print(sorted_keep_index[['index', 'hospital_name', 'bed_count']])

Merging

In [None]:
# Every join in this cell matches rows on the same key column
join_key = 'hospital_id'
name_city_state = ['hospital_name', 'city', 'state']

# Inner join (pd.merge's default): keep only keys present in both DataFrames
inner_merged = pd.merge(left=hospitals, right=locations, on=join_key)
print(f"Shape: {inner_merged.shape}")
print(inner_merged[['hospital_name', 'city', 'state', 'bed_count']])

# The DataFrame.merge method produces the same inner join
inner_merged_alt = hospitals.merge(right=locations, on=join_key)
print(inner_merged_alt[['hospital_name', 'city', 'state', 'specialty']])

# Left join: every row of the left DataFrame (hospitals) is kept
left_merged = pd.merge(left=hospitals, right=locations, how='left', on=join_key)
print(f"Shape: {left_merged.shape}")
print(left_merged[name_city_state])

# Right join: every row of the right DataFrame (locations) is kept
right_merged = pd.merge(left=hospitals, right=locations, how='right', on=join_key)
print(f"Shape: {right_merged.shape}")
print(right_merged[name_city_state])

# Outer join: rows from both DataFrames are kept
outer_merged = pd.merge(left=hospitals, right=locations, how='outer', on=join_key)
print(f"Shape: {outer_merged.shape}")
print(outer_merged[name_city_state])

In [None]:
# Left-join hospitals against only the CA / NY / TX locations
in_selected_states = locations['state'].isin(['CA', 'NY', 'TX'])
locations_subset = locations.loc[in_selected_states].copy()
partial_merge = hospitals.merge(locations_subset, how='left', on='hospital_id')

# Copy of locations with an extra 'name' column built from the city
locations_with_conflict = locations.assign(name=locations['city'] + ' Location')

# Merge with suffixes
# NOTE(review): suffixes are only applied to non-key column names that exist
# in both frames; confirm hospitals really has a 'name' column, otherwise no
# suffix shows up in the result.
merge_with_suffixes = hospitals.merge(
    locations_with_conflict,
    on='hospital_id',
    suffixes=('_hospital', '_location'),
)

Combining Filtering, Sorting, and Merging

In [None]:
# Step 1: join hospitals to their locations
complete_data = hospitals.merge(locations, on='hospital_id')

# Step 2: keep large hospitals (>300 beds) that offer emergency services and
# sit in the West or Northeast region
is_large = complete_data['bed_count'].gt(300)
offers_emergency = complete_data['emergency_services'].eq(True)
in_target_region = complete_data['region'].isin(['West', 'Northeast'])
filtered_data = complete_data[is_large & offers_emergency & in_target_region]

# Step 3: highest patient satisfaction first
final_result = filtered_data.sort_values(by='patient_satisfaction', ascending=False)
report_columns = ['hospital_name', 'city', 'state', 'bed_count', 'patient_satisfaction', 'region']
print(final_result[report_columns])

Statistical Descriptive Analysis

In [None]:
# Join hospitals to locations so each hospital row carries its region
analysis_data = hospitals.merge(locations, on='hospital_id')

# Per-region statistics. Named aggregation yields flat column names directly:
# <column>_<statistic> for the numeric columns plus a per-region hospital count.
regional_stats = (
    analysis_data
    .groupby('region')
    .agg(
        bed_count_mean=('bed_count', 'mean'),
        bed_count_max=('bed_count', 'max'),
        bed_count_min=('bed_count', 'min'),
        patient_satisfaction_mean=('patient_satisfaction', 'mean'),
        patient_satisfaction_max=('patient_satisfaction', 'max'),
        patient_satisfaction_min=('patient_satisfaction', 'min'),
        hospital_count=('hospital_id', 'count'),
    )
    .round(2)
)

# Best-rated hospital per region: idxmax gives the row label of the highest
# satisfaction score in each group, and .loc pulls those rows back out.
best_row_per_region = analysis_data.groupby('region')['patient_satisfaction'].idxmax()
top_by_region = analysis_data.loc[best_row_per_region]
print(top_by_region[['hospital_name', 'region', 'patient_satisfaction', 'city']])

In [None]:
# Create a comprehensive hospital report
def create_hospital_report(hospitals_df, locations_df, min_satisfaction=4.0):
    """Merge hospital and location tables and summarise the result.

    Parameters
    ----------
    hospitals_df : pandas.DataFrame
        Must provide the columns read below: 'hospital_id', 'bed_count',
        'patient_satisfaction' and 'emergency_services'.
    locations_df : pandas.DataFrame
        Must provide 'hospital_id'; joined to ``hospitals_df`` with an inner
        merge on that column.
    min_satisfaction : float, default 4.0
        A hospital counts as "quality" when its patient_satisfaction is
        strictly greater than this value.

    Returns
    -------
    tuple
        ``(sorted_quality, summary_stats)``: the quality hospitals sorted by
        bed_count (largest first), and a dict of summary figures. Every
        figure except 'quality_hospitals' is computed over ALL merged
        hospitals, not only the quality subset.
    """
    # Merge data (inner join: hospitals without a location row are dropped)
    merged_data = pd.merge(hospitals_df, locations_df, on='hospital_id')

    # Filter for high-quality hospitals (satisfaction strictly above threshold)
    quality_hospitals = merged_data[merged_data['patient_satisfaction'] > min_satisfaction]

    # Sort by bed count descending
    sorted_quality = quality_hospitals.sort_values('bed_count', ascending=False)

    # Create summary statistics
    summary_stats = {
        'total_hospitals': len(merged_data),
        'quality_hospitals': len(quality_hospitals),
        'avg_bed_count': merged_data['bed_count'].mean(),
        'avg_satisfaction': merged_data['patient_satisfaction'].mean(),
        # Summing a boolean column counts its True values.
        'emergency_services_count': merged_data['emergency_services'].sum()
    }

    return sorted_quality, summary_stats

# Build the report from the hospitals and locations tables; the function
# returns the sorted quality hospitals together with a dict of summary figures
quality_report, stats = create_hospital_report(hospitals_df=hospitals, locations_df=locations)

Basic Transformations

In [None]:
# Coerce bed_count to a numeric dtype; with its default settings
# pd.to_numeric raises if a value cannot be parsed.
hospitals['bed_count'] = hospitals['bed_count'].pipe(pd.to_numeric)

Apply Transformation

In [None]:
def categorize_stay(stay_days):
    """Label a stay length in days as 'Short' (<= 3), 'Medium' (<= 7) or 'Long'."""
    # Upper bounds are checked in increasing order; the first one that the
    # stay does not exceed decides the label.
    for upper_bound, label in ((3, 'Short'), (7, 'Medium')):
        if stay_days <= upper_bound:
            return label
    return 'Long'

# Label every patient's stay and store the result as a new column
df['stay_category'] = df['stay_duration'].map(categorize_stay)

In [None]:
def calculate_cost_per_day(group):
    """Return a copy of ``group`` with an added 'cost_per_day' column.

    cost_per_day is admission_cost divided by stay_duration, row by row.
    The input frame is left untouched: this function is passed to
    ``groupby(...).apply``, and mutating the object pandas hands to the
    callback can produce unexpected results.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'admission_cost' and 'stay_duration' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows plus 'cost_per_day'.
    """
    return group.assign(cost_per_day=group['admission_cost'] / group['stay_duration'])

# Run the helper once per hospital group, then preview the first ten rows
patients_by_hospital = df.groupby('hospital')
df_with_cost_per_day = patients_by_hospital.apply(calculate_cost_per_day)
preview_columns = ['hospital', 'stay_duration', 'admission_cost', 'cost_per_day']
print(df_with_cost_per_day[preview_columns].head(10))

In [None]:
def hospital_efficiency_metrics(group):
    """Summarise one group of patient rows as a Series of metrics.

    Reads the 'stay_duration' and 'admission_cost' columns of ``group`` and
    returns: mean stay, mean cost, mean cost divided by mean stay
    ('efficiency_ratio'), the number of rows, and the variance of the cost.
    """
    stay = group['stay_duration']
    cost = group['admission_cost']
    mean_stay = stay.mean()
    mean_cost = cost.mean()

    return pd.Series({
        'avg_stay': mean_stay,
        'avg_cost': mean_cost,
        'efficiency_ratio': mean_cost / mean_stay,
        'patient_count': len(group),
        'cost_variance': cost.var(),
    })

# One row of metrics per hospital, shown rounded to two decimals
patients_per_hospital = df.groupby('hospital')
hospital_efficiency = patients_per_hospital.apply(hospital_efficiency_metrics)
print(hospital_efficiency.round(2))

Chaining

In [None]:
def department_analysis(group):
    """Summarise one hospital/department group of patient rows.

    Returns a Series holding the mean age, mean stay, mean admission cost,
    the number of rows, and the correlation between age and stay duration
    within the group.
    """
    age = group['age']
    stay = group['stay_duration']

    summary = {
        'avg_age': age.mean(),
        'avg_stay': stay.mean(),
        'avg_cost': group['admission_cost'].mean(),
        'patient_count': len(group),
        'age_stay_correlation': age.corr(stay),
    }
    return pd.Series(summary)

# One row of metrics per (hospital, department) pair; preview the first ten
hospital_department_groups = df.groupby(['hospital', 'department'])
dept_analysis = hospital_department_groups.apply(department_analysis)
print(dept_analysis.head(10).round(2))

In [None]:
# Mean admission cost per hospital, restricted to stays longer than 5 days.
# Each step of the chain feeds its result straight into the next one.
long_stay_avg_cost = (
    df
    .loc[lambda rows: rows['stay_duration'] > 5]   # keep long stays only
    .groupby('hospital')['admission_cost']         # cost column, per hospital
    .mean()
    .round(2)
)

print("Average cost for patients with stay > 5 days:")
print(long_stay_avg_cost)

In [None]:
# Statistics to compute per (hospital, department) group: column -> function(s)
elderly_private_aggregations = {
    'stay_duration': ['mean', 'count'],
    'admission_cost': ['mean', 'sum'],
    'age': 'mean',
}

# Patients older than 65 with private insurance, summarised per group
elderly_private_analysis = (
    df
    .loc[lambda rows: (rows['age'] > 65) & (rows['insurance_type'] == "Private")]
    .groupby(['hospital', 'department'])
    .agg(elderly_private_aggregations)
    .round(2)
)

print("Analysis for elderly patients with private insurance:")
print(elderly_private_analysis.head(10))

In [None]:
def advanced_group_analysis(group, min_patients=5):
    """Compute stay and cost metrics for one group of patient rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'stay_duration' and 'admission_cost' columns.
    min_patients : int, default 5
        Groups with fewer rows than this are skipped. Extra keyword
        arguments given to ``groupby(...).apply`` are forwarded to the
        callback, so this can be set as
        ``.apply(advanced_group_analysis, min_patients=10)``.

    Returns
    -------
    pandas.Series or None
        ``None`` for a skipped group; otherwise a Series with
        'patient_count', 'avg_stay', 'stay_std', 'cost_efficiency' (total
        cost divided by total stay days) and 'high_cost_patients' (rows
        whose cost is above the group's own 75th percentile).
    """
    patient_count = len(group)
    if patient_count < min_patients:  # Skip groups with too few patients
        return None

    stay = group['stay_duration']
    cost = group['admission_cost']

    return pd.Series({
        'patient_count': patient_count,
        'avg_stay': stay.mean(),
        'stay_std': stay.std(),
        'cost_efficiency': cost.sum() / stay.sum(),
        'high_cost_patients': (cost > cost.quantile(0.75)).sum()
    })

# Filter -> group -> custom apply -> clean up, written as a single chain
advanced_results = (
    df
    .loc[lambda rows: (rows['stay_duration'] >= 3) & (rows['admission_cost'] > 5000)]
    .groupby('hospital')
    .apply(advanced_group_analysis)   # one row of metrics per hospital
    .dropna()                         # meant to discard groups the helper skipped
    .round(2)
)
print(advanced_results)

In [None]:
# Insurance types included in the ranking
covered_plans = ["Private", "Medicare"]

# Ten hospital/department pairs with the highest mean admission cost, counting
# only pairs that have at least 10 patients on the selected insurance types
top_departments = (
    df
    .loc[lambda rows: rows['insurance_type'].isin(covered_plans)]
    .groupby(['hospital', 'department'])
    .agg(
        admission_cost=('admission_cost', 'mean'),
        stay_duration=('stay_duration', 'mean'),
        patient_count=('patient_id', 'count'),
    )
    .loc[lambda groups: groups['patient_count'] >= 10]
    .sort_values('admission_cost', ascending=False)
    .head(10)
)

print("Top 10 Most Expensive Department-Hospital Combinations:")
print(top_departments.round(2))

In [None]:
def comprehensive_hospital_analysis(df):
    """Run the full pipeline: filter, derive columns, then summarise.

    Works on a filtered copy of ``df`` (age >= 18 and stay_duration > 0), so
    the caller's frame is not modified.

    Returns a pair ``(hospital_summary, dept_analysis)``:
    - hospital_summary: per-hospital aggregates, rounded to 2 decimals.
    - dept_analysis: one row per (hospital, department) with patient_count,
      avg_stay, avg_cost and efficiency_score (mean cost / mean stay),
      sorted by efficiency_score in descending order.
    """

    def _age_bucket(age):
        # Under 35 -> Young, under 65 -> Middle, everything else -> Senior
        if age < 35:
            return 'Young'
        if age < 65:
            return 'Middle'
        return 'Senior'

    def _department_metrics(rows):
        # Metrics for a single (hospital, department) group
        mean_stay = rows['stay_duration'].mean()
        mean_cost = rows['admission_cost'].mean()
        return pd.Series({
            'patient_count': len(rows),
            'avg_stay': mean_stay,
            'avg_cost': mean_cost,
            'efficiency_score': mean_cost / mean_stay,
        })

    # Step 1: basic data-quality filter, taken as an independent copy
    passes_quality_checks = (df['age'] >= 18) & (df['stay_duration'] > 0)
    filtered_data = df[passes_quality_checks].copy()

    # Step 2: derived columns
    filtered_data['cost_per_day'] = filtered_data['admission_cost'] / filtered_data['stay_duration']
    filtered_data['age_group'] = filtered_data['age'].apply(_age_bucket)

    # Step 3: hospital-level aggregates
    hospital_aggregations = {
        'patient_id': 'count',
        'stay_duration': ['mean', 'median', 'std'],
        'admission_cost': ['mean', 'sum'],
        'cost_per_day': 'mean',
        'age': 'mean',
    }
    hospital_summary = filtered_data.groupby('hospital').agg(hospital_aggregations).round(2)

    # Step 4: department-level metrics within each hospital
    per_department = filtered_data.groupby(['hospital', 'department']).apply(_department_metrics)
    dept_analysis = per_department.reset_index().sort_values('efficiency_score', ascending=False)

    return hospital_summary, dept_analysis

# Run the pipeline on the patient records
hospital_results, department_results = comprehensive_hospital_analysis(df)

# department_results arrives sorted by efficiency_score (mean cost divided by
# mean stay) in descending order, so head(10) shows the highest scores.
print("=== COMPREHENSIVE HOSPITAL ANALYSIS ===")
for heading, table in (
    ("\n1. Hospital Summary Statistics:", hospital_results),
    ("\n2. Top 10 Most Efficient Departments:", department_results.head(10)),
):
    print(heading)
    print(table)