# Hospital Readmission Prediction Project
## Phase 1-3: Data Loading, Cleaning & EDA

**Author:** Vindya Siriwardhana  
**Dataset:** Diabetes 130-US Hospitals  
**Goal:** Predict 30-day hospital readmissions using machine learning

---
## PHASE 1: SETUP & DATA (Steps 1-3)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Settings
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

print("‚úÖ Libraries imported successfully!")

### Step 2: Load Data

In [None]:
# Load the dataset into the notebook-wide DataFrame `df`.
# NOTE(review): the path is absolute and environment-specific; the cell fails
# with FileNotFoundError anywhere the file is not at this exact location.
df = pd.read_csv('/mnt/user-data/uploads/diabetic_data.csv')

# Report the row and column counts of the loaded frame
print("‚úÖ Data loaded successfully!")
print(f"\nüìä Dataset Shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")

In [None]:
# Section banner, then a preview of the first records
print("\n" + "=" * 80, "INITIAL DATA INSPECTION", "=" * 80, sep="\n")

# Last expression of the cell, so the notebook renders it as a table
print("\nüìã First 5 rows:")
df.head(5)

In [None]:
# Column names and types: DataFrame.info() prints its summary directly
# (it returns None, so nothing extra is rendered by the notebook).
print("\nüìä Column Information:")
df.info()

In [None]:
# Basic statistics: describe() is the last expression of the cell, so the
# notebook renders the resulting summary table.
print("\nüìà Numerical Columns Statistics:")
df.describe()

### Step 3: Understand Target Variable

In [None]:
# Inspect the raw `readmitted` column before it is recoded
print("\n" + "=" * 80, "TARGET VARIABLE ANALYSIS", "=" * 80, sep="\n")

# Absolute frequency of each readmission label
print("\nüéØ Readmission Distribution:")
print(df['readmitted'].value_counts())

# The same frequencies expressed as a share of all rows, in percent
print("\nüìä Readmission Percentages:")
print(df['readmitted'].value_counts(normalize=True) * 100)

In [None]:
# Side-by-side bar charts of the raw target: counts (left) and percentages (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

bar_colors = ['#2ecc71', '#e74c3c', '#3498db']
panels = [
    (df['readmitted'].value_counts(),
     'Readmission Distribution (Count)', 'Count'),
    (df['readmitted'].value_counts(normalize=True) * 100,
     'Readmission Distribution (Percentage)', 'Percentage (%)'),
]

# Both panels share everything except the plotted series, title and y label
for ax, (series, title, ylabel) in zip(axes, panels):
    series.plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Readmission Status')
    ax.set_ylabel(ylabel)
    ax.tick_params(rotation=0)  # keep the category labels horizontal

plt.tight_layout()
plt.show()

print("\n‚úÖ Target variable analyzed!")

### Step 3 (continued): Create Binary Target Variable

**Strategy:** Convert to binary classification:
- `NO` → 0 (Not readmitted)
- `<30` → 1 (Readmitted within 30 days) ⚠️ **HIGH RISK**
- `>30` → 0 (Readmitted after 30 days - not our focus)

In [None]:
# Create binary target: 1 if readmitted <30 days, 0 otherwise
# ('NO' and '>30' both map to 0).
df['readmitted_binary'] = (df['readmitted'] == '<30').astype(int)

# Tabulate the classes once and reuse the result below
target_counts = df['readmitted_binary'].value_counts()

print("\nüéØ Binary Target Distribution:")
print(target_counts)
print("\nüìä Binary Target Percentages:")
print(df['readmitted_binary'].value_counts(normalize=True) * 100)

# Check class imbalance (negatives per positive).
# .get() avoids a KeyError when one class is absent from the data, and the
# ratio falls back to infinity instead of dividing by zero when there are
# no positive rows.
n_negative = int(target_counts.get(0, 0))
n_positive = int(target_counts.get(1, 0))
imbalance_ratio = n_negative / n_positive if n_positive else float('inf')
print(f"\n‚öñÔ∏è Class Imbalance Ratio: {imbalance_ratio:.2f}:1")
if imbalance_ratio > 3:
    print("‚ö†Ô∏è WARNING: Significant class imbalance detected! Will need SMOTE or class weighting.")

print("\n‚úÖ Binary target variable created!")

---
## PHASE 2: DATA CLEANING (Steps 4-6)

### Step 4: Handle Missing Values

In [None]:
print("\n" + "=" * 80, "MISSING VALUES ANALYSIS", "=" * 80, sep="\n")

# Two kinds of "missing" are counted per column: real NaN/None cells and
# cells holding the literal placeholder string '?'.
missing_counts = df.isna().sum()
question_mark_counts = (df == '?').sum()

# Per-column total and its share of all rows, in percent
total_missing = missing_counts + question_mark_counts
missing_percentage = total_missing / len(df) * 100

# One row per column, worst-affected columns first
missing_summary = pd.DataFrame(
    {
        'Missing_Count': total_missing,
        'Missing_Percentage': missing_percentage,
    }
)
missing_summary = missing_summary.sort_values('Missing_Percentage', ascending=False)

print("\nüìä Columns with Missing Values (>0%):")
print(missing_summary.loc[missing_summary['Missing_Percentage'] > 0])

# Flag columns above the 30% threshold as candidates for removal
# (nothing is dropped here; this only reports them).
high_missing = missing_summary.loc[missing_summary['Missing_Percentage'] > 30]
if not high_missing.empty:
    print(f"\n‚ö†Ô∏è {len(high_missing)} columns have >30% missing values")
    print("\nColumns to consider dropping:")
    print(high_missing)

In [None]:
# Horizontal bar chart of the (at most) 15 columns with the largest missing share
has_missing = missing_summary['Missing_Percentage'] > 0
top_missing = missing_summary.loc[has_missing].head(15)

# Skip the figure entirely when no column has missing data
if not top_missing.empty:
    plt.figure(figsize=(12, 6))
    plt.barh(top_missing.index, top_missing['Missing_Percentage'], color='coral')
    plt.xlabel('Missing Percentage (%)')
    plt.title('Top 15 Columns with Missing Values', fontsize=14, fontweight='bold')
    # Reference line at the 30% threshold used to flag columns for removal
    plt.axvline(x=30, color='red', linestyle='--', label='30% threshold')
    plt.legend()
    plt.tight_layout()
    plt.show()

print("\n‚úÖ Missing values analyzed!")

### Step 5: Remove Duplicates and Irrelevant Columns

In [None]:
print("\n" + "=" * 80, "DATA CLEANING: DUPLICATES & IRRELEVANT COLUMNS", "=" * 80, sep="\n")

# Count rows that repeat an earlier row in every column
duplicates = df.duplicated().sum()
print(f"\nüîç Duplicate rows found: {duplicates:,}")

# Only rebind `df` when there is something to remove
if duplicates:
    print(f"‚ö†Ô∏è Removing {duplicates:,} duplicate rows...")
    df = df.drop_duplicates()
    print(f"‚úÖ After removal: {df.shape[0]:,} rows")

print("\nüìã Columns Analysis:", f"Total columns: {df.shape[1]}", sep="\n")

# Identifier columns carry no predictive signal. They are only listed here;
# this cell does not drop them from `df`.
id_columns = ['encounter_id', 'patient_nbr']
print(f"\nüî∏ ID columns (to drop): {id_columns}")

# Record the frame's shape after duplicate handling
original_shape = df.shape
print(f"\nüìä Current shape: {original_shape}")

### Step 6: Initial Data Type Check

In [None]:
print("\n" + "=" * 80, "DATA TYPES ANALYSIS", "=" * 80, sep="\n")

# Split the columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)

# Print at most the first ten categorical names, with "..." when truncated
print(f"\nüìä Categorical columns ({len(categorical_cols)}):")
print(categorical_cols[:10], "..." if len(categorical_cols) > 10 else "")

print(f"\nüìä Numerical columns ({len(numerical_cols)}):")
print(numerical_cols)

# Cardinality of the first ten categorical columns, to spot high-cardinality ones
print("\nüìä Unique values in categorical columns:")
for col in categorical_cols[:10]:
    print(f"{col:30s}: {df[col].nunique():5d} unique values")

print("\n‚úÖ Data types analyzed!")

---
## PHASE 3: EXPLORATORY DATA ANALYSIS (Steps 7-9)

### Step 7: Descriptive Statistics

In [None]:
print("\n" + "=" * 80, "DESCRIPTIVE STATISTICS", "=" * 80, sep="\n")

# Row count per age label, ordered by the label itself rather than by frequency
print("\nüë• Age Distribution:")
print(df['age'].value_counts().sort_index())

In [None]:
# Gender distribution: absolute counts per category, then the same
# distribution as a percentage of all counted rows.
print("\nüë• Gender Distribution:")
print(df['gender'].value_counts())
print("\nPercentages:")
print(df['gender'].value_counts(normalize=True) * 100)

In [None]:
# Race distribution: absolute counts per category, then percentages.
# NOTE(review): '?' placeholders, if present in this column, are counted as
# an ordinary category here because they are plain strings, not NaN.
print("\nüåç Race Distribution:")
print(df['race'].value_counts())
print("\nPercentages:")
print(df['race'].value_counts(normalize=True) * 100)

In [None]:
# describe() summaries for three numeric columns, each under its own heading
stat_sections = (
    ("\nüè• Time in Hospital Statistics:", 'time_in_hospital'),
    ("\nüíä Number of Medications Statistics:", 'num_medications'),
    ("\nüî¨ Number of Lab Procedures Statistics:", 'num_lab_procedures'),
)
for heading, column in stat_sections:
    print(heading)
    print(df[column].describe())

### Step 8: Visualizations

In [None]:
# Section banner for the plotting cells that follow
print("\n" + "=" * 80, "DATA VISUALIZATIONS", "=" * 80, sep="\n")

In [None]:
# 1. Age vs Readmission
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: within each age group, the percentage of rows in each target
# class (normalize='index' makes every age row sum to 100).
age_readmit = pd.crosstab(df['age'], df['readmitted_binary'], normalize='index') * 100
age_readmit.plot(kind='bar', ax=axes[0], color=['#2ecc71', '#e74c3c'])
axes[0].set_title('Readmission Rate by Age Group', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Age Group')
axes[0].set_ylabel('Percentage (%)')
# Legend entries follow the crosstab column order (0, then 1)
axes[0].legend(['Not Readmitted', 'Readmitted <30 days'])
# Rotate only the x tick labels; without axis='x' tick_params applies to both
# axes and the y-axis numbers are tilted as well.
axes[0].tick_params(axis='x', labelrotation=45)

# Right panel: number of rows per age group, in label order
df['age'].value_counts().sort_index().plot(kind='bar', ax=axes[1], color='skyblue')
axes[1].set_title('Patient Count by Age Group', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Age Group')
axes[1].set_ylabel('Count')
axes[1].tick_params(axis='x', labelrotation=45)

plt.tight_layout()
plt.show()

In [None]:
# 2. Number of Medications vs Readmission
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Box plot of medication counts for each target class
df.boxplot(column='num_medications', by='readmitted_binary', ax=axes[0])
# With `by=`, pandas adds a figure-level "Boxplot grouped by ..." suptitle.
# Clear it: this figure also holds a histogram, so that heading would
# mislabel the right-hand panel.
fig.suptitle('')
axes[0].set_title('Medications by Readmission Status', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Readmitted (<30 days)')
axes[0].set_ylabel('Number of Medications')
plt.sca(axes[0])
plt.xticks([1, 2], ['No', 'Yes'])  # relabel the two boxes (classes 0 and 1)

# Histogram: compute the 30 bin edges once over the whole column so both
# classes are binned identically and the overlaid bars line up.
medication_bins = np.histogram_bin_edges(df['num_medications'].dropna(), bins=30)
df.loc[df['readmitted_binary'] == 0, 'num_medications'].hist(
    bins=medication_bins, alpha=0.5, label='Not Readmitted', ax=axes[1], color='green')
df.loc[df['readmitted_binary'] == 1, 'num_medications'].hist(
    bins=medication_bins, alpha=0.5, label='Readmitted <30', ax=axes[1], color='red')
axes[1].set_title('Distribution of Medications', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Number of Medications')
axes[1].set_ylabel('Frequency')
axes[1].legend()

plt.tight_layout()
plt.show()

In [None]:
# 3. Length of Stay vs Readmission
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Box plot of days in hospital for each target class
df.boxplot(column='time_in_hospital', by='readmitted_binary', ax=axes[0])
# With `by=`, pandas adds a figure-level "Boxplot grouped by ..." suptitle.
# Clear it: this figure also holds a histogram, so that heading would
# mislabel the right-hand panel.
fig.suptitle('')
axes[0].set_title('Length of Stay by Readmission Status', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Readmitted (<30 days)')
axes[0].set_ylabel('Days in Hospital')
plt.sca(axes[0])
plt.xticks([1, 2], ['No', 'Yes'])  # relabel the two boxes (classes 0 and 1)

# Histogram: compute the 14 bin edges once over the whole column so both
# classes are binned identically and the overlaid bars line up.
stay_bins = np.histogram_bin_edges(df['time_in_hospital'].dropna(), bins=14)
df.loc[df['readmitted_binary'] == 0, 'time_in_hospital'].hist(
    bins=stay_bins, alpha=0.5, label='Not Readmitted', ax=axes[1], color='green')
df.loc[df['readmitted_binary'] == 1, 'time_in_hospital'].hist(
    bins=stay_bins, alpha=0.5, label='Readmitted <30', ax=axes[1], color='red')
axes[1].set_title('Distribution of Hospital Stay Duration', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Days in Hospital')
axes[1].set_ylabel('Frequency')
axes[1].legend()

plt.tight_layout()
plt.show()

In [None]:
# 4. Previous visits vs Readmission
# Three grouped box plots, one per prior-visit counter, split by target class
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

visit_panels = [
    ('number_outpatient', 'Outpatient Visits'),
    ('number_emergency', 'Emergency Visits'),
    ('number_inpatient', 'Previous Inpatient Visits'),
]

# The panels differ only in the plotted column and the title
for ax, (column, title) in zip(axes, visit_panels):
    df.boxplot(column=column, by='readmitted_binary', ax=ax)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Readmitted')
    ax.set_ylabel('Number of Visits')

plt.tight_layout()
plt.show()

### Step 9: Correlation Analysis

In [None]:
print("\n" + "=" * 80, "CORRELATION ANALYSIS", "=" * 80, sep="\n")

# Numeric columns to correlate; the binary target is included so its row
# and column appear in the matrix.
numerical_features = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
    'readmitted_binary',
]

# Pairwise correlations (DataFrame.corr() with its default method)
correlation_matrix = df[numerical_features].corr()

# Annotated heatmap with the colour scale centred on zero
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Target column of the matrix, strongest positive correlation first
# (the target's correlation with itself, 1.0, is part of the listing).
print("\nüîç Correlation with Readmission:")
readmit_corr = correlation_matrix['readmitted_binary'].sort_values(ascending=False)
print(readmit_corr)

---
## 📊 PHASE 1-3 SUMMARY

In [None]:
# Recap of Phases 1-3: completed steps, headline figures and what comes next
print("\n" + "="*80)
print("PHASE 1-3 COMPLETED SUMMARY")
print("="*80)

print("\n‚úÖ COMPLETED STEPS:")
print("   Step 1: ‚úì Dataset downloaded")
print("   Step 2: ‚úì Data loaded and inspected")
print("   Step 3: ‚úì Target variable analyzed and converted to binary")
print("   Step 4: ‚úì Missing values identified")
print("   Step 5: ‚úì Duplicates checked")
print("   Step 6: ‚úì Data types analyzed")
print("   Step 7: ‚úì Descriptive statistics generated")
print("   Step 8: ‚úì Visualizations created")
print("   Step 9: ‚úì Correlations analyzed")

# Recompute the imbalance ratio from the current frame so it matches the
# readmission rate printed beside it (rows may have been removed since the
# ratio was first calculated). .get() tolerates a missing class, and the
# ratio falls back to infinity when there are no positive rows.
target_counts = df['readmitted_binary'].value_counts()
n_negative = int(target_counts.get(0, 0))
n_positive = int(target_counts.get(1, 0))
imbalance_ratio = n_negative / n_positive if n_positive else float('inf')

print("\nüìä KEY FINDINGS:")
# NOTE(review): the frame has both `encounter_id` and `patient_nbr`, so rows
# are assumed to be encounters and patients may repeat - confirm. The row
# count is therefore reported as encounters, with distinct patients counted
# separately.
print(f"   ‚Ä¢ Total encounters: {df.shape[0]:,}")
print(f"   ‚Ä¢ Unique patients: {df['patient_nbr'].nunique():,}")
print(f"   ‚Ä¢ Total features: {df.shape[1]}")
print(f"   ‚Ä¢ Readmission rate (<30 days): {df['readmitted_binary'].mean() * 100:.2f}%")
print(f"   ‚Ä¢ Class imbalance ratio: {imbalance_ratio:.2f}:1")

print("\nüéØ NEXT STEPS (Phase 4):")
print("   Step 10: Feature engineering (create comorbidity count, age groups, etc.)")
print("   Step 11: Encode diagnosis codes")
print("   Step 12: Create medication-related features")

print("\n" + "="*80)
print("Ready to proceed to Phase 4: Feature Engineering!")
print("="*80)

---
## 💾 SAVE CHECKPOINT

In [None]:
# Save the current dataframe for next phase. index=False keeps the row index
# out of the CSV; the file still contains the ID columns and both the
# original and the binary target column.
# NOTE(review): the output directory is absolute and environment-specific.
df.to_csv('/home/claude/hospital_readmission_phase1-3.csv', index=False)
print("\nüíæ Progress saved: hospital_readmission_phase1-3.csv")
print("\n‚úÖ Ready for Phase 4: Feature Engineering!")