# Healthcare Appointment No-Show Prediction

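This notebook performs exploratory data analysis (EDA) and builds a decision-tree classifier to predict patient no-shows for healthcare appointments. The data used here is a synthetic sample generated in Section 2 for demonstration purposes.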

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import warnings
# Notebook-wide setup: ignore every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Print all DataFrame columns and allow wide rows so previews are not truncated
pd.options.display.max_columns = None
pd.options.display.width = 1000

## 2. Load and Preprocess Data

In [None]:
# Build a reproducible synthetic appointments table for demonstration purposes.
np.random.seed(42)
n_samples = 1000

# NOTE: every draw below consumes the seeded global NumPy RNG in sequence,
# so the order of these statements determines the generated values.
ages = np.random.randint(0, 100, n_samples)
genders = np.random.choice(['M', 'F'], n_samples)

# One booking per calendar day; each appointment falls 1-29 days after its booking
scheduled_dates = pd.date_range(start='2022-01-01', periods=n_samples, freq='D')
lead_offsets = pd.to_timedelta(np.random.randint(1, 30, n_samples), unit='D')
appointment_dates = scheduled_dates + lead_offsets

sms_received = np.random.choice([0, 1], n_samples, p=[0.3, 0.7])
neighborhoods = np.random.choice(
    ['Downtown', 'Uptown', 'Midtown', 'Suburb', 'Rural'],
    n_samples,
)
no_show = np.random.choice(['No', 'Yes'], n_samples, p=[0.8, 0.2])  # 20% no-show rate

# Health-condition flags (Handicap is a 0-4 level rather than a binary flag)
hypertension = np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
diabetes = np.random.choice([0, 1], n_samples, p=[0.8, 0.2])
alcoholism = np.random.choice([0, 1], n_samples, p=[0.9, 0.1])
handicap = np.random.choice([0, 1, 2, 3, 4], n_samples, p=[0.8, 0.1, 0.05, 0.03, 0.02])

# Assemble the table; dict insertion order fixes the column order
columns = {
    'PatientID': range(1, n_samples + 1),
    'Age': ages,
    'Gender': genders,
    'ScheduledDay': scheduled_dates,
    'AppointmentDay': appointment_dates,
    'SMS_received': sms_received,
    'Neighbourhood': neighborhoods,
    'Hypertension': hypertension,
    'Diabetes': diabetes,
    'Alcoholism': alcoholism,
    'Handicap': handicap,
    'No-show': no_show,
}
df = pd.DataFrame(columns)

# Preview the first rows
df.head()

### Data Cleaning and Feature Engineering

In [None]:
# Clean the appointments table in place and derive the features used later:
# LeadTime (days between booking and appointment) and AppointmentDayOfWeek.

# Convert date columns to datetime
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'])

# Calculate lead time (days between scheduling and appointment)
df['LeadTime'] = (df['AppointmentDay'] - df['ScheduledDay']).dt.days

# Extract day of week
df['AppointmentDayOfWeek'] = df['AppointmentDay'].dt.day_name()

# Convert target variable to binary.
# The mapping also accepts already-encoded 0/1 values so that re-running this
# cell is harmless; with only the 'No'/'Yes' keys a second run would turn the
# whole column into NaN and the dropna() below would then remove every row.
df['No-show'] = df['No-show'].map({'No': 0, 'Yes': 1, 0: 0, 1: 1})

# Handle missing values: median for Age, most frequent value for the
# categorical columns, and "no SMS" when the reminder flag is absent
df = df.fillna({
    'Age': df['Age'].median(),
    'Gender': df['Gender'].mode()[0],
    'Neighbourhood': df['Neighbourhood'].mode()[0],
    'SMS_received': 0
})

# Remove any remaining rows with missing values
df = df.dropna()

# Display the processed data
df.head()

## 3. Exploratory Data Analysis

In [None]:
# Report the table dimensions, then the descriptive statistics
# (describe() is the cell's last expression, so the notebook renders it)
print(f"Dataset shape: {df.shape}")
print("\nBasic statistics:")
df.describe()


In [None]:
# Percentage of appointments per outcome; label 1 is reported as a no-show
outcome_share = df['No-show'].value_counts(normalize=True)
no_show_counts = outcome_share.mul(100)
pct_missed = no_show_counts[1]
pct_attended = no_show_counts[0]
print(f"No-show distribution: {pct_missed:.2f}% no-shows, {pct_attended:.2f}% shows")

# Raw counts per outcome as a bar chart
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='No-show')
plt.title('Distribution of No-shows')
plt.show()

### Age Distribution by No-show Status

In [None]:
# Compare the age distribution of attended vs. missed appointments
plt.figure(figsize=(10, 6))
sns.boxplot(x='No-show', y='Age', data=df)
plt.title('Age Distribution by No-show Status')
plt.show()

# Age groups analysis
# pd.cut builds right-closed intervals, so without include_lowest=True the
# first bin is (0, 18] and patients aged 0 would get a NaN group. The upper
# edge is open-ended so the '65+' label also covers ages above 100.
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[0, 18, 35, 50, 65, np.inf],
    labels=['0-18', '19-35', '36-50', '51-65', '65+'],
    include_lowest=True,
)
# Row-normalised crosstab: each age group's outcomes sum to 100%
age_no_show = pd.crosstab(df['AgeGroup'], df['No-show'], normalize='index') * 100

plt.figure(figsize=(10, 6))
age_no_show[1].plot(kind='bar')
plt.title('No-show Rate by Age Group')
plt.ylabel('No-show Rate (%)')
plt.show()

### Lead Time Analysis

In [None]:
# Distribution of booking-to-appointment lead time, split by outcome
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='LeadTime', hue='No-show', bins=30, kde=True)
plt.title('Lead Time Distribution by No-show Status')
plt.show()

# Lead time groups analysis
# pd.cut builds right-closed intervals, so include_lowest=True is needed to keep
# same-day appointments (LeadTime == 0) in the first bin instead of NaN. The
# last edge is open-ended so lead times above 100 days land in '30+ days'.
df['LeadTimeGroup'] = pd.cut(
    df['LeadTime'],
    bins=[0, 7, 14, 30, np.inf],
    labels=['0-7 days', '8-14 days', '15-30 days', '30+ days'],
    include_lowest=True,
)
# Row-normalised crosstab: each lead-time group's outcomes sum to 100%
lead_time_no_show = pd.crosstab(df['LeadTimeGroup'], df['No-show'], normalize='index') * 100

plt.figure(figsize=(10, 6))
lead_time_no_show[1].plot(kind='bar')
plt.title('No-show Rate by Lead Time Group')
plt.ylabel('No-show Rate (%)')
plt.show()

### Day of Week Analysis

In [None]:
# No-show rate (%) per weekday, arranged Monday-to-Sunday.
# reindex() leaves NaN rows for weekdays that have no appointments.
day_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
weekday_outcomes = pd.crosstab(df['AppointmentDayOfWeek'], df['No-show'], normalize='index')
day_no_show = weekday_outcomes.mul(100).reindex(day_order)

plt.figure(figsize=(12, 6))
day_no_show[1].plot.bar()
plt.title('No-show Rate by Day of Week')
plt.ylabel('No-show Rate (%)')
plt.show()

### SMS Received Analysis

In [None]:
# No-show rate (%) for appointments without (0) and with (1) an SMS reminder
sms_outcomes = pd.crosstab(df['SMS_received'], df['No-show'], normalize='index')
sms_no_show = sms_outcomes.mul(100)

plt.figure(figsize=(10, 6))
sms_no_show[1].plot.bar()
plt.title('No-show Rate by SMS Received')
plt.xlabel('SMS Received (1=Yes, 0=No)')
plt.ylabel('No-show Rate (%)')
plt.show()

### Health Conditions Analysis

In [None]:
# No-show rate (%) by health condition, one panel per condition.
# Each crosstab is row-normalised, so column 1 is the no-show share per flag value.
hypertension_no_show = pd.crosstab(df['Hypertension'], df['No-show'], normalize='index') * 100
diabetes_no_show = pd.crosstab(df['Diabetes'], df['No-show'], normalize='index') * 100
alcoholism_no_show = pd.crosstab(df['Alcoholism'], df['No-show'], normalize='index') * 100

panels = [
    ('Hypertension', hypertension_no_show),
    ('Diabetes', diabetes_no_show),
    ('Alcoholism', alcoholism_no_show),
]

# Three side-by-side panels, drawn left to right in the order listed above
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, (condition, rates) in zip(axes, panels):
    rates[1].plot(kind='bar', ax=ax)
    ax.set_title(f'No-show Rate by {condition}')
    ax.set_xlabel(f'{condition} (1=Yes, 0=No)')
    ax.set_ylabel('No-show Rate (%)')

plt.tight_layout()
plt.show()

## 4. Model Building

In [None]:
# Prepare the model inputs, the preprocessing step and the train/test split.

# Feature groups: categorical columns are one-hot encoded, numerical ones scaled
categorical_features = ['Gender', 'Neighbourhood', 'AppointmentDayOfWeek']
numerical_features = ['Age', 'LeadTime', 'SMS_received', 'Hypertension', 'Diabetes', 'Alcoholism', 'Handicap']

# Select the model inputs explicitly instead of dropping unwanted columns.
# Dropping 'AgeGroup'/'LeadTimeGroup' raises KeyError when the EDA cells that
# create them have not been run; an explicit selection has no such dependency
# and keeps helper columns added later out of the model.
X = df[numerical_features + categorical_features]
y = df['No-show']

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split data (70% train / 30% test). Stratify on the target so both parts keep
# the same no-show proportion; the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

### Decision Tree Model

In [None]:
# Fit preprocessing + decision tree as a single pipeline on the training split
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])
dt_pipeline.fit(X_train, y_train)

# Predict the held-out split
y_pred = dt_pipeline.predict(X_test)

# Score the predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the summary: headline metrics, confusion matrix, then the per-class report
print("Model Evaluation:")
headline_metrics = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
)
for metric_name, metric_value in headline_metrics:
    print(f"{metric_name}: {metric_value:.4f}")

print("\nConfusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Draw the confusion matrix as an annotated heatmap (y axis: actual, x axis: predicted)
outcome_labels = ['Show', 'No-show']

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=outcome_labels,
    yticklabels=outcome_labels,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

### Feature Importance

In [None]:
# Plot the ten most important features of the fitted decision tree.

# Get feature names after preprocessing, in the column order the
# ColumnTransformer emits: scaled numerical columns first, then one block of
# one-hot columns per categorical feature.
fitted_preprocessor = dt_pipeline.named_steps['preprocessor']
feature_names = []
for name, trans, cols in fitted_preprocessor.transformers_:
    if name == 'num':
        feature_names.extend(cols)
    elif name == 'cat':
        # categories_ holds one array per encoded column, so pair each column
        # with its own categories. Indexing categories_[0] for every column
        # would reuse the first column's categories, giving wrong labels and
        # too few names.
        for col, categories in zip(cols, trans.categories_):
            feature_names.extend(f"{col}_{cat}" for cat in categories)

# Get feature importances
importances = dt_pipeline.named_steps['classifier'].feature_importances_

# Fail loudly on a mismatch instead of plotting mislabelled bars
if len(feature_names) != len(importances):
    raise ValueError(
        f"Expected {len(importances)} feature names, built {len(feature_names)}"
    )

# Plot top 10 features (argsort is ascending, so the largest bar ends up on top)
plt.figure(figsize=(12, 8))
indices = np.argsort(importances)[-10:]
plt.barh(range(len(indices)), importances[indices])
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Feature Importance')
plt.title('Top 10 Feature Importances')
plt.show()

## 5. Optimization Suggestions

In [None]:
# Turn the observed no-show rates and the model accuracy into printed suggestions.

# Calculate no-show rates by different factors
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_no_show = pd.crosstab(df['AppointmentDayOfWeek'], df['No-show'], normalize='index') * 100
day_no_show = day_no_show.reindex(day_order)

# nlargest/nsmallest skip the NaN rows reindex() creates for weekdays without data
high_risk_days = day_no_show[1].nlargest(3).index.tolist()
low_risk_days = day_no_show[1].nsmallest(3).index.tolist()

# Overbooking margin: mean observed no-show rate (%) on the high-risk days.
# Model precision says how often a predicted no-show is correct; it is not an
# estimate of how many booked slots will go unused.
overbooking_rate = day_no_show.loc[high_risk_days, 1].mean()

# SMS effectiveness: no-show rate without a reminder minus the rate with one.
# Both groups must be present, otherwise the difference is undefined.
sms_no_show = pd.crosstab(df['SMS_received'], df['No-show'], normalize='index') * 100
sms_rates = sms_no_show[1]
if 0 in sms_rates.index and 1 in sms_rates.index:
    sms_effectiveness = sms_rates.loc[0] - sms_rates.loc[1]
else:
    sms_effectiveness = 0

# Lead time analysis
# include_lowest keeps same-day appointments (LeadTime == 0) in the first bin;
# the open-ended last edge keeps lead times above 100 days in '30+ days'.
lead_time_groups = pd.cut(
    df['LeadTime'],
    bins=[0, 7, 14, 30, np.inf],
    labels=['0-7 days', '8-14 days', '15-30 days', '30+ days'],
    include_lowest=True,
)
lead_time_no_show = pd.crosstab(lead_time_groups, df['No-show'], normalize='index') * 100

# Print suggestions
print("Optimization Suggestions:")
print("1. Schedule Optimization:")
print(f"   - High-risk days for no-shows: {', '.join(high_risk_days)}")
print(f"   - Consider overbooking by {overbooking_rate:.1f}% on these days")
print(f"   - Low-risk days: {', '.join(low_risk_days)}")

print("\n2. Patient Communication:")
# Only recommend reminders when they coincide with a lower no-show rate
if sms_effectiveness > 0:
    print(f"   - SMS reminders reduce no-show rate by approximately {sms_effectiveness:.1f}%")
    print("   - Implement automated SMS reminders for all appointments")
    print("   - Consider sending multiple reminders (3 days before and day before)")

print("\n3. Appointment Lead Time Management:")
print(f"   - Optimal appointment lead time: {lead_time_no_show[1].idxmin()} (lowest no-show rate)")
print("   - Try to schedule appointments within this timeframe when possible")

print("\n4. Targeted Interventions:")
print(f"   - Model accuracy: {accuracy*100:.1f}%")
print("   - Use the prediction model to identify high-risk patients")
print("   - Implement phone call confirmations for patients with >50% no-show probability")