In [None]:
# This dataset is specifically tailored for graph analysis of diseases.
# It provides a multi-layered relational structure linking patients, diseases, families, and symptoms.
# Through well-defined statistical distributions and scalable size, it forms a rich ground for exploring
# disease networks, symptom communities, and patient similarity clusters, unlocking insights into complex
# healthcare patterns that are not easily discerned from flat, tabular data alone.
# This notebook uses the appointments table to forecast the daily number of completed visits.
# Dataset link: https://www.kaggle.com/datasets/ankurnapa/hospital-management-dataset?utm_source=chatgpt.com

#Group Members

#N Ncobela 22423967
#LN Buthelezi 22422133
#SS Sitole 2242901 

# 📌 1. Imports
import zipfile
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from datetime import datetime, timedelta

In [None]:
# 📌 2. Function to load CSVs from a zip
def read_csv_from_zip(zpath, filename):
    """Load a single CSV member of a zip archive into a DataFrame.

    Parameters
    ----------
    zpath : str or path-like
        Location of the zip archive on disk.
    filename : str
        Name of the CSV member inside the archive.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV. If anything goes wrong (archive missing, member
        missing, parse error, ...) the error is printed and an empty
        DataFrame is returned instead of raising.
    """
    try:
        # Open the archive and the member together; both are closed on exit.
        with zipfile.ZipFile(zpath) as archive, archive.open(filename) as member:
            frame = pd.read_csv(member)
    except Exception as err:
        # Best-effort loader: report the problem and hand back an empty frame.
        print(f"❌ Error reading {filename}: {err}")
        frame = pd.DataFrame()
    return frame

In [None]:
# 📌 3. Function to generate sample data
def generate_sample_data(seed=None):
    """Build synthetic appointments, doctors and patients tables.

    Parameters
    ----------
    seed : int or None, default None
        When given, all random draws come from a private
        ``numpy.random.RandomState(seed)`` so the same seed always yields the
        same tables and the global NumPy random state is left untouched.
        When ``None`` the global ``numpy.random`` state is used, so a prior
        ``np.random.seed(...)`` call still controls the output.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(appointments, doctors, patients)``:

        * appointments - 1000 rows: ``appointment_id``, ``patient_id`` (1-200),
          ``doctor_id`` (1-10), ``appointment_date`` (a day in 2023) and
          ``status`` (70% 'Completed', 10% each of the other three values).
        * doctors - 10 rows: ``doctor_id``, ``name``, ``specialty``.
        * patients - 200 rows: ``patient_id``, ``name``, ``age`` (18-79),
          ``gender``.
    """
    # np.random and RandomState expose the same randint/choice API, so one
    # handle covers both the seeded and the global-state case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')

    appointments_data = {
        'appointment_id': range(1, 1001),
        'patient_id': rng.randint(1, 201, 1000),
        'doctor_id': rng.randint(1, 11, 1000),
        'appointment_date': rng.choice(dates, 1000),
        'status': rng.choice(
            ['Completed', 'Cancelled', 'No Show', 'Scheduled'],
            1000, p=[0.7, 0.1, 0.1, 0.1]
        )
    }
    appointments = pd.DataFrame(appointments_data)

    doctors_data = {
        'doctor_id': range(1, 11),
        'name': [f'Dr. {name}' for name in
                 ['Smith','Johnson','Williams','Brown','Jones',
                  'Garcia','Miller','Davis','Rodriguez','Martinez']],
        'specialty': rng.choice(
            ['Cardiology','Pediatrics','Orthopedics','Neurology','Dermatology'], 10)
    }
    doctors = pd.DataFrame(doctors_data)

    patients_data = {
        'patient_id': range(1, 201),
        'name': [f'Patient {i}' for i in range(1, 201)],
        'age': rng.randint(18, 80, 200),
        'gender': rng.choice(['Male','Female'], 200)
    }
    patients = pd.DataFrame(patients_data)

    return appointments, doctors, patients

In [None]:
# 📌 4. Load data 
ZIP_PATH = "archive (3).zip"
SAMPLE_SEED = 42  # fixed seed so the synthetic fallback is identical on every run

# Decide where the data comes from: the Kaggle zip when present, otherwise
# synthetic sample data.
use_sample_data = not os.path.exists(ZIP_PATH)

if use_sample_data:
    print(f"⚠️ Zip file not found at {ZIP_PATH}. Using sample data.")
else:
    appointments = read_csv_from_zip(ZIP_PATH, "appointments.csv")
    doctors = read_csv_from_zip(ZIP_PATH, "doctors.csv")
    patients = read_csv_from_zip(ZIP_PATH, "patients.csv")
    # read_csv_from_zip returns an empty frame on failure; without appointments
    # the rest of the notebook cannot run, so fall back instead of failing later
    # with an unrelated KeyError.
    if appointments.empty:
        print(f"⚠️ appointments.csv could not be loaded from {ZIP_PATH}. Using sample data.")
        use_sample_data = True

if use_sample_data:
    # Seed the global NumPy state so Restart & Run All reproduces the same data.
    np.random.seed(SAMPLE_SEED)
    appointments, doctors, patients = generate_sample_data()

appointments.head()

In [None]:
# 📌 5. Preprocessing
# Parse the appointment date (unparseable values become NaT) and drop any
# time-of-day component so every appointment on the same calendar day falls
# into one group and lines up with the daily index built below.
appointments['appointment_date'] = (
    pd.to_datetime(appointments['appointment_date'], errors='coerce').dt.normalize()
)
# 1 for completed appointments (case-insensitive match on status), else 0.
appointments['is_completed'] = (
    appointments['status'].astype(str).str.lower().eq('completed').astype(int)
)

# Daily demand = number of completed appointments per day.
daily = (appointments.groupby('appointment_date')['is_completed']
         .sum()
         .rename('demand')
         .reset_index()
         .rename(columns={'appointment_date':'date'}))

if daily.empty:
    raise ValueError("No appointments with a valid appointment_date; cannot build the daily series.")

# Make the series contiguous: days without any appointment get demand 0.
date_index = pd.date_range(daily['date'].min(), daily['date'].max(), freq='D')
daily = daily.set_index('date').reindex(date_index).fillna(0.0).rename_axis('date').reset_index()

daily.head()

In [None]:
# 📌 6. Plot daily visits
# Line chart of the full daily series of completed visits.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(daily['date'], daily['demand'], marker='o', markersize=2)
ax.set_title("Daily Completed Visits")
ax.set_xlabel("Date")
ax.set_ylabel("Completed Visits")
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()

In [None]:
# 📌 7. Feature engineering
# Calendar features taken from each row's date.
daily['year'] = daily['date'].dt.year
daily['month'] = daily['date'].dt.month
daily['day'] = daily['date'].dt.day
daily['dow'] = daily['date'].dt.dayofweek
daily['is_month_start'] = daily['date'].dt.is_month_start.astype(int)
daily['is_month_end'] = daily['date'].dt.is_month_end.astype(int)

# Lag features: demand observed 1, 7 and 14 days before the row's date.
for lag in [1, 7, 14]:
    daily[f'lag_{lag}'] = daily['demand'].shift(lag)

# Trailing means over the 7 / 14 days *before* the row's date. Shifting by one
# day first keeps the row's own demand (the prediction target) out of the
# window; rolling directly over `demand` would leak the target into a feature.
prior_demand = daily['demand'].shift(1)
daily['roll7_mean'] = prior_demand.rolling(7).mean()
daily['roll14_mean'] = prior_demand.rolling(14).mean()

# Rows without a full 14-day history have NaN features and are dropped.
daily_model = daily.dropna().reset_index(drop=True)

feature_cols = ['year','month','day','dow','is_month_start','is_month_end',
                'lag_1','lag_7','lag_14','roll7_mean','roll14_mean']

X = daily_model[feature_cols]
y = daily_model['demand']

In [None]:
# 📌 8. Train/test split
# Chronological 80/20 split: the earliest 80% of rows train the model and the
# most recent 20% are held out (no shuffling, so time order is preserved).
split_idx = int(len(daily_model) * 0.8)
train_rows, test_rows = slice(None, split_idx), slice(split_idx, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]
dates_test = daily_model['date'].iloc[test_rows]

In [None]:
# 📌 9. Train RandomForest
# Fit a 500-tree random forest (fixed random_state for repeatable results)
# and score it on the held-out rows.
model = RandomForestRegressor(n_estimators=500, random_state=42).fit(X_train, y_train)
preds = model.predict(X_test)

# Mean absolute error and root mean squared error on the test period.
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)

print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}")

In [None]:
# 📌 10. Plot Actual data vs Predicted data
# Overlay held-out actuals and model predictions on the test-period dates.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(dates_test, y_test.values, label="Actual", marker='o', markersize=3)
ax.plot(dates_test, preds, label="Predicted", linestyle='--')
ax.set_title("Actual vs Predicted Visits")
ax.set_xlabel("Date")
ax.set_ylabel("Completed Visits")
ax.legend()
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()

In [None]:
# 📌 11. Forecast future days
def forecast_future_days(daily_full_df, model, horizon=14):
    """Recursively forecast daily demand for the days after the last known date.

    Each future day is predicted from calendar features plus lag / trailing-mean
    features of the demand series; the prediction is then appended to the series
    so that later days can use it as history.

    Parameters
    ----------
    daily_full_df : pandas.DataFrame
        Must contain a datetime ``date`` column and a numeric ``demand``
        column. Any other columns are ignored. The frame is not modified.
    model : object
        Fitted estimator exposing ``predict(X)``; it receives a one-row
        DataFrame with the columns year, month, day, dow, is_month_start,
        is_month_end, lag_1, lag_7, lag_14, roll7_mean, roll14_mean.
    horizon : int, default 14
        Number of days to forecast. A non-positive horizon yields an empty
        result.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` and ``predicted_demand``, one row per forecast day.

    Raises
    ------
    ValueError
        If ``daily_full_df`` has no rows.

    Notes
    -----
    The trailing means cover the 7 / 14 days *before* the forecast date. The
    forecast date itself has no demand yet, so including it in the window would
    make both means NaN. If fewer than 14 days of history exist (or days are
    missing from the index) the affected features are NaN.
    """
    feature_cols = ['year','month','day','dow','is_month_start','is_month_end',
                    'lag_1','lag_7','lag_14','roll7_mean','roll14_mean']

    if len(daily_full_df) == 0:
        raise ValueError("daily_full_df is empty; there is no history to forecast from.")

    # Only the demand series is needed; this selection does not touch the input.
    demand = daily_full_df.set_index('date')['demand'].sort_index().astype(float)
    last_date = demand.index.max()

    future_idx = pd.date_range(last_date + pd.Timedelta(days=1),
                               last_date + pd.Timedelta(days=horizon),
                               freq='D')

    preds = []
    for next_date in future_idx:
        # The 14 calendar days immediately before next_date, oldest first.
        # Reindexing leaves NaN for any day that is missing from the history.
        window = demand.reindex(
            pd.date_range(next_date - pd.Timedelta(days=14), periods=14, freq='D'))

        features = {
            'year': next_date.year,
            'month': next_date.month,
            'day': next_date.day,
            'dow': next_date.dayofweek,
            'is_month_start': int(next_date.is_month_start),
            'is_month_end': int(next_date.is_month_end),
            'lag_1': window.iloc[-1],
            'lag_7': window.iloc[-7],
            'lag_14': window.iloc[0],
            # skipna=False: a gap in the window yields NaN rather than a mean
            # over fewer days, matching rolling(...).mean() with a full window.
            'roll7_mean': window.iloc[-7:].mean(skipna=False),
            'roll14_mean': window.mean(skipna=False),
        }
        # A named one-row frame keeps the feature names the model was fitted with.
        feat_row = pd.DataFrame([features], columns=feature_cols)

        pred = model.predict(feat_row)[0]
        preds.append((next_date, pred))
        # Feed the prediction back so the next step can use it as history.
        demand.loc[next_date] = pred

    return pd.DataFrame(preds, columns=['date','predicted_demand'])

In [None]:
# 📌 12. Forecast next 14 days
# Forecast the 14 days following the last date in `daily` with the fitted model.
future_preds = forecast_future_days(daily_full_df=daily, model=model, horizon=14)
future_preds

In [None]:
# 📌 13. Plot forecast
# Historical daily series followed by the forecast for the coming days.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily['date'], daily['demand'], label="Historical")
ax.plot(future_preds['date'], future_preds['predicted_demand'],
        label="Forecast", marker='o', linestyle='--')
ax.set_title("Forecast of Next 14 Days of Completed Visits")
ax.set_xlabel("Date")
ax.set_ylabel("Completed Visits")
ax.legend()
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()