In [1]:
# Step 1: Install requirements
!pip install shap dash jupyter-dash scikit-learn tensorflow pandas numpy matplotlib

Defaulting to user installation because normal site-packages is not writeable
Collecting shap
  Downloading shap-0.47.1-cp311-cp311-win_amd64.whl.metadata (25 kB)
Collecting dash
  Downloading dash-3.0.2-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl.metadata (3.6 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Collecting ansi2html (from jupyter-dash)
  Downloading ansi2html-1.9.2-py3-none-any.whl.metadata (3.7 kB)
Downloading shap-0.47.1-cp311-cp311-win_amd64.whl (489 kB)
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Downloading dash-3.0.2-py3-none-any.whl (7.9 MB)
   ---------------------------------------- 0.0/7.9 MB ? eta -:--:--
   ---------------------------------------

DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flask-babel 4.0.0 requires Jinja2>=3.1, but you have jinja2 3.0.3 which is incompatible.

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Step 2-3: Import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
import shap
import matplotlib.pyplot as plt
from jupyter_dash import JupyterDash
from dash import dcc, html
import plotly.graph_objs as go
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Step 4-8: Data loading and preprocessing
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Load and merge datasets
data_dir = '/content/drive/MyDrive/Practice_Level_Crosstab_Jan_24'
files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]
dfs = [pd.read_csv(os.path.join(data_dir, f)) for f in files]
merged = pd.concat(dfs)

# Feature engineering
merged['Date'] = pd.to_datetime(merged['APPOINTMENT_MONTH_START_DATE'], format='%d%b%Y')
merged['day_of_week'] = merged['Date'].dt.dayofweek
merged['is_weekend'] = merged['day_of_week'].isin([5,6]).astype(int)
features = ['COUNT_OF_APPOINTMENTS', 'day_of_week', 'is_weekend']
target = 'COUNT_OF_APPOINTMENTS'

# Handle missing data
merged = merged.fillna(method='ffill')

# Normalization
scaler = RobustScaler()
scaled_data = scaler.fit_transform(merged[features])

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Step 9-11: Sequence creation
def create_sliding_windows(data, window_size=14):
    X_seq, y = [], []
    for i in range(len(data) - window_size):
        X_seq.append(data[i:(i + window_size)])
        y.append(data[i + window_size, 0])
    X_flat = np.array(X_seq).reshape(len(X_seq), -1)
    return np.array(X_seq), X_flat, np.array(y)

X_seq, X_flat, y = create_sliding_windows(scaled_data)

In [None]:
# Step 12: Model definitions
def build_lstm(input_shape):
    model = tf.keras.Sequential([
        LSTM(64, return_sequences=True, input_shape=input_shape),
        LSTM(32),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

def build_rf():
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }
    return GridSearchCV(
        RandomForestRegressor(),
        param_grid,
        cv=TimeSeriesSplit(n_splits=3),
        scoring='neg_mean_squared_error'
    )

In [None]:
# Step 13-17: Training with validation
# LSTM Training
lstm_model = build_lstm((X_seq.shape[1], X_seq.shape[2]))
lstm_history = lstm_model.fit(
    X_seq, y,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[
        EarlyStopping(patience=10),
        ModelCheckpoint('best_lstm.h5', save_best_only=True)
    ]
)

# RF Training
rf_model = build_rf()
rf_model.fit(X_flat, y)
print(f"Best RF params: {rf_model.best_params_}")

In [None]:
# Step 18-19: Evaluation and SHAP
# LSTM Evaluation
lstm_preds = lstm_model.predict(X_seq)
print(f"LSTM MSE: {mean_squared_error(y, lstm_preds):.4f}")

# RF SHAP Analysis
explainer = shap.TreeExplainer(rf_model.best_estimator_)
shap_values = explainer.shap_values(X_flat)
shap.summary_plot(shap_values, X_flat, feature_names=features)

# Prediction Visualization
plt.figure(figsize=(12,6))
plt.plot(y[-100:], label='True')
plt.plot(lstm_preds[-100:], label='LSTM')
plt.plot(rf_model.predict(X_flat[-100:]), label='RF')
plt.legend()
plt.title('Final Predictions Comparison')
plt.show()

In [None]:
# Step 20: Dashboard
app = JupyterDash(__name__)

app.layout = html.Div([
    html.H1("Appointment Prediction Dashboard"),
    dcc.Graph(
        figure={
            'data': [
                go.Scatter(y=y, name='Actual'),
                go.Scatter(y=lstm_preds.flatten(), name='LSTM'),
                go.Scatter(y=rf_model.predict(X_flat), name='RF')
            ],
            'layout': go.Layout(title='Model Predictions')
        }
    ),
    html.Img(src='assets/shap_summary.png')
])

app.run_server(mode='inline')

In [None]:
# Step 21: Save models
lstm_model.save('/content/drive/MyDrive/models/lstm_model.h5')
import joblib
joblib.dump(rf_model, '/content/drive/MyDrive/models/rf_model.pkl')