In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import os

In [None]:
def create_sliding_windows(data, window_size=14):
    """Build supervised (window, next-value) pairs from a multivariate series.

    Each sample ``X[i]`` is the ``window_size`` consecutive rows starting at
    row ``i``; its label ``y[i]`` is column 0 of the row immediately after
    that window.

    Parameters
    ----------
    data : array-like with at least 2 dimensions, rows ordered in time
        Column 0 is used as the prediction target. Lists of rows are
        accepted and converted with ``np.asarray``.
    window_size : int, default 14
        Number of consecutive rows per input window. Must be >= 1.

    Returns
    -------
    X : np.ndarray, shape (n_windows, window_size, *data.shape[1:])
    y : np.ndarray, shape (n_windows, *data.shape[2:])
        ``n_windows = max(len(data) - window_size, 0)``. When there are not
        enough rows for a single window, both arrays are empty but keep the
        trailing dimensions so downstream shape handling does not break.

    Raises
    ------
    ValueError
        If ``data`` has fewer than 2 dimensions or ``window_size`` < 1.
    """
    data = np.asarray(data)
    if data.ndim < 2:
        raise ValueError(
            f"data must have at least 2 dimensions, got {data.ndim}"
        )
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    n_windows = len(data) - window_size
    if n_windows <= 0:
        # Not enough rows for even one window: return shape-consistent empties
        # rather than the 1-D (0,) arrays np.array([]) would produce.
        empty_X = np.empty((0, window_size) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + data.shape[2:], dtype=data.dtype)
        return empty_X, empty_y

    X = np.stack([data[start:start + window_size] for start in range(n_windows)])
    # The label for window i is column 0 of row i + window_size; copy so the
    # result does not alias the caller's array.
    y = data[window_size:, 0].copy()
    return X, y

In [None]:
# Load and merge the practice-level CSV extracts.
# `__file__` is not defined inside a Jupyter kernel, so fall back to the
# working directory there (normally the notebook's own directory).
# abspath() keeps dirname() from collapsing to '' when this code is run as a
# script launched with a bare relative filename.
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()
project_root = os.path.dirname(os.path.dirname(current_dir))
data_dir = os.path.join(project_root, 'Practice_Level_Crosstab_Jan_24')
# os.listdir() returns entries in arbitrary order; sort so the merged row
# order is reproducible across machines and runs.
files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))
if not files:
    raise FileNotFoundError(f"No CSV files found in {data_dir}")
dfs = [pd.read_csv(os.path.join(data_dir, f)) for f in files]
# Debug column names and find correct date column
print("First CSV columns:", dfs[0].columns.tolist())
# ignore_index gives the merged frame a fresh 0..n-1 index instead of
# repeating each file's own row numbers.
merged = pd.concat(dfs, ignore_index=True)

First CSV columns: ['APPOINTMENT_MONTH_START_DATE', 'GP_CODE', 'GP_NAME', 'SUPPLIER', 'PCN_CODE', 'PCN_NAME', 'SUB_ICB_LOCATION_CODE', 'SUB_ICB_LOCATION_NAME', 'HCP_TYPE', 'APPT_MODE', 'NATIONAL_CATEGORY', 'TIME_BETWEEN_BOOK_AND_APPT', 'COUNT_OF_APPOINTMENTS', 'APPT_STATUS']


In [None]:
# Order rows chronologically using the verified date column.
# The raw column holds strings in '%d%b%Y' form (e.g. '01JAN2024'), so a plain
# string sort would order months alphabetically (APR < AUG < DEC ...) rather
# than by time. Sort on the parsed datetime instead; mergesort is stable, so
# rows sharing a date keep their existing relative order.
merged = merged.sort_values(
    'APPOINTMENT_MONTH_START_DATE',
    key=lambda col: pd.to_datetime(col, format='%d%b%Y'),
    kind='mergesort',
)

In [None]:
# Parse the raw month-start strings (day, abbreviated month name, 4-digit
# year) into a proper datetime column named 'Date'.
raw_month_start = merged['APPOINTMENT_MONTH_START_DATE']
merged['Date'] = pd.to_datetime(raw_month_start, format='%d%b%Y')

In [None]:
# Calendar features derived from the parsed 'Date' column.
weekday_index = merged['Date'].dt.dayofweek  # pandas convention: 0 = Monday ... 6 = Sunday
merged['day_of_week'] = weekday_index
# Flag Saturday (5) and Sunday (6) as 1, every other day as 0.
merged['is_weekend'] = weekday_index.isin([5, 6]).astype(int)

In [None]:
# Choose the model inputs. The target is deliberately listed first because
# create_sliding_windows reads its labels from column 0 of the array.
target = 'COUNT_OF_APPOINTMENTS'
features = [target, 'day_of_week', 'is_weekend']
data = merged[features].to_numpy()

In [None]:
# Normalize.
# Fit the scaler on the leading (training) portion only, then apply it to the
# whole series: fitting on every row would leak test-period statistics into
# the training data.
# NOTE(review): 0.8 mirrors test_size=0.2 in the chronological
# train_test_split further down; keep the two in sync.
train_fraction = 0.8
n_fit_rows = max(1, int(len(data) * train_fraction))
scaler = RobustScaler()
scaler.fit(data[:n_fit_rows])
scaled_data = scaler.transform(data)

In [None]:
# Turn the scaled series into (window, next-value) samples, then hold out the
# final 20% of samples for testing. shuffle=False keeps the split in row
# order so the test samples are the last ones.
X, y = create_sliding_windows(scaled_data)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# Persist the four split arrays as .npy files, creating the output folder
# first if it does not exist yet.
output_dir = os.path.join(current_dir, '../../data')
os.makedirs(output_dir, exist_ok=True)
for filename, array in (
    ('X_train.npy', X_train),
    ('X_test.npy', X_test),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(os.path.join(output_dir, filename), array)