## Imports

In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import os

from warnings import simplefilter, filterwarnings

filterwarnings("ignore", category=FutureWarning)
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning)

## Parameters

In [88]:
features = ['simple_sugars', 'complex_sugars', 'fats', 'dietary_fibers', 'proteins', 'fast_insulin', 'slow_insulin']
meal_features = ['simple_sugars', 'complex_sugars', 'fats', 'dietary_fibers', 'proteins']
insulin_features = ['fast_insulin', 'slow_insulin']
features_to_remove = ['glucose_next', 'datetime', 'patient', 'hour']

patients = ['001', '002', '004', '006', '007', '008']
approaches = ['nollm', 'pixtral-large-latest']
prediction_horizons = [6, 12]

## Functions

In [92]:
def get_projected_value(window, prediction_horizon):
    x = np.arange(len(window))
    coeffs = np.polyfit(x, window, deg=3)
    poly = np.poly1d(coeffs)
    projected_value = poly(len(window) + prediction_horizon)
    return projected_value

def get_data(patient, prediction_horizon):
    glucose_data = pd.read_csv(f"diabetes_subset_pictures-glucose-food-insulin/{patient}/glucose.csv")
    insulin_data = pd.read_csv(f"diabetes_subset_pictures-glucose-food-insulin/{patient}/insulin.csv")
    food_data = pd.read_csv(f"food_data/pixtral-large-latest/{patient}.csv")

    glucose_data["datetime"] = pd.to_datetime(glucose_data["date"] + ' ' + glucose_data["time"])
    glucose_data.drop(['type', 'comments', 'date', 'time'], axis=1, inplace=True)
    glucose_data['glucose'] *= 18.0182  

    insulin_data["datetime"] = pd.to_datetime(insulin_data["date"] + ' ' + insulin_data["time"])
    insulin_data.drop(['comment', 'date', 'time'], axis=1, inplace=True)

    food_data['datetime'] = pd.to_datetime(food_data['datetime'], format='%Y:%m:%d %H:%M:%S')
    food_data = food_data[['datetime', 'simple_sugars', 'complex_sugars', 'proteins', 'fats', 'dietary_fibers']]

    combined_data = pd.concat([food_data, insulin_data]).sort_values('datetime').reset_index(drop=True)
    combined_data.fillna(0, inplace=True)
    glucose_data['hour'] = glucose_data['datetime'].dt.hour

    glucose_data['glucose_next'] = glucose_data['glucose'] - glucose_data['glucose'].shift(-prediction_horizon)

    glucose_data['glucose_change'] = glucose_data['glucose'] - glucose_data['glucose'].shift(1)
    glucose_data[f'glucose_change_sh_3'] = glucose_data['glucose_change'].shift(3)

    for window in [2, 3, 6]:
        glucose_data[f'glucose_change_std_{window}'] = glucose_data['glucose_change'].rolling(window=window).std()
    
    delta = glucose_data['glucose'].diff(1)
    gain = delta.where(delta > 0, 0).rolling(window=6).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=6).mean()
    glucose_data['glucose_rsi'] = 100 - (100 / (1 + gain / loss))

    glucose_data['glucose_change_projected'] = glucose_data['glucose_change'].rolling(
        window=6, min_periods=6
    ).apply(lambda window: get_projected_value(window, prediction_horizon))
    glucose_data['glucose_projected'] = glucose_data['glucose'].rolling(
        window=6, min_periods=6
    ).apply(lambda window: get_projected_value(window, prediction_horizon))
    glucose_data.dropna(subset=['glucose_next'], inplace=True)
    return glucose_data, combined_data

def add_features(params, features, preprocessed_data, prediction_horizon):
    patients_glucose_data = []
    for patient in preprocessed_data.keys():
        glucose_data, combined_data = preprocessed_data[patient]
        
        glucose_times = glucose_data['datetime'].values.astype('datetime64[s]').astype(np.int64)
        combined_times = combined_data['datetime'].values.astype('datetime64[s]').astype(np.int64)
        for feature in features:
            metabolism_rate, peak_time = params[feature]
            time_diff_hours = ((glucose_times[:, None] - combined_times[None, :]) / 3600)
            weights = np.zeros_like(time_diff_hours)
            increase_mask = (time_diff_hours >= 0) & (time_diff_hours < peak_time)
            weights[increase_mask] = time_diff_hours[increase_mask] / peak_time
            decrease_mask = time_diff_hours >= peak_time
            weights[decrease_mask] = 1 - ((time_diff_hours[decrease_mask] - peak_time) * metabolism_rate)
            weights = np.clip(weights, 0, None)
            glucose_data[feature] = np.dot(weights, combined_data.loc[:, feature].values)
            glucose_data[feature] = glucose_data[feature] - glucose_data[feature].shift(-prediction_horizon) + glucose_data['glucose_change']

        glucose_data['patient'] = patient

        patients_glucose_data.append(glucose_data)
    patients_glucose_data = pd.concat(patients_glucose_data)
    patients_glucose_data.dropna(inplace=True)
    return patients_glucose_data

def train_and_evaluate_by_hour(patient, hour, model, train, test, features_to_remove, callbacks, train_size, patient_weight, prior_hour_weight):
    """Train model for specific hour and evaluate on test data"""
    weights = np.ones(len(train))
    patient_mask = train['patient'] == patient
    prior_hour_mask = train['hour'] < hour
    weights[patient_mask] = patient_weight
    weights[patient_mask & prior_hour_mask] = prior_hour_weight
    X_test = test.drop(features_to_remove, axis=1)
    X_train, X_val, y_train, y_val, weights_train, weights_val = train_test_split(
        train.drop(features_to_remove, axis=1), train['glucose_next'], weights, train_size=train_size, random_state=42
    )
    model.fit(X_train, y_train,
              sample_weight=weights_train,
              eval_set=[(X_val, y_val)], 
              eval_sample_weight=[weights_val],
              callbacks=callbacks)
    
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(test['glucose_next'], preds))
    return rmse, preds

## Settings

In [93]:
train_size = 0.95

lgb_params = {
    'max_depth': 3,
    'n_estimators': 1000, 
    'learning_rate': 0.05,
    'objective': 'regression',
    'random_state': 1,
    'deterministic': True,
    'verbosity': -1
}

model = lgb.LGBMRegressor(**lgb_params)
callbacks = [lgb.early_stopping(stopping_rounds=20, verbose=False)]  

feature_params = {
    'simple_sugars': [0.4, 0.25], 
    'complex_sugars': [0.4, 0.5], 
    'proteins': [0.2, 4], 
    'fats': [0.1, 3], 
    'dietary_fibers': [0.05, 3.5], 
    'fast_insulin': [0.5, 0.5], 
    'slow_insulin': [0.5, 1]
}

# Results

In [94]:
df = pd.DataFrame(columns=['Approach', 'Prediction Horizon', 'Patient', 'Day', 'Hour', 'RMSE']) 

for approach in approaches:
    for prediction_horizon in prediction_horizons:
        data = {patient: get_data(patient, prediction_horizon) for patient in patients}
        processed_data = add_features(feature_params, features, data, prediction_horizon)
        if approach == 'nollm':
            processed_data.drop(meal_features, axis=1, inplace=True)
        
        for patient in patients:
            patient_mask = processed_data['patient'] == patient
            all_test = processed_data[patient_mask]
            other_patients = processed_data[~patient_mask]
            all_preds = []
            all_test_data = []
            
            days = all_test['datetime'].dt.day.unique()
            for day in days:
                day_mask = all_test['datetime'].dt.day == day
                test_day = all_test[day_mask]
                other_days = all_test[~day_mask]
                
                hours = test_day['hour'].unique()
                
                for hour in hours:
                    hour_model = lgb.LGBMRegressor(**lgb_params)
                    hour_mask = test_day['hour'] == hour
                    test = test_day[hour_mask]
                    prior_hours_mask = (test_day['hour'] < hour - 1) & (test_day['datetime'].dt.day == day)
                    prior_hours = test_day[prior_hours_mask]
                    train = pd.concat([prior_hours, other_days, other_patients])
                    hour_rmse, hour_preds = train_and_evaluate_by_hour(
                        patient, hour, hour_model, train, test, 
                        features_to_remove, callbacks, train_size=0.95, patient_weight=5, prior_hour_weight=10
                    )
                    all_preds.append(hour_preds)
                    all_test_data.append(test)
                    
                    new_entry = pd.DataFrame({
                        'Approach': [approach],
                        'Prediction Horizon': [prediction_horizon],
                        'Patient': [patient],
                        'Day': [day],
                        'Hour': [hour],
                        'RMSE': [hour_rmse]
                    })
                    df = pd.concat([df, new_entry], ignore_index=True)
            processed_data.to_csv(f'datasets/{approach}_{prediction_horizon}.csv', index=False)
            if all_preds and all_test_data:
                combined_preds = np.concatenate(all_preds)
                combined_test = pd.concat(all_test_data)
                predictions = pd.DataFrame({
                    'Predictions': combined_test['glucose'] - combined_preds, 
                    'Ground_truth': combined_test['glucose'] - combined_test['glucose_next'], 
                    'Datetime': combined_test['datetime']
                })
                os.makedirs(f'predictions/{approach}/{prediction_horizon}', exist_ok=True)
                predictions.to_csv(f'predictions/{approach}/{prediction_horizon}/{patient}_predictions.csv', index=False)

        selected_df = df[(df['Approach'] == approach) & (df['Prediction Horizon'] == prediction_horizon)]
        current_rmse = selected_df['RMSE'].mean()
        print(f"Average RMSE for {approach}, prediction horizon {prediction_horizon}: {current_rmse:.4f}")
df.to_csv('results.csv', index=False)

Average RMSE for nollm, prediction horizon 6: 16.2921
Average RMSE for nollm, prediction horizon 12: 32.9504
Average RMSE for pixtral-large-latest, prediction horizon 6: 16.0700
Average RMSE for pixtral-large-latest, prediction horizon 12: 31.9592


In [95]:
df.to_csv('results.csv', index=False)