## Imports

In [200]:
import os
from warnings import simplefilter, filterwarnings

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

filterwarnings("ignore", category=FutureWarning)
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning)

## Parameters

In [201]:
# Patient identifiers; get_data interpolates them into the CSV paths it reads.
patients = ['001', '002', '004', '006', '007', '008']

# Insulin event columns. The 'nollm' approach builds event features from
# these columns only.
insulin_features = ['fast_insulin', 'slow_insulin']

# Full event feature set: the food columns followed by the insulin columns.
features = [
    'simple_sugars',
    'complex_sugars',
    'fats',
    'dietary_fibers',
    'proteins',
    *insulin_features,
]

# Names of the evaluated approaches; also used in output file paths.
approaches = ['gpt4o', 'nollm']

# Look-ahead distances, counted in rows of the glucose series.
prediction_horizons = [6, 12]

# Not referenced by the visible code.
verbose = 0

## Functions

In [202]:
def get_projected_value(window, prediction_horizon):
    """Extrapolate a window of samples with a least-squares cubic fit.

    The samples are placed at positions ``0 .. len(window) - 1``, a degree-3
    polynomial is fitted through them, and that polynomial is evaluated at
    position ``len(window) + prediction_horizon``. Since the last sample sits
    at ``len(window) - 1``, the evaluation point lies
    ``prediction_horizon + 1`` positions past it.

    Args:
        window: Sequence of numeric samples (list, array or Series).
        prediction_horizon: Offset added to ``len(window)`` to obtain the
            evaluation position.

    Returns:
        The value of the fitted polynomial at the evaluation position.
    """
    n_samples = len(window)
    positions = np.arange(n_samples)
    cubic_coeffs = np.polyfit(positions, window, 3)
    target_position = n_samples + prediction_horizon
    return np.polyval(cubic_coeffs, target_position)

def get_data(patient, food_data_source, prediction_horizon):
    """Load one patient's CSV logs and derive the glucose feature columns.

    Reads ``glucose.csv`` and ``insulin.csv`` from
    ``diabetes_subset_pictures-glucose-food-insulin/<patient>/`` and the food
    log from ``food_data/<food_data_source>/<patient>.csv``.

    Args:
        patient: Patient identifier used in the file paths.
        food_data_source: Sub-folder of ``food_data`` to read the food log from.
        prediction_horizon: Number of glucose rows to look ahead; used for the
            ``glucose_next`` target and for the projected-value features.

    Returns:
        A ``(glucose_data, combined_data)`` tuple. ``glucose_data`` holds the
        glucose series with the derived columns and without the trailing rows
        that have no ``glucose_next``. ``combined_data`` holds the food and
        insulin events sorted by ``datetime`` with missing values set to 0.
    """
    glucose_data = pd.read_csv(f"diabetes_subset_pictures-glucose-food-insulin/{patient}/glucose.csv")
    insulin_data = pd.read_csv(f"diabetes_subset_pictures-glucose-food-insulin/{patient}/insulin.csv")
    food_data = pd.read_csv(f"food_data/{food_data_source}/{patient}.csv")

    # --- glucose log: one timestamp column, bookkeeping columns removed ---
    glucose_stamp = glucose_data["date"] + ' ' + glucose_data["time"]
    glucose_data["datetime"] = pd.to_datetime(glucose_stamp)
    glucose_data.drop(columns=['type', 'comments', 'date', 'time'], inplace=True)
    # Convert to mg/dL (presumably the CSV stores mmol/L -- confirm).
    glucose_data['glucose'] *= 18.0182

    # --- insulin log ---
    insulin_stamp = insulin_data["date"] + ' ' + insulin_data["time"]
    insulin_data["datetime"] = pd.to_datetime(insulin_stamp)
    insulin_data.drop(columns=['comment', 'date', 'time'], inplace=True)

    # --- food log: keep the timestamp and the nutrient columns only ---
    food_data['datetime'] = pd.to_datetime(food_data['datetime'], format='%Y:%m:%d %H:%M:%S')
    food_columns = ['datetime', 'simple_sugars', 'complex_sugars', 'proteins', 'fats', 'dietary_fibers']
    food_data = food_data[food_columns]

    # --- one event table; a column missing for an event row becomes 0 ---
    events = pd.concat([food_data, insulin_data])
    combined_data = events.sort_values('datetime').reset_index(drop=True)
    combined_data.fillna(0, inplace=True)

    # --- glucose features (column creation order is kept stable) ---
    glucose_data['hour'] = glucose_data['datetime'].dt.hour

    # Target: current glucose minus the glucose `prediction_horizon` rows ahead.
    level = glucose_data['glucose']
    glucose_data['glucose_next'] = level - level.shift(-prediction_horizon)

    # Row-to-row change and the same change as it was three rows earlier.
    glucose_data['glucose_change'] = level - level.shift(1)
    glucose_data['glucose_change_sh_3'] = glucose_data['glucose_change'].shift(3)

    # Rolling standard deviation of the change over short windows.
    for span in (2, 3, 6):
        rolling_change = glucose_data['glucose_change'].rolling(window=span)
        glucose_data[f'glucose_change_std_{span}'] = rolling_change.std()

    # RSI-style ratio of mean upward to mean downward moves over 6 rows.
    delta = glucose_data['glucose'].diff(1)
    upward = delta.where(delta > 0, 0)
    downward = delta.where(delta < 0, 0)
    gain = upward.rolling(window=6).mean()
    loss = -(downward.rolling(window=6).mean())
    relative_strength = gain / loss
    glucose_data['glucose_rsi'] = 100 - (100 / (1 + relative_strength))

    # Cubic extrapolation of the last 6 values of the change and of the level.
    def project(window):
        return get_projected_value(window, prediction_horizon)

    projections = (
        ('glucose_change_projected', 'glucose_change'),
        ('glucose_projected', 'glucose'),
    )
    for target_column, source_column in projections:
        full_windows = glucose_data[source_column].rolling(window=6, min_periods=6)
        glucose_data[target_column] = full_windows.apply(project)

    # Rows at the end of the series have no look-ahead value and are removed.
    glucose_data.dropna(subset=['glucose_next'], inplace=True)
    return glucose_data, combined_data

def calculate_weights(time_diff_hours, metabolism_rate, peak_time, plateau_duration=0.25):
    """Return the piecewise-linear activity weight for each elapsed time.

    The weight is 0 before the event, rises linearly from 0 to 1 over
    ``peak_time`` hours, stays at 1 for ``plateau_duration`` hours, then falls
    by ``metabolism_rate`` per hour and is clipped at 0.

    Args:
        time_diff_hours: Array-like of elapsed hours since the event (negative
            values mean the event has not happened yet). Any shape.
        metabolism_rate: Weight lost per hour after the plateau.
        peak_time: Hours from the event until the weight reaches 1.
        plateau_duration: Hours the weight stays at 1 after ``peak_time``.

    Returns:
        A float ndarray of weights with the same shape as ``time_diff_hours``.
    """
    # Work in float so integer input does not truncate fractional weights.
    elapsed = np.asarray(time_diff_hours, dtype=float)
    weights = np.zeros_like(elapsed)
    plateau_end = peak_time + plateau_duration

    rising = (elapsed >= 0) & (elapsed < peak_time)
    weights[rising] = elapsed[rising] / peak_time

    on_plateau = (elapsed >= peak_time) & (elapsed < plateau_end)
    weights[on_plateau] = 1

    falling = elapsed >= plateau_end
    weights[falling] = 1 - ((elapsed[falling] - peak_time - plateau_duration) * metabolism_rate)

    return np.clip(weights, 0, None)


def add_features(params, features, preprocessed_data, prediction_horizon):
    """Add event-activity features to every patient's glucose frame.

    For each feature, every event amount in ``combined_data`` is weighted by
    ``calculate_weights`` of the hours elapsed between the event and the
    glucose row, and the weighted amounts are summed per glucose row. The
    stored column is that activity minus the activity ``prediction_horizon``
    rows ahead, plus the row's ``glucose_change``.

    Args:
        params: Mapping from feature name to ``[metabolism_rate, peak_time]``.
        features: Names of the event columns to turn into features.
        preprocessed_data: Mapping from patient id to a
            ``(glucose_data, combined_data)`` pair. Patients are processed in
            the mapping's iteration order. The frames are not modified.
        prediction_horizon: Number of glucose rows to look ahead.

    Returns:
        One DataFrame with all patients stacked, a ``patient`` column added,
        and every row containing a missing value removed.
    """
    per_patient_frames = []
    for patient, (glucose_data, combined_data) in preprocessed_data.items():
        # Work on a copy so the caller's frame does not gain columns.
        glucose_data = glucose_data.copy()

        glucose_times = glucose_data['datetime'].values.astype('datetime64[s]').astype(np.int64)
        combined_times = combined_data['datetime'].values.astype('datetime64[s]').astype(np.int64)
        # Hours from each event (columns) to each glucose row (rows); the
        # matrix is the same for every feature, so it is built once.
        time_diff_hours = (glucose_times[:, None] - combined_times[None, :]) / 3600

        for feature in features:
            metabolism_rate, peak_time = params[feature]
            weights = calculate_weights(time_diff_hours, metabolism_rate, peak_time)
            activity = pd.Series(
                np.dot(weights, combined_data[feature].values),
                index=glucose_data.index,
            )
            glucose_data[feature] = (
                activity - activity.shift(-prediction_horizon) + glucose_data['glucose_change']
            )

        glucose_data['patient'] = patient
        per_patient_frames.append(glucose_data)

    patients_glucose_data = pd.concat(per_patient_frames)
    patients_glucose_data.dropna(inplace=True)
    return patients_glucose_data

## Results

In [209]:
# Activity-curve parameters per event feature, stored as
# [metabolism_rate, peak_time]. add_features compares peak_time with elapsed
# hours and uses metabolism_rate as the weight lost per hour after the plateau.
feature_params = {
    name: [metabolism_rate, peak_time]
    for name, metabolism_rate, peak_time in (
        ('simple_sugars', 0.4, 0.5),
        ('complex_sugars', 0.3, 0.5),
        ('proteins', 0.2, 3.5),
        ('fats', 0.05, 3.5),
        ('dietary_fibers', 0.05, 3.5),
        ('fast_insulin', 1.0, 0.5),
        ('slow_insulin', 0.5, 1.0),
    )
}

# Fraction of the training rows used for fitting; the remainder is the
# validation set that early stopping monitors.
train_size = 0.9

# LightGBM regressor settings.
lgb_params = dict(
    max_depth=3,
    n_estimators=1000,
    reg_lambda=5,
    learning_rate=0.02,
    objective='regression',
    random_state=1,
    deterministic=True,
    verbosity=-1,
)

# A single estimator instance is re-fitted for every fold.
model = lgb.LGBMRegressor(**lgb_params)
# Early stopping with a patience of 20 rounds, without log output.
callbacks = [lgb.early_stopping(stopping_rounds=20, verbose=False)]

# Leave-one-day-out evaluation: for every approach, horizon and patient, each
# day of the patient's data is predicted by a model trained on all other
# patients plus the patient's remaining days (the latter weighted 5x).
result_columns = ['Approach', 'Prediction Horizon', 'Patient', 'RMSE']
result_rows = []

# Columns that must not be fed to the model: the target and the identifiers.
features_to_remove = ['glucose_next', 'datetime', 'patient']

# Output locations are created up front so a missing folder cannot abort the
# run after training has already been done.
os.makedirs('datasets', exist_ok=True)

for approach in approaches:
    # 'nollm' reads the same files as 'gpt4o' but only builds insulin features.
    is_nollm = approach == 'nollm'
    food_data_source = 'gpt4o' if is_nollm else approach
    event_features = insulin_features if is_nollm else features
    os.makedirs(f'models/{approach}', exist_ok=True)

    for prediction_horizon in prediction_horizons:
        data = {patient: get_data(patient, food_data_source, prediction_horizon) for patient in patients}
        processed_data = add_features(feature_params, event_features, data, prediction_horizon)

        # The dataset does not depend on the held-out patient: write it once.
        processed_data.to_csv(f'datasets/{approach}_{prediction_horizon}.csv', index=False)
        os.makedirs(f'predictions/{approach}/{prediction_horizon}', exist_ok=True)

        horizon_rmses = []
        for patient in patients:
            patient_mask = processed_data['patient'] == patient
            all_test = processed_data[patient_mask]
            other_patient_train = processed_data[~patient_mask]
            preds = np.zeros(len(all_test))

            # NOTE: folds are keyed by day-of-month, so dates in different
            # months that share a day number end up in the same fold.
            day_of_month = all_test['datetime'].dt.day
            for day in day_of_month.unique():
                day_mask = (day_of_month == day).to_numpy()
                test = all_test[day_mask]
                other_day_test = all_test[~day_mask]
                train = pd.concat([other_patient_train, other_day_test])

                # Rows of the held-out patient count five times as much.
                weights = np.where(train['patient'].to_numpy() == patient, 5.0, 1.0)

                X_test = test.drop(features_to_remove, axis=1)
                X_train, X_val, y_train, y_val, weights_train, weights_val = train_test_split(
                    train.drop(features_to_remove, axis=1), train['glucose_next'], weights,
                    train_size=train_size, random_state=42
                )

                ## Regression
                model.fit(X_train, y_train,
                          sample_weight=weights_train,
                          eval_set=[(X_val, y_val)],
                          eval_sample_weight=[weights_val],
                          callbacks=callbacks)
                joblib.dump(model, f'models/{approach}/{prediction_horizon}_{day}_{patient}.joblib')
                preds[day_mask] = model.predict(X_test)

            rmse = np.sqrt(mean_squared_error(all_test['glucose_next'], preds))
            horizon_rmses.append(rmse)
            result_rows.append({
                'Approach': approach,
                'Prediction Horizon': prediction_horizon,
                'Patient': patient,
                'RMSE': rmse,
            })

            # glucose_next is "current minus future", so subtracting it from the
            # current glucose recovers the absolute future glucose value.
            predictions = pd.DataFrame({
                'Predictions': all_test['glucose'] - preds,
                'Ground_truth': all_test['glucose'] - all_test['glucose_next'],
                'Datetime': all_test['datetime'],
            })
            predictions.to_csv(f'predictions/{approach}/{prediction_horizon}/{patient}_predictions.csv', index=False)

        # Mean per-patient RMSE for this approach / horizon combination.
        print(f"RMSE: {np.mean(horizon_rmses):.4f}, {approach}, {prediction_horizon}")

df = pd.DataFrame(result_rows, columns=result_columns)
df.to_csv('results.csv', index=False)

RMSE: 18.9277, gpt4o, 6
RMSE: 37.7882, gpt4o, 12
RMSE: 19.2909, nollm, 6
RMSE: 38.3015, nollm, 12
