In [27]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Data Import

In [28]:
def prepare_dataset(file_path):
    df = pd.read_pickle(file_path)

    X = pd.concat([df['enrollment'],
                   df['description_embedding'].apply(pd.Series),
                   df['inclusion_embedding'].apply(pd.Series),
                   df['exclusion_embedding'].apply(pd.Series),
                   df['treatment_embedding'].apply(pd.Series),
                   df['disease_embedding'].apply(pd.Series),
                   df['measures_embedding'].apply(pd.Series),
                   df['timeframes_embedding'].apply(pd.Series)], axis=1)
    y = df['durationMonths']

    return X, y

X_train, y_train = prepare_dataset('./data_example/train_df.pkl')
X_test, y_test = prepare_dataset('./data_example/test_df.pkl')
X_incompleted, y_incompleted = prepare_dataset('./data_example/incompleted_df.pkl')

In [29]:
# standardize enrollment
scaler = StandardScaler()
X_train_enroll_scaled = scaler.fit_transform(X_train[['enrollment']])
X_train_scaled = np.concatenate([X_train_enroll_scaled, X_train.iloc[:, 1:].values], axis=1)

X_test_enroll_scaled = scaler.transform(X_test[['enrollment']])
X_test_scaled = np.concatenate([X_test_enroll_scaled, X_test.iloc[:, 1:].values], axis=1)

X_incompleted_enroll_scaled = scaler.transform(X_incompleted[['enrollment']])
X_incompleted_scaled = np.concatenate([X_incompleted_enroll_scaled, X_incompleted.iloc[:, 1:].values], axis=1)

# Linear Regression

In [98]:
# modeling training
model_ridge = Ridge()
param_grid = {'alpha': [0.1, 1.0, 10.0]}

grid_search = GridSearchCV(
    estimator=model_ridge,
    param_grid=param_grid, cv=5,
    scoring='neg_mean_squared_error'
)
grid_search.fit(X_train_scaled, y_train)
best_model_ridge = grid_search.best_estimator_

# Evaluation
y_pred_ridge = best_model_ridge.predict(X_test_scaled)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)


# Random Forest

In [104]:
# Initialize the model
model_rf = RandomForestRegressor(random_state=42)

# Tune the parameter
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [1, 5, 10]
}

random_search = RandomizedSearchCV(
    estimator=model_rf,
    param_distributions=param_grid, 
    n_iter=10, cv=5,
    scoring='neg_mean_squared_error',
    random_state=42
)

random_search.fit(X_train_scaled, y_train)
best_model_rf = random_search.best_estimator_

# Evaluation
y_pred_rf = best_model_rf.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_pred_rf)

# XGBoost

In [13]:
import xgboost as xgb
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Initialize the XGBoost regressor
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Tune the parameter
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}

random_search = RandomizedSearchCV(
    estimator=model_xgb,
    param_distributions=param_grid, 
    n_iter=10, cv=5,
    scoring='neg_mean_squared_error',
    random_state=42
)

random_search.fit(X_train_scaled, y_train)
best_model_xgb = random_search.best_estimator_

# Evaluation
y_pred_xgb = best_model_xgb.predict(X_test_scaled)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)

In [19]:
# Create an explainer object using the XGBoost model
explainer = shap.Explainer(best_model_xgb)

# Compute SHAP values
shap_values = explainer(X_train_scaled)

In [20]:
# Aggregate SHAP values for groups of features
sections = ['enrollment', 'description', 'inclusion', 'exclusion',
            'treatment', 'disease', 'measure', 'timeframe']

section_shap_values = {}

for i, section in enumerate(sections):
    if section == 'enrollment':
        section_shap_value = shap_values.values[:, 0]
    else:
        start_idx = (i-1) * 768 + 1
        end_idx = start_idx + 768
        section_shap_value = np.sum(shap_values.values[:, start_idx:end_idx], axis=1)
    
    section_shap_values[section] = section_shap_value

section_shap_values = pd.DataFrame(section_shap_values)

# FFNN

In [15]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint

In [18]:
# Initialize the model
model_ffnn = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile the model
model_ffnn.compile(optimizer='adam', loss='mean_squared_error')

# Define callbacks (optional but recommended for saving the best model)
checkpoint = ModelCheckpoint('./results/ffnn_model.keras', monitor='val_loss',
                             save_best_only=True, mode='min', verbose=1)

# Train the model
model_ffnn.fit(X_train_scaled, y_train,
               epochs=50, batch_size=32,
               validation_data=(X_test_scaled, y_test),
               callbacks=[checkpoint])

# Load the best saved model
model_ffnn = load_model('./results/ffnn_model.keras')

# Predict on the test set
y_pred_ffnn = model_ffnn.predict(X_test_scaled)

Epoch 1/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 336ms/step - loss: 229.3563
Epoch 1: val_loss improved from inf to 6.98238, saving model to ffnn_model.keras
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - loss: 221.2915 - val_loss: 6.9824
Epoch 2/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 19ms/step - loss: 125.8220
Epoch 2: val_loss did not improve from 6.98238
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 122.7308 - val_loss: 60.2768
Epoch 3/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 12ms/step - loss: 107.9508
Epoch 3: val_loss did not improve from 6.98238
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 107.6185 - val_loss: 83.8462
Epoch 4/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 12ms/step - loss: 117.2397
Epoch 4: val_loss did not improve from 6.98238
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

# CNN

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

In [22]:
# Initialize the model
model_cnn = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_scaled.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(1)
])

# Compile the model
model_cnn.compile(optimizer='adam', loss='mean_squared_error')

# Define callbacks (optional)
checkpoint = ModelCheckpoint('./results/cnn_model.keras', monitor='val_loss',
                             save_best_only=True, mode='min', verbose=1)

# Train the model
model_cnn.fit(X_train_scaled, y_train,
              epochs=50, batch_size=32,
              validation_data=(X_test_scaled, y_test),
              callbacks=[checkpoint])

# Load the best saved model
model_cnn = load_model('./results/cnn_model.keras')

# Predict on the test set
y_pred_cnn = model_cnn.predict(X_test_scaled)


Epoch 1/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 513ms/step - loss: 219.4137
Epoch 1: val_loss improved from inf to 371.97931, saving model to ./results/cnn_model.keras
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 343ms/step - loss: 215.7005 - val_loss: 371.9793
Epoch 2/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 74ms/step - loss: 289.8398
Epoch 2: val_loss improved from 371.97931 to 57.50670, saving model to ./results/cnn_model.keras
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 223ms/step - loss: 280.7099 - val_loss: 57.5067
Epoch 3/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 67ms/step - loss: 100.7802
Epoch 3: val_loss improved from 57.50670 to 27.70893, saving model to ./results/cnn_model.keras
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 222ms/step - loss: 96.5378 - val_loss: 27.7089
Epoch 4/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 67ms/step -

# LSTM

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [24]:
# Initialize the model
model_lstm = Sequential([
    LSTM(50, activation='relu', input_shape=(X_train_scaled.shape[1], 1)),
    Dense(1)
])

# Compile the model
model_lstm.compile(optimizer='adam', loss='mean_squared_error')

# Define callbacks (optional)
checkpoint = ModelCheckpoint('./results/lstm_model.keras', monitor='val_loss',
                             save_best_only=True, mode='min', verbose=1)

# Train the model
model_cnn.fit(X_train_scaled, y_train,
              epochs=50, batch_size=32,
              validation_data=(X_test_scaled, y_test),
              callbacks=[checkpoint])

# Load the best saved model
model_cnn = load_model('./results/lstm_model.keras')

# Predict on the test set
y_pred_cnn = model_cnn.predict(X_test_scaled)

Epoch 1/50


  super().__init__(**kwargs)


[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 278ms/step - loss: 126.9582
Epoch 1: val_loss improved from inf to 31.74744, saving model to ./results/lstm_model.keras
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 282ms/step - loss: 121.5997 - val_loss: 31.7474
Epoch 2/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 69ms/step - loss: 82.2133
Epoch 2: val_loss did not improve from 31.74744
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 83.5983 - val_loss: 76.9759
Epoch 3/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 71ms/step - loss: 93.6070
Epoch 3: val_loss did not improve from 31.74744
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 97.3133 - val_loss: 64.2210
Epoch 4/50
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 76ms/step - loss: 87.7614
Epoch 4: val_loss improved from 31.74744 to 22.79032, saving model to ./results/lstm_model.keras
[