<h1>Imports</h1>

In [None]:
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, TimeDistributed, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
from keras_tuner.tuners import Hyperband

<h1>Logging</h1>

In [None]:
# Configure the root logger so that messages at INFO level and above are emitted.
# NOTE(review): basicConfig() does nothing if the root logger already has
# handlers attached — confirm the level actually takes effect in this kernel.
logging.basicConfig(level=logging.INFO)

<h1>Function Declarations</h1>

In [None]:
def load_data(data):
    """Read a CSV file into a DataFrame, aborting the run if it cannot be read.

    Args:
        data: Path (or any other source accepted by ``pandas.read_csv``) of
            the CSV file to load.

    Returns:
        pandas.DataFrame: The parsed contents of the file.

    Raises:
        SystemExit: With exit code 1 when the file is missing, empty, or
            cannot be parsed. The underlying error is logged first and kept
            as the exception's ``__cause__``.
    """
    try:
        return pd.read_csv(data)
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        logging.error(f"Error: {str(e)}.")
        # Raise SystemExit directly instead of calling the interactive
        # ``exit`` helper: inside an IPython/Jupyter kernel that helper is
        # replaced and does not reliably stop execution, which would let the
        # function fall through with no DataFrame to return.
        raise SystemExit(1) from e

In [None]:
def identify_data_types(df, unique_value_threshold=10):
    """Partition the columns of ``df`` into numerical and categorical names.

    A column is categorical when its dtype is ``object`` or when it holds at
    most ``unique_value_threshold`` distinct values; every other column is
    numerical.

    Args:
        df: DataFrame whose columns are classified.
        unique_value_threshold: Largest distinct-value count (inclusive) at
            which a non-object column is still treated as categorical.

    Returns:
        tuple[list, list]: ``(numerical_columns, categorical_columns)``, each
        listed in the original column order.
    """
    def _looks_categorical(series):
        # Object columns are categorical outright; the cardinality check is
        # only evaluated for the remaining dtypes.
        return series.dtype == 'object' or series.nunique() <= unique_value_threshold

    names = list(df.columns)
    categorical_flags = [_looks_categorical(df[name]) for name in names]
    numerical_columns = [name for name, flag in zip(names, categorical_flags) if not flag]
    categorical_columns = [name for name, flag in zip(names, categorical_flags) if flag]
    return numerical_columns, categorical_columns

In [None]:
def normalize_and_encode(df, numerical_cols, categorical_cols):
    """Standardize numerical columns and one-hot encode categorical columns.

    Args:
        df: Input DataFrame containing every column named in the two lists.
        numerical_cols: Names of the columns passed through ``StandardScaler``.
        categorical_cols: Names of the columns passed through
            ``OneHotEncoder`` (dense output, unknown categories ignored).

    Returns:
        pandas.DataFrame: The transformed data, numerical columns first
        followed by the generated one-hot columns, carrying the same index as
        ``df``. Columns of ``df`` that appear in neither list are not included.
    """
    # Work on list copies so tuples / Index objects are accepted and the
    # name concatenation below cannot fail or mutate the caller's objects.
    numerical_cols = list(numerical_cols)
    categorical_cols = list(categorical_cols)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
        ])
    transformed = preprocessor.fit_transform(df)

    # Only ask the encoder for its output names when it had columns to fit;
    # with an empty selection there are no one-hot columns to name.
    if categorical_cols:
        one_hot_feature_names = list(
            preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols))
    else:
        one_hot_feature_names = []
    all_feature_names = numerical_cols + one_hot_feature_names

    # Keep the caller's index so later index-aligned operations (for example
    # assigning columns taken from the original frame) match the right rows.
    return pd.DataFrame(transformed, columns=all_feature_names, index=df.index)

In [None]:
def handle_missing_values(df):
    """Impute missing values and append per-column missingness indicators.

    Missing entries are filled with the previous row's value (forward fill);
    anything still missing at the top of the frame is filled from the next
    valid row (backward fill). The fill follows the frame's row order and is
    not grouped in any way.

    Args:
        df: DataFrame that may contain missing values. It is not modified.

    Returns:
        pandas.DataFrame: The imputed columns followed by one
        ``<column>_missing`` integer column per input column, where 1 marks
        a value that was missing in ``df``.
    """
    # Record where values were missing before imputation erases that fact.
    missing_indicators = df.isna().astype(int).add_suffix('_missing')
    # Use the dedicated ffill()/bfill() methods: the ``method=`` keyword of
    # fillna() is deprecated and rejected by newer pandas releases.
    df_imputed = df.ffill().bfill()
    return pd.concat([df_imputed, missing_indicators], axis=1)

In [None]:
def feature_selection(df, target_column='ICU', threshold=0.01, additional_cols=None, random_state=None):
    """Keep the features a random forest rates at or above an importance threshold.

    Rows containing any missing value are dropped first. The remaining
    feature columns are dummy-encoded, a ``RandomForestClassifier`` is
    trained on them, and ``SelectFromModel`` keeps the columns selected for
    the given ``threshold``.

    Args:
        df: Input DataFrame containing the target, the additional columns and
            the candidate features.
        target_column: Name of the label column.
        threshold: Importance threshold handed to ``SelectFromModel``.
        additional_cols: Columns that are excluded from selection but carried
            through to the result. Defaults to
            ``['PATIENT_VISIT_IDENTIFIER', 'WINDOW']``.
        random_state: Optional seed for the random forest so the selection is
            reproducible. ``None`` keeps the unseeded behaviour.

    Returns:
        tuple: ``(df_selected, feature_names)`` where ``df_selected`` holds the
        selected features joined with the additional columns and the target,
        and ``feature_names`` is the Index of selected feature names.
    """
    # Resolve the default here instead of using a shared mutable list in the
    # signature; copying also accepts tuples or Index objects.
    if additional_cols is None:
        additional_cols = ['PATIENT_VISIT_IDENTIFIER', 'WINDOW']
    else:
        additional_cols = list(additional_cols)

    # Handling missing values - consider modifying this as per your dataset's requirements
    df = df.dropna()
    y = df[target_column]
    additional_data = df[additional_cols]

    features = df.drop(columns=additional_cols + [target_column], errors='ignore')
    # Encoding categorical variables if any
    df_encoded = pd.get_dummies(features, drop_first=True)

    # SelectFromModel.fit() trains the estimator itself, so fitting the
    # forest separately beforehand would only double the training cost.
    selector = SelectFromModel(RandomForestClassifier(random_state=random_state), threshold=threshold)
    selector.fit(df_encoded, y)

    feature_names = df_encoded.columns[selector.get_support()]
    df_selected = df_encoded[feature_names].join(additional_data).join(y)
    return df_selected, feature_names

In [None]:
def prepare_sequence_data(df, patient_identifier_col, target_col, window_col):
    """Build one feature sequence and one label per patient visit.

    Rows are ordered by patient identifier and window. For each patient,
    every row that comes after the first row with a positive target is
    discarded; the row where the target first becomes positive is kept.

    Args:
        df: DataFrame with one row per (patient, window).
        patient_identifier_col: Column that identifies the patient visit.
        target_col: Label column. Assumed to hold 0/1 values — the cut-off
            relies on its cumulative sum.
        window_col: Column that orders the rows within a patient.

    Returns:
        tuple[list, list]: ``(X, y)`` where ``X[i]`` is a 2-D array
        (rows kept for the patient x feature columns) and ``y[i]`` is the
        maximum target value among the rows kept for that patient.
    """
    df_sorted = df.sort_values(by=[patient_identifier_col, window_col])
    non_feature_cols = [patient_identifier_col, target_col, window_col]

    X = []  # To store sequences
    y = []  # To store labels (ICU admission status)
    for _, group in df_sorted.groupby(patient_identifier_col):
        # Shifting the running total by one row marks every row that has at
        # least one admission strictly before it. ``ge(1)`` is required here:
        # an equality test would only catch the single row right after the
        # first admission and let later rows (running total 2, 3, ...) through.
        admitted_before = group[target_col].cumsum().shift(fill_value=0).ge(1)
        group = group[~admitted_before]
        X.append(group.drop(columns=non_feature_cols).values)
        # max() captures whether the patient was ever admitted within the kept rows.
        y.append(group[target_col].max())
    return X, y

In [None]:
def create_lstm_cnn_model(input_shape, num_classes, filters=32, kernel_size=3, lstm_units=50,
                          dense_units=32, dropout_rate=0.3, learning_rate=0.001):
    """Build and compile a Conv1D -> LSTM -> Dense classifier.

    Args:
        input_shape: ``(timesteps, features)`` shape of one input sequence.
        num_classes: Number of output units. A value of 1 gives a sigmoid
            (binary) head, anything larger a softmax head.
        filters: Number of Conv1D filters.
        kernel_size: Width of the Conv1D kernel.
        lstm_units: Number of LSTM units.
        dense_units: Width of the hidden dense layer.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled Keras ``Sequential`` model that tracks accuracy.
    """
    multiclass = num_classes > 1

    model = Sequential()
    model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', input_shape=input_shape))
    # Add MaxPooling1D only if the input is large enough: the convolution
    # must leave at least two timesteps for a pool of size 2.
    if input_shape[0] > kernel_size:
        model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(lstm_units, activation='tanh'))
    # The LSTM already returns one vector per sample; Flatten is kept so the
    # layer stack stays identical to previously built models.
    model.add(Flatten())
    model.add(Dense(dense_units, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_classes, activation='softmax' if multiclass else 'sigmoid'))

    # Match the loss to the output head: a softmax head over integer class
    # labels needs a categorical loss, not binary cross-entropy.
    # NOTE(review): assumes multi-class labels are integer class ids (as the
    # argmax-based evaluation in this file does) — confirm before using
    # one-hot labels.
    loss = 'sparse_categorical_crossentropy' if multiclass else 'binary_crossentropy'
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=loss, metrics=['accuracy'])

    return model

In [None]:
def tune_hyperparameters(input_shape, num_classes, X_train, y_train, X_val, y_val):
    """Search the CNN-LSTM hyperparameter space with a Hyperband tuner.

    Args:
        input_shape: Input shape forwarded to ``create_lstm_cnn_model``.
        num_classes: Number of output units forwarded to the model factory.
        X_train, y_train: Training data passed to the tuner's search.
        X_val, y_val: Validation data; the tuner optimizes ``val_accuracy``.

    Returns:
        The first entry of the tuner's best-hyperparameters list.
    """
    def model_builder(hp):
        # Keys mirror the keyword names of create_lstm_cnn_model so the
        # sampled values can be forwarded directly.
        sampled = {
            'filters': hp.Int('filters', min_value=16, max_value=64, step=16),
            'kernel_size': hp.Choice('kernel_size', values=[3, 5]),
            'lstm_units': hp.Int('lstm_units', min_value=30, max_value=100, step=10),
            'dense_units': hp.Int('dense_units', min_value=16, max_value=64, step=16),
            'dropout_rate': hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1),
            'learning_rate': hp.Float('learning_rate', 1e-4, 1e-2, sampling='log'),
        }
        return create_lstm_cnn_model(input_shape, num_classes, **sampled)

    hyperband_options = dict(
        objective='val_accuracy',
        max_epochs=50,
        factor=3,
        directory='my_dir',
        project_name='lstm_cnn_tuning_hyperband',
    )
    tuner = Hyperband(model_builder, **hyperband_options)

    tuner.search(X_train, y_train, epochs=50, validation_data=(X_val, y_val))

    best_candidates = tuner.get_best_hyperparameters()
    return best_candidates[0]

In [None]:
def cross_validate_model(X, y, input_shape, num_classes, best_hps, n_folds=5, epochs=50, batch_size=32):
    """Estimate accuracy with shuffled K-fold cross-validation.

    A fresh model is built for every fold from the values stored in
    ``best_hps``, trained with early stopping on the fold's validation loss,
    and scored on that same validation split.

    Args:
        X: Array of input sequences, indexable by an array of row positions.
        y: Array of labels aligned with ``X``.
        input_shape: Input shape forwarded to ``create_lstm_cnn_model``.
        num_classes: Number of output units forwarded to the model factory.
        best_hps: Object whose ``get(name)`` returns the value for each of
            'filters', 'kernel_size', 'lstm_units', 'dense_units',
            'dropout_rate' and 'learning_rate'.
        n_folds: Number of folds.
        epochs: Maximum number of training epochs per fold.
        batch_size: Training batch size.

    Returns:
        The mean validation accuracy over all folds.
    """
    hyperparameter_names = (
        'filters', 'kernel_size', 'lstm_units',
        'dense_units', 'dropout_rate', 'learning_rate',
    )
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_accuracies = []

    for train_idx, val_idx in splitter.split(X):
        X_fold_train, y_fold_train = X[train_idx], y[train_idx]
        X_fold_val, y_fold_val = X[val_idx], y[val_idx]

        # Rebuild the model from scratch so no weights carry over between folds.
        model_kwargs = {name: best_hps.get(name) for name in hyperparameter_names}
        model = create_lstm_cnn_model(input_shape=input_shape, num_classes=num_classes, **model_kwargs)

        stopper = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5, restore_best_weights=True)
        model.fit(
            X_fold_train, y_fold_train,
            validation_data=(X_fold_val, y_fold_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=[stopper],
        )

        _, fold_accuracy = model.evaluate(X_fold_val, y_fold_val)
        fold_accuracies.append(fold_accuracy)

    return np.mean(fold_accuracies)

In [None]:
def train_model(X_train, y_train, X_val, y_val, input_shape, num_classes, epochs=50, batch_size=32,
                learning_rate=0.001, filters=32, kernel_size=3, lstm_units=50, dense_units=32,
                dropout_rate=0.3):
    """Build a CNN-LSTM model and train it with early stopping.

    Args:
        X_train, y_train: Training sequences and labels.
        X_val, y_val: Validation data monitored by early stopping.
        input_shape: Input shape forwarded to ``create_lstm_cnn_model``.
        num_classes: Number of output units forwarded to the model factory.
        epochs: Maximum number of training epochs.
        batch_size: Training batch size.
        learning_rate: Learning rate forwarded to the model factory.
        filters: Number of Conv1D filters.
        kernel_size: Width of the Conv1D kernel.
        lstm_units: Number of LSTM units.
        dense_units: Width of the hidden dense layer.
        dropout_rate: Dropout rate after the hidden dense layer.

    Returns:
        tuple: ``(model, history)`` — the trained model and the object
        returned by ``model.fit``.
    """
    # Every architecture argument is passed explicitly: the model factory
    # requires them, and omitting them raises a TypeError.
    model = create_lstm_cnn_model(
        input_shape,
        num_classes,
        filters=filters,
        kernel_size=kernel_size,
        lstm_units=lstm_units,
        dense_units=dense_units,
        dropout_rate=dropout_rate,
        learning_rate=learning_rate,
    )
    # Stop once validation loss has not improved for 5 epochs and roll back
    # to the best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                        epochs=epochs, batch_size=batch_size, callbacks=[early_stopping])
    return model, history

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print a classification report for the model's predictions.

    Args:
        model: Object whose ``predict(X_test)`` returns a 2-D array of scores.
        X_test: Inputs to predict on.
        y_test: True labels compared against the predicted classes.
    """
    scores = model.predict(X_test)

    if scores.shape[1] > 1:
        # Several output columns: the predicted class is the highest-scoring one.
        predicted_labels = np.argmax(scores, axis=1)
    else:
        # Single output column: threshold the score at 0.5.
        predicted_labels = (scores > 0.5).astype('int32')

    print("Classification Report:")
    print(classification_report(y_test, predicted_labels))

In [None]:
def _pad_sequences(sequences):
    """Zero-pad 2-D ``(timesteps, features)`` arrays at the end into one float32 3-D array."""
    if len(sequences) == 0:
        raise ValueError("No sequences to pad.")
    max_sequence_length = max(len(sequence) for sequence in sequences)
    feature_size = sequences[0].shape[1]
    # Filling a preallocated float array guarantees a numeric 3-D tensor even
    # when every sequence has the same length or arrives with object dtype.
    padded = np.zeros((len(sequences), max_sequence_length, feature_size), dtype=np.float32)
    for row, sequence in enumerate(sequences):
        padded[row, :len(sequence), :] = sequence
    return padded


def main():
    """Run the full pipeline: load, preprocess, tune, cross-validate, train, evaluate."""
    key_cols = ['PATIENT_VISIT_IDENTIFIER', 'ICU', 'WINDOW']

    print("Loading the dataset...")
    df = load_data("COVID-Full.csv")
    absent = [col for col in key_cols if col not in df.columns]
    if absent:
        raise KeyError(f"Required columns missing from dataset: {absent}")

    # Keep the identifier, target and window columns out of preprocessing:
    # scaling the identifier is meaningless, and one-hot encoding the target
    # would place the label itself among the model's input features.
    feature_df = df.drop(columns=key_cols)

    print("Identifying data types...")
    numerical_cols, categorical_cols = identify_data_types(feature_df)
    print("Normalizing and encoding data...")
    df_encoded = normalize_and_encode(feature_df, numerical_cols, categorical_cols)
    print("Handling missing values...")
    df_no_missing = handle_missing_values(df_encoded)

    # Re-attach the untouched key columns by position so the result does not
    # depend on how the preprocessing steps treated the index.
    for col in key_cols:
        df_no_missing[col] = df[col].to_numpy()

    print("Preparing sequence data...")
    X, y = prepare_sequence_data(df_no_missing, 'PATIENT_VISIT_IDENTIFIER', 'ICU', 'WINDOW')
    print("Converting data to numpy arrays...")
    y_array = np.asarray(y)
    print("Padding sequences...")
    X_padded = _pad_sequences(X)
    max_sequence_length, feature_size = X_padded.shape[1], X_padded.shape[2]

    print("Splitting data into training and validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(X_padded, y_array, test_size=0.2, random_state=42)

    print("Determining model parameters...")
    input_shape = (max_sequence_length, feature_size)
    num_classes = 1

    # Cross-validation needs a hyperparameter set, so run the tuner first.
    print("Tuning hyperparameters...")
    best_hps = tune_hyperparameters(input_shape, num_classes, X_train, y_train, X_val, y_val)

    print("Performing K-Fold Cross-Validation...")
    average_accuracy = cross_validate_model(X_padded, y_array, input_shape, num_classes, best_hps,
                                            n_folds=5, epochs=10, batch_size=32)
    print(f"Average Accuracy across folds: {average_accuracy}")

    print("Training the model...")
    # Only the tuned learning rate is forwarded to the final training run.
    model, _ = train_model(X_train, y_train, X_val, y_val, input_shape, num_classes,
                           learning_rate=best_hps.get('learning_rate'))

    print("Evaluating the model...")
    # NOTE(review): the validation split also drives early stopping, so this
    # report is not an untouched hold-out estimate.
    evaluate_model(model, X_val, y_val)
    print("Process completed.")

<h1>Script Start</h1>

In [None]:
# Run the pipeline only when this code executes as the top-level program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()