In [None]:
import os
import pandas as pd

def load_data(data_dir='../data/raw', columns_to_load=None, date_parse_columns=('date',)):
    """
    Load every CSV file found in a directory into its own dataframe.
    ---
    Parameters:
        data_dir: directory scanned (non-recursively) for entries whose name ends with '.csv'
        columns_to_load: forwarded to pandas.read_csv as `usecols`; None loads every column
        date_parse_columns: forwarded to pandas.read_csv as `parse_dates`;
            by default the 'date' column is parsed
    Returns:
        dataframes: a list of dataframes; each dataframe is a patient's data loaded from the csv files.
            The list is ordered by file name, so the position of a dataframe is reproducible.
    Raises:
        FileNotFoundError: if data_dir does not exist (raised by os.listdir)
        Errors raised by pandas.read_csv are propagated unchanged.
    """
    # os.listdir returns names in arbitrary order; sort them so that the
    # position of each dataframe in the result is stable between runs.
    csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))

    # The default is an immutable tuple (no shared mutable default argument),
    # but read_csv only accepts bool / list / dict for parse_dates.
    if isinstance(date_parse_columns, tuple):
        parse_dates = list(date_parse_columns)
    else:
        parse_dates = date_parse_columns

    dataframes = [
        pd.read_csv(os.path.join(data_dir, file), usecols=columns_to_load, parse_dates=parse_dates)
        for file in csv_files
    ]

    print(f"Loaded data from {len(csv_files)} files.")
    return dataframes

In [None]:
from sklearn.preprocessing import OneHotEncoder

def preprocess_data(df, relevant_msg_types=None):
    """
    Preprocess one DataFrame and return a new, date-indexed DataFrame.
    ---
    1. Fill NaN values with 0
    2. Replace 0 with 'NULL' in the 'msg_type' column
    3. Parse 'date' as UTC, drop rows with invalid dates, sort by date and use it as the index
    4. Change affects_fob and affects_iob to 1 and 0
    5. One hot encode the 'msg_type' column
    6. Keep only the one-hot columns of the relevant message types
    (by default 'ANNOUNCE_MEAL', 'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE', 'DOSE_INSULIN', 'DOSE_BASAL_INSULIN')
    ---
    Parameters:
        df: DataFrame containing at least the columns 'date', 'msg_type', 'affects_fob' and 'affects_iob'.
            It is not modified.
        relevant_msg_types: optional iterable of message types whose one-hot columns are kept;
            None selects the default list above.
    Returns:
        A DataFrame indexed by 'date' with the original 'msg_type' column replaced by float
        'msg_type_<TYPE>' columns. A column is only present for relevant types that actually
        occur in the data; all other columns are left untouched.
    Raises:
        KeyError: if one of the required columns is missing
    """
    if relevant_msg_types is None:
        relevant_msg_types = ('ANNOUNCE_MEAL', 'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE',
                              'DOSE_INSULIN', 'DOSE_BASAL_INSULIN')

    # fillna returns a new frame, so the assignments below never touch the caller's data.
    df = df.fillna(0)
    df['msg_type'] = df['msg_type'].replace(0, 'NULL')

    # Convert 'date' column to datetime with a custom format; anything that does not
    # match (including the 0 written by fillna above) becomes NaT.
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S%z', errors='coerce', utc=True)

    # Drop rows where the date is null or invalid
    df = df.dropna(subset=['date'])

    # Stable sort: rows sharing the same timestamp keep their original relative order.
    df = df.sort_values('date', kind='mergesort').set_index('date')

    # Change affects_fob and affects_iob to 1 and 0 (any non-zero value counts as 1)
    for flag_column in ('affects_fob', 'affects_iob'):
        df[flag_column] = df[flag_column].ne(0).astype('int64')

    # One-hot encode with pandas: columns are named 'msg_type_<TYPE>', sorted by type,
    # and an empty frame simply yields no encoded columns.
    encoded_df = pd.get_dummies(df['msg_type'], prefix='msg_type', dtype=float)

    # Exact name match, so e.g. 'DOSE_INSULIN_X' is not kept just because it contains 'DOSE_INSULIN'.
    wanted_columns = {f'msg_type_{msg_type}' for msg_type in relevant_msg_types}
    kept_columns = [col for col in encoded_df.columns if col in wanted_columns]

    df = df.drop(columns=['msg_type'])
    return pd.concat([df, encoded_df[kept_columns]], axis=1)
