## Fill in the missing parameters  
### These are the following parameters: 
    - Blood Glucose 
    - Insulin 
    - Carbohydrates 
    - Heart Rate 
    - Steps 
    - Calories 
    - Activity 
### Approaching Method:
1. If a category of a row: 
    - has more than 20% of parameters ⇒ do a KNN Imputer (n-3)
    - has less than 20% of parameters ⇒ do a KNN Imputer (n=5)

In [554]:
import pandas as pd 
import os
import numpy as np 
from sklearn.impute import KNNImputer 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, accuracy_score
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import LabelEncoder

## Import Training CSV Data 

In [556]:
# Generate Selected Data 
def specific_col_dataframe(dataframe, selected_columns_key): 
    if selected_columns_key == 'bg':
    # Select blood glucose columns but exclude the last column of predicted values 
        columns = [col for col in dataframe.columns if selected_columns_key in col][:-1] 
        selected_columns = dataframe[columns] 
    else: 
        columns = [col for col in dataframe.columns if selected_columns_key in col]
        selected_columns = dataframe[columns] 
    
    return columns, selected_columns 
    
# Generate Selected Data Transpose 
def transpose(dataframe, selected_columns_key): 
    columns, selected_columns = specific_col_dataframe(dataframe, selected_columns_key)
    # Copy the columns for imputation 
    imputed = selected_columns.copy()
    # Transpose the data 
    imputed_T = imputed.T 

    return columns, imputed_T

In [557]:
# _, first_dataframe = list(dataframes.items())[0] 

# print(first_dataframe.head())
# print(first_dataframe.shape)

In [558]:
def non_missing_percentage(column): 
    return column.notnull().mean() 

def partition_columns(transpose_imputed_data):
    # Check each column for having more than 20% non-missing data 
    columns_with_sufficient_data = [
        col for col in transpose_imputed_data.columns if non_missing_percentage(transpose_imputed_data[col]) > 0.2 ] 
    
    columns_with_non_sufficient_data = [
        col for col in transpose_imputed_data.columns if non_missing_percentage(transpose_imputed_data[col]) <= 0.2 ] 

    # Filter the DataFrame to only include these columns 
    sufficient_data_df = transpose_imputed_data[columns_with_sufficient_data] 
    
    non_sufficient_data_df = transpose_imputed_data[columns_with_non_sufficient_data] 
    
    return sufficient_data_df, non_sufficient_data_df 

def imputation(imputed_bg_T, sufficient_kNeighbors, non_sufficient_kNeighbors): 

    # check for columns that are completely empty 
    empty_columns = imputed_bg_T.isnull().all() 

    # transpose the columns to prevent non-data error
    if empty_columns.any(): 
        imputed_bg_T = imputed_bg_T.T
    
    sufficient_data_df, non_sufficient_data_df = partition_columns(imputed_bg_T)  

    # Ensure numeric data types
    sufficient_data_df = sufficient_data_df.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')
    non_sufficient_data_df = non_sufficient_data_df.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')

    imputed_bg_T_suff = pd.DataFrame() 
    imputed_bg_T_non_suff = pd.DataFrame()
    
    # Apply KNN Imputer 
    if sufficient_data_df.empty:
        print("No columns with sufficient data. Skipping imputation for sufficient data.")
    else:
        knn_sufficient_imputer = KNNImputer(n_neighbors=sufficient_kNeighbors)
        sufficient_imputed_columns = knn_sufficient_imputer.fit_transform(sufficient_data_df)
        # Convert the imputed data back to a DataFrame (sufficient)
        imputed_bg_T_suff = pd.DataFrame(
            sufficient_imputed_columns, 
            columns=sufficient_data_df.columns, 
            index=sufficient_data_df.index
        ) 

    if non_sufficient_data_df.empty:
        print("No columns with non-sufficient data. Skipping imputation for non-sufficient data.")
    else:
        knn_non_sufficient_imputer = KNNImputer(n_neighbors=non_sufficient_kNeighbors)
        non_sufficient_imputed_columns = knn_non_sufficient_imputer.fit_transform(non_sufficient_data_df)
        # Convert the imputed data back to a DataFrame (non-sufficient)
        imputed_bg_T_non_suff = pd.DataFrame(
            non_sufficient_imputed_columns, 
            columns=non_sufficient_data_df.columns, 
            index=non_sufficient_data_df.index
        ) 

    # Merge the two DataFrames along columns 
    merged_imputed_bg_T = pd.concat([imputed_bg_T_suff, imputed_bg_T_non_suff], axis=1) 

    # Ensure columns are in the original order 
    merged_imputed_bg_T = merged_imputed_bg_T[imputed_bg_T.columns] 

    # transpose the columns to prevent merging error
    if empty_columns.any(): 
        print("Transpose back")
        imputed_bg_T = imputed_bg_T.T
        
    # return the original row orientation 
    imputed_bg_final = merged_imputed_bg_T.T
    
    return imputed_bg_final 


In [559]:
def fill_original_dataframe(original_dataframe, imputed_dataframe, selected_columns): 
    # Ensure the selected columns exist in both DataFrames
    common_columns = [col for col in selected_columns if col in original_dataframe.columns and col in imputed_dataframe.columns]

    # Fill the imputed data back into the original DataFrame 
    original_dataframe[common_columns] = imputed_dataframe[common_columns]  

    return original_dataframe 
    

##  Fill in Blood Glucose and Insulin Using KNN Imputer

####    - blood glucose: bg
####    - insulin: insulin


In [561]:
sufficient_kNeighbors = 3 
non_sufficient_kNeighbors = 5 

In [562]:
def data_imputed(original_dataframe, selected_columns_key, sufficient_kNeighbors, non_sufficient_kNeighbors):
    # Transpose the data 
    selected_columns, imputed_dataframe_T = transpose(original_dataframe, selected_columns_key)
    
    # Imputation here using KNN 
    imputed_dataframe = imputation(imputed_dataframe_T, sufficient_kNeighbors, non_sufficient_kNeighbors) 

    original_dataframe = fill_original_dataframe(original_dataframe, imputed_dataframe, selected_columns) 

    return original_dataframe 

In [563]:
# first_dataframe = data_imputed(first_dataframe, "bg", sufficient_kNeighbors, non_sufficient_kNeighbors)
# first_dataframe = data_imputed(first_dataframe, "insulin", sufficient_kNeighbors, non_sufficient_kNeighbors)
# print(first_dataframe.shape)

In [564]:
# Check if the specific columns are filled 
def is_filled(dataframe, columns_key):
    if(columns_key == "bg"):
        columns = [col for col in dataframe.columns if columns_key in col][:-1] 
    else:
        columns = [col for col in dataframe.columns if columns_key in col]
        
    are_columns_filled = dataframe[columns].notna().all() 
    
    all_columns_filled = are_columns_filled.all()
    
    return all_columns_filled

In [565]:
# print(f'Blood Glucose are all filled: {is_filled(first_dataframe, "bg")}')
# print(f'Insulin are all filled: {is_filled(first_dataframe, "insulin")}')

## Fill in Steps (steps)

### If there are missing values, replace with zeroes 


In [567]:
def fill_in_steps(dataframe): 
    # Specify the columns to fill
    columns = [col for col in dataframe.columns if "steps" in col]
    
    # Fill Nan values in specified columns with zeroes
    dataframe[columns] = dataframe[columns].fillna(0)

In [568]:
# fill_in_steps(first_dataframe)
# print(f'Steps are all filled: {is_filled(first_dataframe, "steps")}')
# print(f'Carbohydrates are all filled: {is_filled(first_dataframe, "carbs")}')

In [569]:
# # Select columns with "steps" in their names
# columns = [col for col in first_dataframe.columns if "steps" in col]

# # Check if any columns were found
# if columns:
#     print(first_dataframe[columns])
# else:
#     print("No columns containing 'steps' found.")


## Fill in Carbohydrate
#### if there is a missing value, replace with zeroes 
####    - carbohydrate: carbs


In [571]:
def fill_in_carbs(dataframe): 
    # Specify the columns to fill
    columns = [col for col in dataframe.columns if "carbs" in col]
    
    # Fill Nan values in specified columns with zeroes
    dataframe[columns] = dataframe[columns].fillna(0)

In [572]:
# fill_in_carbs(first_dataframe)
# print(f'Carbohydrates are all filled: {is_filled(first_dataframe, "carbs")}')

In [573]:
# # Select columns with "steps" in their names
# columns = [col for col in first_dataframe.columns if "carbs" in col]

# # Check if any columns were found
# if columns:
#     print(first_dataframe[columns].tail())
# else:
#     print("No columns containing 'carbs' found.")


In [574]:
# # Save the updated DataFrame
# first_dataframe.to_csv("first_dataframe_with_filled_carbs.csv", index=False)
# print("Updated DataFrame saved to 'first_dataframe_with_filled_carbs.csv'")

In [575]:
# # Select columns with "carbs" in their names
# columns = [col for col in first_dataframe.columns if "carbs" in col]

# # Check if any columns were found
# if columns:
#     print(first_dataframe[columns])
# else:
#     print("No columns containing 'carbs' found.")

In [576]:
# print(f'Carbs are all filled: {is_filled(first_dataframe, "carbs")}')

## Fill in Heart Rate Parameter using Blood Glucose and Steps 
#### - heart rate: hr

In [578]:
def fill_in_heart_rate(dataframe, dataframe_name, metric_saving_path):
    bg_columns, selected_bg_col = specific_col_dataframe(dataframe, "bg")
    steps_columns, selected_steps_col = specific_col_dataframe(dataframe, "steps") 
    hr_columns, selected_hr_col = specific_col_dataframe(dataframe, "hr") 

    # Check if all 'hr_columns' are filled 
    if hr_columns is not None and not dataframe[hr_columns].isnull().values.any(): 
        print("Heart Rate columns are fully filled.") 
        return 
    
    # Combine predictors 
    if not selected_bg_col.empty and not selected_steps_col.empty:  # Ensure the DataFrames are not empty
        X = pd.concat([selected_bg_col, selected_steps_col], axis=1)    

    # list to store results 
    mse_results = [] 
    
    # Loop through each column in carbs_columns
    for hr_col in hr_columns:
        print(f"Processing column: {hr_col}")

        # Identify rows where the current column has missing values
        missing_indices = dataframe[dataframe[hr_col].isnull()].index

        # Check if the column has non-missing data
        if not dataframe[hr_col].isnull().all():
            # Create the target column for the current carb_col
            y = dataframe[hr_col]

        # Combine predictors (e.g., blood glucose and insulin)
        X_with_target = X[~y.isnull()]  # Rows where target (carb_col) is available
        y_with_target = y[~y.isnull()]  # Non-missing target values
        X_missing_target = X.loc[missing_indices]  # Rows where carb_col is missing

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X_with_target, y_with_target, test_size=0.2, random_state=42)

        # Train an XGBoost model for the current heart rate column
        xgb_model = XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=10,
            random_state=42
        )
        xgb_model.fit(X_train, y_train)

        # Evaluate the model on the test set
        y_pred_test = xgb_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred_test)
        mse_results.append({'Participant': dataframe_name, 'Column': hr_col, 'MSE':mse}) 
        
        # print(f"Mean Squared Error for {hr_col}: {mse}")

        # Predict missing values for the current column
        predicted_missing_values = xgb_model.predict(X_missing_target)

        # Fill the missing values in the DataFrame
        dataframe.loc[missing_indices, hr_col] = predicted_missing_values
        print(f"Filled missing values for {hr_col}")
    else:
        print(f"Column {hr_col} is entirely empty and cannot be predicted.")
        # Optional: Handle entirely empty columns, e.g., impute using mean or median
        dataframe[hr_col] = dataframe[hr_columns].mean(axis=1)
        print(f"Filled {hr_col} with mean of other heart rate columns.")

    # Convert mse results to a DataFrame
    mse_results_df = pd.DataFrame(mse_results) 

    # Save to CSV 
    mse_results_df.to_csv(f'{metric_saving_path}/{dataframe_name}_hr_mse.csv', index=False) 

    print("MSE results have been saved.")
    
    # Confirm that all missing values are filled
    print(f"Missing values after filling:\n{dataframe[hr_columns].isnull().sum()}")

In [579]:
# fill_in_heart_rate(first_dataframe, 'p01')

In [580]:
# # Save the updated DataFrame
# first_dataframe.to_csv("first_dataframe_with_filled_hr.csv", index=False)
# print("Updated DataFrame saved to 'first_dataframe_with_filled_hr.csv'")

## Fill in Activity Parameters 
#### - Encode the activity using LabelEncoder 
#### - Use blood glucose, steps, and heart rate data to predict activity missing data

In [582]:
# # Prevent re-training heart rate dataset
# file_path = 'first_dataframe_with_filled_hr.csv' 
# first_dataframe = pd.read_csv(file_path) 

In [583]:
# Define the activity list
all_activities = [
    "Indoor climbing", "Run", "Strength training", "Swim", "Bike",
    "Dancing", "Stairclimber", "Spinning", "Walking", "HIIT",
    "Outdoor Bike", "Walk", "Aerobic Workout", "Tennis", "Workout",
    "Hike", "Zumba", "Sport", "Yoga", "Swimming", "Weights", "Running"
]

# Initialize a global LabelEncoder for all activities 
activity_encoder = LabelEncoder() 
activity_encoder.fit(all_activities) 

In [584]:
def replace_numeric_with_activities(dataframe):
    activity_columns, selected_activity_col = specific_col_dataframe(dataframe, "activity")     
    # Loop through each activity column in the DataFrame
    for activity_col in activity_columns:  # `activity_columns` is a list of activity column names
        print(f"Processing column: {activity_col}")
    
        # Check if the column exists in the DataFrame
        if activity_col in dataframe.columns:
            # Iterate through each cell in the column
            for index, value in dataframe[activity_col].items():
                if pd.notnull(value) and isinstance(value, (int, float)):
                    numeric_index = int(value)
                    if numeric_index < len(all_activities):
                        # Replace numeric value with corresponding activity name
                        dataframe.at[index, activity_col] = all_activities[numeric_index]

In [609]:
def fill_in_activities(dataframe, dataframe_name, metric_saving_path):
    
    bg_columns, selected_bg_col = specific_col_dataframe(dataframe, "bg")
    steps_columns, selected_steps_col = specific_col_dataframe(dataframe, "steps") 
    hr_columns, selected_hr_col = specific_col_dataframe(dataframe, "hr") 
    activity_columns, selected_activity_col = specific_col_dataframe(dataframe, "activity") 

     # Check if all 'activity_columns' are filled 
    if activity_columns is not None and not dataframe[activity_columns].isnull().values.any(): 
        print("Activity columns are fully filled.") 
        return 
        
    # Combine predictors 
    if not selected_bg_col.empty and not selected_steps_col.empty and not selected_hr_col.empty:  # Ensure the DataFrames are not empty
        X = pd.concat([selected_bg_col, selected_steps_col, selected_hr_col], axis=1) 

    # list to store results 
    accuracy_results = [] 
    
    # Loop through each activity column in the DataFrame
    for activity_col in activity_columns:  # `activity_columns` is a list of activity column names
        print(f"Processing column: {activity_col}")
                
        # Check if the column has non-missing data
        if not dataframe[activity_col].isnull().all():
            
            # Encode the activity column using the global LabelEncoder
            dataframe[activity_col] = dataframe[activity_col].apply(
                lambda x: activity_encoder.transform([x])[0] if pd.notnull(x) else None
            )
    
            # Separate rows with missing and non-missing activities
            training_data = dataframe[~dataframe[activity_col].isnull()]
            prediction_data = dataframe[dataframe[activity_col].isnull()] 
    
            # Define feature columns
            feature_columns = X.columns

            # Create training features (X) and target (y)
            X_train = training_data[feature_columns]
            y_train = training_data[activity_col]
    
            # Create prediction features
            X_predict = prediction_data[feature_columns]

            # Check if there are enough samples to split
            if len(X_train) > 1: 
                # Split data into training and test sets for evaluation
                X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
                    X_train, y_train, test_size=0.2, random_state=42
                )
            else: 
                # Skip splitting if not enough samples
                X_train_split, X_test_split = X_train, X_train
                y_train_split, y_test_split = y_train, y_train

            # Ensure all classes are present in training data
            all_classes = set(activity_encoder.transform(activity_encoder.classes_))
            missing_classes = all_classes - set(y_train_split)
            if missing_classes:
                print(f"Adding missing classes: {missing_classes}")
                for missing_class in missing_classes:
                    X_train_split = pd.concat([X_train_split, pd.DataFrame([X_train_split.mean()], columns=X_train_split.columns)])
                    y_train_split = pd.concat([y_train_split, pd.Series([missing_class])])
    
            # Initialize and train the XGBoost model
            xgb_model = XGBClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=10,
                random_state=42
            )
            xgb_model.fit(X_train_split, y_train_split)

            # Evaluate the model
            y_pred_test = xgb_model.predict(X_test_split)
            accuracy = accuracy_score(y_test_split, y_pred_test)
            
            accuracy_results.append({'Participant': dataframe_name, 'Column': activity_col, 'Accuracy Score': accuracy}) 
            # print(f"Model Accuracy for {activity_col}: {accuracy}")
    
            # Predict missing values
            if not X_predict.empty:
                predicted_activities = xgb_model.predict(X_predict)
    
                # Decode the predicted activities back to their original labels
                predicted_labels = activity_encoder.inverse_transform(predicted_activities)
                # Fill the missing values in the DataFrame
                dataframe.loc[dataframe[activity_col].isnull(), activity_col] = predicted_labels
                # Convert all numeric values in the column back to activity names
                dataframe[activity_col] = dataframe[activity_col].apply(
                    lambda x: activity_encoder.inverse_transform([int(x)])[0] if pd.notnull(x) and str(x).isdigit() else x
                )
                print(f"Filled missing values for {activity_col}.")
            
        else:
            print(f"Column {activity_col} is entirely empty and cannot be processed.")

    replace_numeric_with_activities(dataframe) 

    # Convert accuracy results to a DataFrame
    accuracy_results_df = pd.DataFrame(accuracy_results) 

    # Save to CSV 
    accuracy_results_df.to_csv(f'{metric_saving_path}/{dataframe_name}_activity_accuracy.csv', index=False) 

    print("Accuracy results have been saved.")


In [586]:
# fill_in_activities(first_dataframe, 'p01')

In [587]:
# # Save the updated DataFrame
# first_dataframe.to_csv("dataframe_with_predicted_activities.csv", index=False)
# print("Updated DataFrame saved to 'dataframe_with_predicted_activities.csv'")

In [588]:
# # Specify the column names
# selected_columns = ['bg-5:35', 'bg-5:40', 'steps-5:35', 'steps-5:40', 'hr-5:35', 'hr-5:40']

# # Select the columns
# selected_df = first_dataframe[selected_columns]

# # Display the selected columns
# print(selected_df)


### Fill in Calories using Steps, heart rates, and Activity 
####    - calories burnt: cals
 

In [590]:
# # Prevent re-training activity dataset
# file_path = 'dataframe_with_predicted_activities.csv' 
# first_dataframe = pd.read_csv(file_path) 

In [591]:
def fill_in_calories(dataframe, dataframe_name, metric_saving_path):
    activity_columns, selected_activity_col = specific_col_dataframe(dataframe, "activity") 
    steps_columns, selected_steps_col = specific_col_dataframe(dataframe, "steps") 
    hr_columns, selected_hr_col = specific_col_dataframe(dataframe, "hr") 
    cals_columns, selected_cals_col = specific_col_dataframe(dataframe, "cals") 

     # Check if all 'cals_columns' are filled 
    if cals_columns is not None and not dataframe[cals_columns].isnull().values.any(): 
        print("Calories columns are fully filled.") 
        return 
        
    # Combine predictors 
    if not selected_activity_col.empty and not selected_steps_col.empty and not selected_hr_col.empty:  # Ensure the DataFrames are not empty
        X = pd.concat([selected_activity_col, selected_steps_col, selected_hr_col], axis=1)    

    # list to store results 
    mse_results = [] 
    
    # Encode the activity columns 
    if not selected_activity_col.empty: 
        for activity_col in activity_columns: 
            print(f'Encoding column in X: {activity_col}')
            if activity_col in X.columns: 
                X[activity_col] = X[activity_col].apply(
                    lambda x: activity_encoder.transform([x])[0] if pd.notnull(x) and x in activity_encoder.classes_ else x
                )
    
    # Loop through each column in carbs_columns
    for cals_col in cals_columns:
        print(f"Processing column: {cals_col}")

        # Identify rows where the current column has missing values
        missing_indices = dataframe[dataframe[cals_col].isnull()].index

        # Check if the column has non-missing data
        if not dataframe[cals_col].isnull().all():
            # Create the target column for the current carb_col
            y = dataframe[cals_col]

        # Combine predictors (e.g., blood glucose and insulin)
        X_with_target = X[~y.isnull()]  # Rows where target (carb_col) is available
        y_with_target = y[~y.isnull()]  # Non-missing target values
        X_missing_target = X.loc[missing_indices]  # Rows where carb_col is missing

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X_with_target, y_with_target, test_size=0.2, random_state=42)

        # Train an XGBoost model for the current heart rate column
        xgb_model = XGBRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=10,
            random_state=42
        )
        xgb_model.fit(X_train, y_train)

        # Evaluate the model on the test set
        y_pred_test = xgb_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred_test)
        mse_results.append({'Participant': dataframe_name, 'Column': cals_col, 'MSE':mse})  
        # print(f"Mean Squared Error for {cals_col}: {mse}")

        # Predict missing values for the current column
        predicted_missing_values = xgb_model.predict(X_missing_target)

        # Fill the missing values in the DataFrame
        dataframe.loc[missing_indices, cals_col] = predicted_missing_values
        print(f"Filled missing values for {cals_col}")
    else:
        print(f"Column {cals_col} is entirely empty and cannot be predicted.")
        # Optional: Handle entirely empty columns, e.g., impute using mean or median
        dataframe[cals_col] = dataframe[cals_columns].mean(axis=1)
        print(f"Filled {cals_col} with mean of other calories columns.")
    
    # Confirm that all missing values are filled
    print(f"Missing values after filling:\n{dataframe[cals_columns].isnull().sum()}")

    # Convert mse results to a DataFrame
    mse_results_df = pd.DataFrame(mse_results) 

    # Save to CSV 
    mse_results_df.to_csv(f'{metric_saving_path}/{dataframe_name}_cal_mse.csv', index=False) 

    print("MSE results have been saved.")

In [592]:
# fill_in_calories(first_dataframe, 'p01')

In [593]:
# # Save the updated DataFrame
# first_dataframe.to_csv("dataframe_with_predicted_activities.csv", index=False)
# print("Updated DataFrame saved to 'dataframe_with_predicted_activities.csv'")

## Data Imputation
### Impute every dataframe 

In [595]:
def Fill_In_Missing_Values(dataframe, dataframe_name, metric_saving_path, file_saving_path): 
    print(f"Data for {dataframe_name}")

    # Prevent Re-train the data 
    file_path = f'{file_saving_path}/{dataframe_name}.csv'

    if os.path.exists(file_path): 
        print(f"File found: {file_path}. Loading it ...") 
        dataframe = pd.read_csv(file_path) 
    
    # For Blood Glucose and Insulin using KNNImputer
    sufficient_kNeighbors = 3 
    non_sufficient_kNeighbors = 5 

    dataframe = data_imputed(dataframe, "bg", sufficient_kNeighbors, non_sufficient_kNeighbors)
    print(f'Blood Glucose are all filled: {is_filled(dataframe, "bg")}')
    
    dataframe = data_imputed(dataframe, "insulin", sufficient_kNeighbors, non_sufficient_kNeighbors)
    print(f'Insulin are all filled: {is_filled(dataframe, "insulin")}')
    
    # Save the updated DataFrame
    dataframe.to_csv(f'{file_saving_path}/{dataframe_name}.csv', index=False)
    print(f'Updated DataFrame saved to {file_saving_path}/{dataframe_name}.csv')
    
    # For Steps
    fill_in_steps(dataframe)
    print(f'Steps are all filled: {is_filled(dataframe, "steps")}')

    # Save the updated DataFrame
    dataframe.to_csv(f'{file_saving_path}/{dataframe_name}.csv', index=False)
    print(f'Updated DataFrame saved to {file_saving_path}/{dataframe_name}.csv')
    
    # For Carbohydrate
    fill_in_carbs(dataframe)
    print(f'Carbohydrates are all filled: {is_filled(dataframe, "carbs")}')

    # Save the updated DataFrame
    dataframe.to_csv(f'{file_saving_path}/{dataframe_name}.csv', index=False)
    print(f'Updated DataFrame saved to {file_saving_path}/{dataframe_name}.csv') 

    # For Heart Rate
    fill_in_heart_rate(dataframe, dataframe_name, metric_saving_path)
    print(f'Heart Rate are all filled: {is_filled(dataframe, "hr")}')

    # Save the updated DataFrame
    dataframe.to_csv(f'{file_saving_path}/{dataframe_name}.csv', index=False)
    print(f'Updated DataFrame saved to {file_saving_path}/{dataframe_name}.csv') 

    # For Activity
    fill_in_activities(dataframe, dataframe_name, metric_saving_path)
    print(f'Activity are all filled: {is_filled(dataframe, "activity")}')

    # Save the updated DataFrame
    dataframe.to_csv(f'{file_saving_path}/{dataframe_name}.csv', index=False)
    print(f'Updated DataFrame saved to {file_saving_path}/{dataframe_name}.csv') 

    # For Calories
    fill_in_calories(dataframe, dataframe_name, metric_saving_path)
    print(f'Calories are all filled: {is_filled(dataframe, "cals")}')

    # Save the updated DataFrame
    dataframe.to_csv(f'{file_saving_path}/{dataframe_name}.csv', index=False)
    print(f'Updated DataFrame saved to {file_saving_path}/{dataframe_name}.csv') 

In [596]:
# Path to the folder containing CSV files 
folder_path = 'train_batch' # copy out the csv file before change this file path to prevent overlapping of evaluation results

# List all CSV files in the folder 
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')] 

# Store each CSV into a dictionary by file name (without extension) 
dataframes = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder_path, file)) for file in csv_files}

  dataframes = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder_path, file)) for file in csv_files}
  dataframes = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder_path, file)) for file in csv_files}
  dataframes = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder_path, file)) for file in csv_files}
  dataframes = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder_path, file)) for file in csv_files}
  dataframes = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder_path, file)) for file in csv_files}
  dataframes = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder_path, file)) for file in csv_files}


In [597]:
for name, df in dataframes.items():
    Fill_In_Missing_Values(df, name, "evaluation_results/train_eval", "train_batch_imputed") 

Data for p01
File found: train_batch_imputed/p01.csv. Loading it ...
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Insulin are all filled: True
Updated DataFrame saved to train_batch_imputed/p01.csv
Steps are all filled: True
Updated DataFrame saved to train_batch_imputed/p01.csv
Carbohydrates are all filled: True
Updated DataFrame saved to train_batch_imputed/p01.csv
Heart Rate columns are fully filled.
Heart Rate are all filled: True
Updated DataFrame saved to train_batch_imputed/p01.csv
Activity columns are fully filled.
Activity are all filled: True
Updated DataFrame saved to train_batch_imputed/p01.csv
Calories columns are fully filled.
Calories are all filled: True
Updated DataFrame saved to train_batch_imputed/p01.csv
Data for p02
File found: train_batch_imputed/p02.csv. Loading it ...
No columns with non-sufficient data. Skippi

In [598]:
# Access individual DataFrames 
_, dataframe_11 = list(dataframes.items())[7] 
Fill_In_Missing_Values(dataframe_11, "p11", "evaluation_results/train_eval", "train_batch_imputed") 

Data for p11
File found: train_batch_imputed/p11.csv. Loading it ...
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Insulin are all filled: True
Updated DataFrame saved to train_batch_imputed/p11.csv
Steps are all filled: True
Updated DataFrame saved to train_batch_imputed/p11.csv
Carbohydrates are all filled: True
Updated DataFrame saved to train_batch_imputed/p11.csv
Heart Rate columns are fully filled.
Heart Rate are all filled: True
Updated DataFrame saved to train_batch_imputed/p11.csv
Activity columns are fully filled.
Activity are all filled: True
Updated DataFrame saved to train_batch_imputed/p11.csv
Calories columns are fully filled.
Calories are all filled: True
Updated DataFrame saved to train_batch_imputed/p11.csv


In [599]:
# Prevent Re-train the data 
file_path = f'train_batch_imputed/p11.csv'

if os.path.exists(file_path): 
    print(f"File found: {file_path}. Loading it ...") 
    dataframe = pd.read_csv(file_path) 
else:
    _, dataframe = list(dataframes.items())[7] 

# Fill in the remaining missing value of insulin with zeros for p11 
columns = [col for col in dataframe.columns if "insulin" in col]

# Fill Nan values in specified columns with zeroes
dataframe[columns] = dataframe[columns].fillna(0)

# Save the updated DataFrame
dataframe.to_csv(f'train_batch_imputed/p11.csv', index=False)
print(f'Updated DataFrame saved to train_batch_imputed/p11.csv') 

File found: train_batch_imputed/p11.csv. Loading it ...
Updated DataFrame saved to train_batch_imputed/p11.csv


In [600]:
_, dataframe_12 = list(dataframes.items())[8] 
Fill_In_Missing_Values(dataframe_12, "p12", "evaluation_results/train_eval", "train_batch_imputed") 

Data for p12
File found: train_batch_imputed/p12.csv. Loading it ...
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Insulin are all filled: True
Updated DataFrame saved to train_batch_imputed/p12.csv
Steps are all filled: True
Updated DataFrame saved to train_batch_imputed/p12.csv
Carbohydrates are all filled: True
Updated DataFrame saved to train_batch_imputed/p12.csv
Heart Rate columns are fully filled.
Heart Rate are all filled: True
Updated DataFrame saved to train_batch_imputed/p12.csv
Activity columns are fully filled.
Activity are all filled: True
Updated DataFrame saved to train_batch_imputed/p12.csv
Calories columns are fully filled.
Calories are all filled: True
Updated DataFrame saved to train_batch_imputed/p12.csv


In [666]:
# Copy the evaluation results from the folders for train batch before running test batch
# Path to the folder containing CSV files 
folder_path = 'test_batch' # copy out the csv file before change this file path to prevent overlapping of evaluation results

# List all CSV files in the folder 
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')] 

# Store each CSV into a dictionary by file name (without extension) 
dataframes_test = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder_path, file)) for file in csv_files}

In [670]:
# Access individual DataFrames 
for name, df in dataframes_test.items():
    Fill_In_Missing_Values(df, name, "evaluation_results/test_eval", "test_batch_imputed") 

Data for p01
File found: test_batch_imputed/p01.csv. Loading it ...
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Insulin are all filled: True
Updated DataFrame saved to test_batch_imputed/p01.csv
Steps are all filled: True
Updated DataFrame saved to test_batch_imputed/p01.csv
Carbohydrates are all filled: True
Updated DataFrame saved to test_batch_imputed/p01.csv
Heart Rate columns are fully filled.
Heart Rate are all filled: True
Updated DataFrame saved to test_batch_imputed/p01.csv
Activity columns are fully filled.
Activity are all filled: True
Updated DataFrame saved to test_batch_imputed/p01.csv
Calories columns are fully filled.
Calories are all filled: True
Updated DataFrame saved to test_batch_imputed/p01.csv
Data for p02
File found: test_batch_imputed/p02.csv. Loading it ...
No columns with non-sufficient data. Skipping imput

In [624]:
_, dataframe_16 = list(dataframes.items())[9] 
Fill_In_Missing_Values(dataframe_16, "p16", "evaluation_results/test_eval", "test_batch_imputed") 

Data for p16
File found: test_batch_imputed/p16.csv. Loading it ...
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Insulin are all filled: True
Updated DataFrame saved to test_batch_imputed/p16.csv
Steps are all filled: True
Updated DataFrame saved to test_batch_imputed/p16.csv
Carbohydrates are all filled: True
Updated DataFrame saved to test_batch_imputed/p16.csv
Heart Rate columns are fully filled.
Heart Rate are all filled: True
Updated DataFrame saved to test_batch_imputed/p16.csv
Processing column: activity-5:55
Adding missing classes: {0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21}
Processing column: activity-5:50
Adding missing classes: {0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21}
Processing column: activity-5:45
Adding missing classes: {0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13

In [642]:
_, dataframe_18 = list(dataframes.items())[10] 
Fill_In_Missing_Values(dataframe_18, "p18", "evaluation_results/test_eval", "test_batch_imputed") 

Data for p18
File found: test_batch_imputed/p18.csv. Loading it ...
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Insulin are all filled: True
Updated DataFrame saved to test_batch_imputed/p18.csv
Steps are all filled: True
Updated DataFrame saved to test_batch_imputed/p18.csv
Carbohydrates are all filled: True
Updated DataFrame saved to test_batch_imputed/p18.csv
Heart Rate columns are fully filled.
Heart Rate are all filled: True
Updated DataFrame saved to test_batch_imputed/p18.csv
Processing column: activity-5:55
Adding missing classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21}
Filled missing values for activity-5:55.
Processing column: activity-5:50
Adding missing classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21}
Filled missing values for activity-5:50.
Processing 

In [644]:
_, dataframe_19 = list(dataframes.items())[11] 
Fill_In_Missing_Values(dataframe_19, "p19", "evaluation_results/test_eval", "test_batch_imputed") 

Data for p19
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Insulin are all filled: True
Updated DataFrame saved to test_batch_imputed/p19.csv
Steps are all filled: True
Updated DataFrame saved to test_batch_imputed/p19.csv
Carbohydrates are all filled: True
Updated DataFrame saved to test_batch_imputed/p19.csv
Processing column: hr-5:55
Filled missing values for hr-5:55
Processing column: hr-5:50
Filled missing values for hr-5:50
Processing column: hr-5:45
Filled missing values for hr-5:45
Processing column: hr-5:40
Filled missing values for hr-5:40
Processing column: hr-5:35
Filled missing values for hr-5:35
Processing column: hr-5:30
Filled missing values for hr-5:30
Processing column: hr-5:25
Filled missing values for hr-5:25
Processing column: hr-5:20
Filled missing values for hr-5:20
Processing column: hr-5:15
Filled missing value

In [646]:
_, dataframe_21 = list(dataframes.items())[12] 
Fill_In_Missing_Values(dataframe_21, "p21", "evaluation_results/test_eval", "test_batch_imputed") 

Data for p21
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Transpose back
Insulin are all filled: False
Updated DataFrame saved to test_batch_imputed/p21.csv
Steps are all filled: True
Updated DataFrame saved to test_batch_imputed/p21.csv
Carbohydrates are all filled: True
Updated DataFrame saved to test_batch_imputed/p21.csv
Processing column: hr-5:55
Filled missing values for hr-5:55
Processing column: hr-5:50
Filled missing values for hr-5:50
Processing column: hr-5:45
Filled missing values for hr-5:45
Processing column: hr-5:40
Filled missing values for hr-5:40
Processing column: hr-5:35
Filled missing values for hr-5:35
Processing column: hr-5:30
Filled missing values for hr-5:30
Processing column: hr-5:25
Filled missing values for hr-5:25
Processing column: hr-5:20
Filled missing values for hr-5:20
Processing column: hr-5:15
Filled missing values for hr-5:15
Processing column: hr-5:10
Filled missing values for

In [648]:
_, dataframe_22 = list(dataframes.items())[13] 
Fill_In_Missing_Values(dataframe_22, "p22", "evaluation_results/test_eval", "test_batch_imputed") 

Data for p22
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Insulin are all filled: True
Updated DataFrame saved to test_batch_imputed/p22.csv
Steps are all filled: True
Updated DataFrame saved to test_batch_imputed/p22.csv
Carbohydrates are all filled: True
Updated DataFrame saved to test_batch_imputed/p22.csv
Processing column: hr-5:55
Filled missing values for hr-5:55
Processing column: hr-5:50
Filled missing values for hr-5:50
Processing column: hr-5:45
Filled missing values for hr-5:45
Processing column: hr-5:40
Filled missing values for hr-5:40
Processing column: hr-5:35
Filled missing values for hr-5:35
Processing column: hr-5:30
Filled missing values for hr-5:30
Processing column: hr-5:25
Filled missing values for hr-5:25
Processing column: hr-5:20
Filled missing values for hr-5:20
Processing column: hr-5:15
Filled missing values for hr-5:15
Processing column: hr-5:10
Filled missing values for hr-5:10
Process

In [649]:
_, dataframe_24 = list(dataframes.items())[14] 
Fill_In_Missing_Values(dataframe_24, "p24", "evaluation_results/test_eval", "test_batch_imputed") 

Data for p24
Blood Glucose are all filled: True
No columns with non-sufficient data. Skipping imputation for non-sufficient data.
Insulin are all filled: True
Updated DataFrame saved to test_batch_imputed/p24.csv
Steps are all filled: True
Updated DataFrame saved to test_batch_imputed/p24.csv
Carbohydrates are all filled: True
Updated DataFrame saved to test_batch_imputed/p24.csv
Processing column: hr-5:55
Filled missing values for hr-5:55
Processing column: hr-5:50
Filled missing values for hr-5:50
Processing column: hr-5:45
Filled missing values for hr-5:45
Processing column: hr-5:40
Filled missing values for hr-5:40
Processing column: hr-5:35
Filled missing values for hr-5:35
Processing column: hr-5:30
Filled missing values for hr-5:30
Processing column: hr-5:25
Filled missing values for hr-5:25
Processing column: hr-5:20
Filled missing values for hr-5:20
Processing column: hr-5:15
Filled missing values for hr-5:15
Processing column: hr-5:10
Filled missing values for hr-5:10
Process

In [676]:
# Copy the evaluation results from the folders for train batch before running test batch
# Path to the folder containing CSV files 
folder_path = 'test_batch_imputed' # copy out the csv file before change this file path to prevent overlapping of evaluation results

# List all CSV files in the folder 
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')] 

# Store each CSV into a dictionary by file name (without extension) 
dataframes_test_filled = {os.path.splitext(file)[0]: pd.read_csv(os.path.join(folder_path, file)) for file in csv_files}

# Access individual DataFrames 
for name, df in dataframes_test_filled.items():
    print(f'{name}, Blood Glucose are all filled: {is_filled(dataframe, "bg")}')
    print(f'{name}, Insulin are all filled: {is_filled(dataframe, "insulin")}')
    print(f'{name}, Steps are all filled: {is_filled(dataframe, "steps")}')
    print(f'{name}, Heart Rate are all filled: {is_filled(dataframe, "hr")}')
    print(f'{name}, Carbs are all filled: {is_filled(dataframe, "carbs")}')
    print(f'{name}, Cals are all filled: {is_filled(dataframe, "cals")}')
    print(f'{name}, Activity are all filled: {is_filled(dataframe, "activity")}') 

p01, Blood Glucose are all filled: True
p01, Insulin are all filled: True
p01, Steps are all filled: True
p01, Heart Rate are all filled: True
p01, Carbs are all filled: True
p01, Cals are all filled: True
p01, Activity are all filled: True
p02, Blood Glucose are all filled: True
p02, Insulin are all filled: True
p02, Steps are all filled: True
p02, Heart Rate are all filled: True
p02, Carbs are all filled: True
p02, Cals are all filled: True
p02, Activity are all filled: True
p04, Blood Glucose are all filled: True
p04, Insulin are all filled: True
p04, Steps are all filled: True
p04, Heart Rate are all filled: True
p04, Carbs are all filled: True
p04, Cals are all filled: True
p04, Activity are all filled: True
p05, Blood Glucose are all filled: True
p05, Insulin are all filled: True
p05, Steps are all filled: True
p05, Heart Rate are all filled: True
p05, Carbs are all filled: True
p05, Cals are all filled: True
p05, Activity are all filled: True
p06, Blood Glucose are all filled: T