## Pre-processing the training data

In [176]:
import os 
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

### Combine all the training data

In [179]:
# Path to directory 
directory_path = 'train_batch_imputed' 
output_file = 'combined_train_batch_imputed.csv' 

if os.path.exists(output_file):
    print(f"'{output_file}' already exists. Skipping data combination.")

else: 
    # List to store individual DataFrames 
    dataframes = [] 
    
    # Iterate thru all the files in the directory 
    for file_name in os.listdir(directory_path): 
        if file_name.endswith('.csv'): 
            file_path = os.path.join(directory_path, file_name) 
            # Read the csv file 
            df = pd.read_csv(file_path) 
            dataframes.append(df) 
    
    # Concatenate all DataFrames into one 
    combined_data = pd.concat(dataframes, ignore_index=True) 
    
    # Display the shape of the combined data 
    print("Combined Data Shape:", combined_data.shape) 
    
    # Save combined data for future use 
    combined_data.to_csv('combined_train_batch_imputed.csv', index=False) 


'combined_train_batch_imputed.csv' already exists. Skipping data combination.


### Retrieve the combined data 

In [182]:
file_path = 'combined_train_batch_imputed.csv' 

dataset = pd.read_csv(file_path) 

print(dataset.shape)
dataset.head()

(177024, 508)


Unnamed: 0,id,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
0,p01_0,p01,06:10:00,8.666667,9.2,9.6,9.666667,9.166667,9.7,10.833333,...,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,13.4
1,p01_1,p01,06:25:00,9.666667,8.766667,9.7,10.833333,8.933333,9.2,11.9,...,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,12.8
2,p01_2,p01,06:40:00,10.833333,8.4,9.2,11.9,8.566667,8.7,12.833333,...,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,15.5
3,p01_3,p01,06:55:00,11.9,8.266667,8.7,12.833333,8.366667,8.4,13.633333,...,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,14.8
4,p01_4,p01,07:10:00,12.833333,8.666667,8.4,13.633333,8.766667,8.1,14.1,...,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,Dancing,12.7


In [184]:
# Check for missing values and data types
missing_summary = dataset.isnull().sum().to_frame(name="Missing Count")
missing_summary["Data Type"] = dataset.dtypes
missing_summary = missing_summary[missing_summary["Missing Count"] > 0]

print(missing_summary)

Empty DataFrame
Columns: [Missing Count, Data Type]
Index: []


### Separate the Numeric and Categorical Columns 
    - Numeric Columns: float and int types
        - Ex: bg, insulin, steps, hr, carbs, cals
    - Categorical Columns: object  
        - Ex: id, p_num, time, activity

In [187]:
numeric_cols = dataset.select_dtypes(include=['float64', 'int64']).columns 
categorical_cols = dataset.select_dtypes(include=['object']).columns 

# Display numeric_cols and categorical_cols 
numeric_cols, categorical_cols

(Index(['bg-5:55', 'bg-5:50', 'bg-5:45', 'bg-5:40', 'bg-5:35', 'bg-5:30',
        'bg-5:25', 'bg-5:20', 'bg-5:15', 'bg-5:10',
        ...
        'cals-0:40', 'cals-0:35', 'cals-0:30', 'cals-0:25', 'cals-0:20',
        'cals-0:15', 'cals-0:10', 'cals-0:05', 'cals-0:00', 'bg+1:00'],
       dtype='object', length=433),
 Index(['id', 'p_num', 'time', 'activity-5:55', 'activity-5:50',
        'activity-5:45', 'activity-5:40', 'activity-5:35', 'activity-5:30',
        'activity-5:25', 'activity-5:20', 'activity-5:15', 'activity-5:10',
        'activity-5:05', 'activity-5:00', 'activity-4:55', 'activity-4:50',
        'activity-4:45', 'activity-4:40', 'activity-4:35', 'activity-4:30',
        'activity-4:25', 'activity-4:20', 'activity-4:15', 'activity-4:10',
        'activity-4:05', 'activity-4:00', 'activity-3:55', 'activity-3:50',
        'activity-3:45', 'activity-3:40', 'activity-3:35', 'activity-3:30',
        'activity-3:25', 'activity-3:20', 'activity-3:15', 'activity-3:10',
        

## Normalize the Numeric Columns 
### For Blood Glucose (bg), normalization should be separate between the input and output

In [190]:
from sklearn.preprocessing import MinMaxScaler 

In [192]:
# Initialize scalers for output 
output_scaler = MinMaxScaler() 
output_col = 'bg+1:00'

# Normalize output (bg+) 
output_scaler.fit(dataset[[output_col]]) 
# Save the fitted scaler to a file for evaluation prediction reverse scaler 
joblib.dump(output_scaler, 'training_bg_prediction_scaler.pkl')
print("Scaler saved to 'scaler.pkl'") 

print("Scaler Parameters:")
print("Min:", output_scaler.data_min_)
print("Max:", output_scaler.data_max_)
print("Scale:", output_scaler.scale_)

dataset[output_col] = output_scaler.transform(dataset[[output_col]]) 


# Verify Normalization 
output_summary = dataset[[output_col]].describe() 

# Display the summary
print("Output Columns Summary:")
print(output_summary)

Scaler saved to 'scaler.pkl'
Scaler Parameters:
Min: [2.2]
Max: [27.8]
Scale: [0.0390625]
Output Columns Summary:
             bg+1:00
count  177024.000000
mean        0.237385
std         0.117047
min         0.000000
25%         0.152344
50%         0.214844
75%         0.300781
max         1.000000


In [140]:
# Initialize scalers for input 
input_scaler = MinMaxScaler() 

# Group each numeric columns  
bg_cols = [col for col in numeric_cols if col.startswith('bg-')] 
insulin_cols = [col for col in numeric_cols if col.startswith('insulin-')] 
carbs_cols = [col for col in numeric_cols if col.startswith('carbs-')] 
hr_cols = [col for col in numeric_cols if col.startswith('hr-')] 
steps_cols = [col for col in numeric_cols if col.startswith('steps-')] 
cals_cols = [col for col in numeric_cols if col.startswith('cals-')] 

# Normalize each group independently 
dataset[bg_cols] = input_scaler.fit_transform(dataset[bg_cols]) 
dataset[insulin_cols] = input_scaler.fit_transform(dataset[insulin_cols])
dataset[carbs_cols] = input_scaler.fit_transform(dataset[carbs_cols])
dataset[hr_cols] = input_scaler.fit_transform(dataset[hr_cols])
dataset[steps_cols] = input_scaler.fit_transform(dataset[steps_cols])
dataset[cals_cols] = input_scaler.fit_transform(dataset[cals_cols])

# Verify normalization by checking summary
bg_summary = dataset[bg_cols].describe() 
insulin_summary = dataset[insulin_cols].describe() 
carbs_summary = dataset[carbs_cols].describe() 
hr_summary = dataset[hr_cols].describe() 
steps_summary = dataset[steps_cols].describe() 
cals_summary = dataset[cals_cols].describe()

# Display the summaries
print("BG Columns Summary:")
print(bg_summary)
print("\nInsulin Columns Summary:")
print(insulin_summary)
# Display the summaries
print("\nCarbs Columns Summary:")
print(carbs_summary)
print("\nHR Columns Summary:")
print(hr_summary)
# Display the summaries
print("\nSteps Columns Summary:")
print(steps_summary)
print("\nCals Columns Summary:")
print(_summary)

BG Columns Summary:
             bg-5:55        bg-5:50        bg-5:45        bg-5:40  \
count  177024.000000  177024.000000  177024.000000  177024.000000   
mean        0.237504       0.237402       0.237461       0.237448   
std         0.116881       0.117091       0.116320       0.116619   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.152344       0.152344       0.152344       0.152344   
50%         0.214844       0.214844       0.214844       0.214844   
75%         0.300781       0.300781       0.300781       0.300781   
max         1.000000       1.000000       1.000000       1.000000   

             bg-5:35        bg-5:30        bg-5:25        bg-5:20  \
count  177024.000000  177024.000000  177024.000000  177024.000000   
mean        0.237362       0.237431       0.237385       0.237363   
std         0.116832       0.116654       0.116498       0.116827   
min         0.000000       0.000000       0.000000       0.000000   
25%         0

## Normalization Categorical Columns
### Encode Activity Columns

In [93]:
# Define the activity list
all_activities = [
    "Indoor climbing", "Run", "Strength training", "Swim", "Bike",
    "Dancing", "Stairclimber", "Spinning", "Walking", "HIIT",
    "Outdoor Bike", "Walk", "Aerobic Workout", "Tennis", "Workout",
    "Hike", "Zumba", "Sport", "Yoga", "Swimming", "Weights", "Running"
]

# Initialize a global LabelEncoder for all activities 
activity_encoder = LabelEncoder() 
activity_encoder.fit(all_activities) 

# Identify activity columns 
activity_cols = [col for col in dataset.columns if col.startswith("activity-")] 

# Encode all activity columns 
for col in activity_cols: 
    dataset[col] = activity_encoder.transform(dataset[col]) 

# Verify the encoding 
print(dataset[activity_cols].head())

   activity-5:55  activity-5:50  activity-5:45  activity-5:40  activity-5:35  \
0             18             18             18             18             18   
1             18             18             18             18             18   
2             18             18             18             18             18   
3             18             18             18             18             18   
4             18             18             18             18             18   

   activity-5:30  activity-5:25  activity-5:20  activity-5:15  activity-5:10  \
0             18             18             18             18             18   
1             18             18             18             18             18   
2             18             18             18             18             18   
3             18             18             18             18             18   
4             18             18             18             18             18   

   ...  activity-0:45  activity-0:40  

### Encode id, p_num, and time

In [95]:
# Encode p_num 
dataset['p_num'] = LabelEncoder().fit_transform(dataset['p_num'])  

# Drop id  
dataset = dataset.drop(columns=['id']) 

# Extract hour and minute from time (HH:MM:SS) 
dataset['hour'] = dataset['time'].str.split(':').str[0].astype(int) 
dataset['minute'] = dataset['time'].str.split(':').str[1].astype(int) 

# Use cyclical encoding for time to be more efficient 
dataset['hour_sin'] = np.sin(2 * np.pi * dataset['hour'] / 24) 
dataset['hour_cos'] = np.cos(2 * np.pi * dataset['hour'] / 24) 
dataset['minute_sin'] = np.sin(2 * np.pi * dataset['minute'] / 60)
dataset['minute_cos'] = np.cos(2 * np.pi * dataset['minute'] / 60) 

# Drop the original time column 
dataset = dataset.drop(columns=['time']) 

  dataset['hour'] = dataset['time'].str.split(':').str[0].astype(int)
  dataset['minute'] = dataset['time'].str.split(':').str[1].astype(int)
  dataset['hour_sin'] = np.sin(2 * np.pi * dataset['hour'] / 24)
  dataset['hour_cos'] = np.cos(2 * np.pi * dataset['hour'] / 24)
  dataset['minute_sin'] = np.sin(2 * np.pi * dataset['minute'] / 60)
  dataset['minute_cos'] = np.cos(2 * np.pi * dataset['minute'] / 60)


In [97]:
# Re-arrange the time columns to the middle of p_num and bg-5:55 

# List of columns to insert
time_columns = ['hour', 'minute', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos']

# Get all columns up to and including p_num 
columns_before = list(dataset.columns[:dataset.columns.get_loc('p_num') + 1]) 

# Get all columns starting from bg-5:55 
columns_after = list(dataset.columns[dataset.columns.get_loc('bg-5:55'):dataset.columns.get_loc('bg+1:00')+1]) 

# Combine into the new column order 
new_column_order = columns_before + time_columns + columns_after 

# Rearrange the Dataframe 
dataset = dataset[new_column_order] 

# Display the new column order
print(dataset.columns)

Index(['p_num', 'hour', 'minute', 'hour_sin', 'hour_cos', 'minute_sin',
       'minute_cos', 'bg-5:55', 'bg-5:50', 'bg-5:45',
       ...
       'activity-0:40', 'activity-0:35', 'activity-0:30', 'activity-0:25',
       'activity-0:20', 'activity-0:15', 'activity-0:10', 'activity-0:05',
       'activity-0:00', 'bg+1:00'],
      dtype='object', length=512)


In [109]:
# Save as a new csv file 
output_file = 'pre_process_dataset.csv'
dataset.to_csv(output_file, index=False) 