## Pre-processing the testing data

In [2]:
import os 
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [4]:
# Path to directory 
directory_path = 'test_batch_imputed' 
output_file = 'combined_test_batch_imputed.csv' 

if os.path.exists(output_file):
    print(f"'{output_file}' already exists. Skipping data combination.")

else: 
    # List to store individual DataFrames 
    dataframes = [] 
    
    # Iterate thru all the files in the directory 
    for file_name in os.listdir(directory_path): 
        if file_name.endswith('.csv'): 
            file_path = os.path.join(directory_path, file_name) 
            # Read the csv file 
            df = pd.read_csv(file_path) 
            dataframes.append(df) 
    
    # Concatenate all DataFrames into one 
    combined_data = pd.concat(dataframes, ignore_index=True) 
    
    # Display the shape of the combined data 
    print("Combined Data Shape:", combined_data.shape) 
    
    # Save combined data for future use 
    combined_data.to_csv('combined_test_batch_imputed.csv', index=False) 


Combined Data Shape: (3644, 508)


In [12]:
file_path = 'combined_test_batch_imputed.csv' 

dataset = pd.read_csv(file_path) 

print(dataset.shape)
dataset.head()

(3644, 508)


Unnamed: 0,id,p_num,time,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
0,p01_8459,p01,6:45:00,10.8,9.2,9.966667,11.066667,10.2,9.633333,11.066667,...,Dancing,Running,Dancing,Dancing,Dancing,Dancing,Running,Running,Dancing,
1,p01_8460,p01,11:25:00,7.217391,8.933333,9.9,7.217391,8.366667,9.4,7.217391,...,Dancing,Running,Dancing,Dancing,Dancing,Dancing,Walking,Walking,Walking,
2,p01_8461,p01,14:45:00,5.3,5.5,4.966667,5.565217,5.5,5.233333,5.565217,...,Dancing,Running,Dancing,Dancing,Dancing,Dancing,Running,Running,Dancing,
3,p01_8462,p01,4:30:00,4.266667,3.4,7.233333,4.6,3.9,4.966667,4.6,...,Dancing,Running,Dancing,Dancing,Dancing,Dancing,Running,Running,Dancing,
4,p01_8463,p01,4:20:00,7.895652,11.666667,8.3,7.895652,12.6,10.0,7.895652,...,Dancing,Running,Dancing,Dancing,Dancing,Dancing,Running,Running,Dancing,


In [14]:
# Check for missing values and data types
missing_summary = dataset.isnull().sum().to_frame(name="Missing Count")
missing_summary["Data Type"] = dataset.dtypes
missing_summary = missing_summary[missing_summary["Missing Count"] > 0]

print(missing_summary)

               Missing Count Data Type
bg-0:00                  132   float64
insulin-5:55             157   float64
insulin-5:50             157   float64
insulin-5:45             157   float64
insulin-5:40             157   float64
...                      ...       ...
activity-2:15            235    object
activity-1:30            288    object
activity-1:10            288    object
activity-1:05            288    object
bg+1:00                 3644   float64

[97 rows x 2 columns]


In [16]:
bg_cols = [col for col in dataset.columns if col.startswith('bg-')] 
insulin_cols = [col for col in dataset.columns if col.startswith('insulin-')] 
carbs_cols = [col for col in dataset.columns if col.startswith('carbs-')] 
hr_cols = [col for col in dataset.columns if col.startswith('hr-')] 
steps_cols = [col for col in dataset.columns if col.startswith('steps-')] 
cals_cols = [col for col in dataset.columns if col.startswith('cals-')] 
activity_cols = [col for col in dataset.columns if col.startswith("activity-")] 

# Use Forward/Backward Fill 
dataset[bg_cols] = dataset[bg_cols].fillna(method='ffill').fillna(method='bfill') 
dataset[activity_cols] = dataset[activity_cols].fillna(method='ffill').fillna(method='bfill')
# Fill with zeroes
dataset[insulin_cols] = dataset[insulin_cols].fillna(0) 
dataset[carbs_cols] = dataset[carbs_cols].fillna(0)
dataset[steps_cols] = dataset[steps_cols].fillna(0)
# Fill with Median
dataset[hr_cols] = dataset[hr_cols].fillna(dataset[hr_cols].median())
# Interpolate missing values for more precision 
dataset[cals_cols] = dataset[cals_cols].interpolate(method='linear', axis=0)

In [20]:
# Check for missing values and data types
missing_summary = dataset.isnull().sum().to_frame(name="Missing Count")
missing_summary["Data Type"] = dataset.dtypes
missing_summary = missing_summary[missing_summary["Missing Count"] > 0]

print(missing_summary) 
# only bg+1:00 is the missing column (for prediction) 

         Missing Count Data Type
bg+1:00           3644   float64


In [22]:
# Save to combined_test_batch_imputed.csv 
dataset.to_csv('combined_test_batch_imputed.csv', index=False) 

### Separate the Numeric and Categorical Columns 
    - Numeric Columns: float and int types
        - Ex: bg, insulin, steps, hr, carbs, cals
    - Categorical Columns: object  
        - Ex: id, p_num, time, activity

In [27]:
numeric_cols = dataset.select_dtypes(include=['float64', 'int64']).columns 
categorical_cols = dataset.select_dtypes(include=['object']).columns 

# Display numeric_cols and categorical_cols 
numeric_cols, categorical_cols

(Index(['bg-5:55', 'bg-5:50', 'bg-5:45', 'bg-5:40', 'bg-5:35', 'bg-5:30',
        'bg-5:25', 'bg-5:20', 'bg-5:15', 'bg-5:10',
        ...
        'cals-0:40', 'cals-0:35', 'cals-0:30', 'cals-0:25', 'cals-0:20',
        'cals-0:15', 'cals-0:10', 'cals-0:05', 'cals-0:00', 'bg+1:00'],
       dtype='object', length=433),
 Index(['id', 'p_num', 'time', 'activity-5:55', 'activity-5:50',
        'activity-5:45', 'activity-5:40', 'activity-5:35', 'activity-5:30',
        'activity-5:25', 'activity-5:20', 'activity-5:15', 'activity-5:10',
        'activity-5:05', 'activity-5:00', 'activity-4:55', 'activity-4:50',
        'activity-4:45', 'activity-4:40', 'activity-4:35', 'activity-4:30',
        'activity-4:25', 'activity-4:20', 'activity-4:15', 'activity-4:10',
        'activity-4:05', 'activity-4:00', 'activity-3:55', 'activity-3:50',
        'activity-3:45', 'activity-3:40', 'activity-3:35', 'activity-3:30',
        'activity-3:25', 'activity-3:20', 'activity-3:15', 'activity-3:10',
        

## Normalize the Numeric Columns 
### For Blood Glucose (bg), normalization should be separate between the input and output

In [30]:
from sklearn.preprocessing import MinMaxScaler 

In [36]:
# Initialize scalers for input 
input_scaler = MinMaxScaler() 

# Group each numeric columns  
bg_cols = [col for col in numeric_cols if col.startswith('bg-')] 
insulin_cols = [col for col in numeric_cols if col.startswith('insulin-')] 
carbs_cols = [col for col in numeric_cols if col.startswith('carbs-')] 
hr_cols = [col for col in numeric_cols if col.startswith('hr-')] 
steps_cols = [col for col in numeric_cols if col.startswith('steps-')] 
cals_cols = [col for col in numeric_cols if col.startswith('cals-')] 

# Normalize each group independently 
dataset[bg_cols] = input_scaler.fit_transform(dataset[bg_cols]) 
dataset[insulin_cols] = input_scaler.fit_transform(dataset[insulin_cols])
dataset[carbs_cols] = input_scaler.fit_transform(dataset[carbs_cols])
dataset[hr_cols] = input_scaler.fit_transform(dataset[hr_cols])
dataset[steps_cols] = input_scaler.fit_transform(dataset[steps_cols])
dataset[cals_cols] = input_scaler.fit_transform(dataset[cals_cols])

# Verify normalization by checking summary
bg_summary = dataset[bg_cols].describe() 
insulin_summary = dataset[insulin_cols].describe() 
carbs_summary = dataset[carbs_cols].describe() 
hr_summary = dataset[hr_cols].describe() 
steps_summary = dataset[steps_cols].describe() 
cals_summary = dataset[cals_cols].describe()

# Display the summaries
print("BG Columns Summary:")
print(bg_summary)
print("\nInsulin Columns Summary:")
print(insulin_summary)
# Display the summaries
print("\nCarbs Columns Summary:")
print(carbs_summary)
print("\nHR Columns Summary:")
print(hr_summary)
# Display the summaries
print("\nSteps Columns Summary:")
print(steps_summary)
print("\nCals Columns Summary:")
print(cals_summary)

BG Columns Summary:
           bg-5:55      bg-5:50      bg-5:45      bg-5:40      bg-5:35  \
count  3644.000000  3644.000000  3644.000000  3644.000000  3644.000000   
mean      0.281688     0.259856     0.249435     0.301185     0.253303   
std       0.147870     0.130223     0.132851     0.142268     0.130123   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.173709     0.164062     0.156000     0.196347     0.158793   
50%       0.248826     0.231120     0.220000     0.269406     0.220472   
75%       0.366197     0.330729     0.324000     0.378995     0.322835   
max       1.000000     1.000000     1.000000     1.000000     1.000000   

           bg-5:30      bg-5:25      bg-5:20      bg-5:15      bg-5:10  ...  \
count  3644.000000  3644.000000  3644.000000  3644.000000  3644.000000  ...   
mean      0.254389     0.309068     0.244915     0.246901     0.324668  ...   
std       0.132062     0.158961     0.130416     0.128683     0.150983  ... 

## Normalization Categorical Columns
### Encode Activity Columns

In [39]:
# Define the activity list
all_activities = [
    "Indoor climbing", "Run", "Strength training", "Swim", "Bike",
    "Dancing", "Stairclimber", "Spinning", "Walking", "HIIT",
    "Outdoor Bike", "Walk", "Aerobic Workout", "Tennis", "Workout",
    "Hike", "Zumba", "Sport", "Yoga", "Swimming", "Weights", "Running"
]

# Initialize a global LabelEncoder for all activities 
activity_encoder = LabelEncoder() 
activity_encoder.fit(all_activities) 

# Identify activity columns 
activity_cols = [col for col in dataset.columns if col.startswith("activity-")] 

# Encode all activity columns 
for col in activity_cols: 
    dataset[col] = activity_encoder.transform(dataset[col]) 

# Verify the encoding 
print(dataset[activity_cols].head())

   activity-5:55  activity-5:50  activity-5:45  activity-5:40  activity-5:35  \
0              8              8              2              2              2   
1              8              8              2              2              2   
2              2              8              2              2              2   
3              2              8              2              2              2   
4              8              8              2              2              2   

   activity-5:30  activity-5:25  activity-5:20  activity-5:15  activity-5:10  \
0              2              2              2              2              2   
1              2              2              2              2              2   
2              2              2              2              2              2   
3              2              2              2              2              2   
4              2              2              2              2              2   

   ...  activity-0:45  activity-0:40  

### Encode id, p_num, and time

In [52]:
output_cols = pd.concat([dataset['id'], dataset['bg+1:00']], axis=1)
output_cols.head()

Unnamed: 0,id,bg+1:00
0,p01_8459,
1,p01_8460,
2,p01_8461,
3,p01_8462,
4,p01_8463,


In [54]:
# Encode p_num 
dataset['p_num'] = LabelEncoder().fit_transform(dataset['p_num'])  

# Drop id  
dataset = dataset.drop(columns=['id']) 

# Extract hour and minute from time (HH:MM:SS) 
dataset['hour'] = dataset['time'].str.split(':').str[0].astype(int) 
dataset['minute'] = dataset['time'].str.split(':').str[1].astype(int) 

# Use cyclical encoding for time to be more efficient 
dataset['hour_sin'] = np.sin(2 * np.pi * dataset['hour'] / 24) 
dataset['hour_cos'] = np.cos(2 * np.pi * dataset['hour'] / 24) 
dataset['minute_sin'] = np.sin(2 * np.pi * dataset['minute'] / 60)
dataset['minute_cos'] = np.cos(2 * np.pi * dataset['minute'] / 60) 

# Drop the original time column 
dataset = dataset.drop(columns=['time']) 

  dataset['hour'] = dataset['time'].str.split(':').str[0].astype(int)
  dataset['minute'] = dataset['time'].str.split(':').str[1].astype(int)
  dataset['hour_sin'] = np.sin(2 * np.pi * dataset['hour'] / 24)
  dataset['hour_cos'] = np.cos(2 * np.pi * dataset['hour'] / 24)
  dataset['minute_sin'] = np.sin(2 * np.pi * dataset['minute'] / 60)
  dataset['minute_cos'] = np.cos(2 * np.pi * dataset['minute'] / 60)


In [56]:
# Re-arrange the time columns to the middle of p_num and bg-5:55 

# List of columns to insert
time_columns = ['hour', 'minute', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos']

# Get all columns up to and including p_num 
columns_before = list(dataset.columns[:dataset.columns.get_loc('p_num') + 1]) 

# Get all columns starting from bg-5:55 
columns_after = list(dataset.columns[dataset.columns.get_loc('bg-5:55'):dataset.columns.get_loc('bg+1:00')+1]) 

# Combine into the new column order 
new_column_order = columns_before + time_columns + columns_after 

# Rearrange the Dataframe 
dataset = dataset[new_column_order] 

# Display the new column order
print(dataset.columns)

Index(['p_num', 'hour', 'minute', 'hour_sin', 'hour_cos', 'minute_sin',
       'minute_cos', 'bg-5:55', 'bg-5:50', 'bg-5:45',
       ...
       'activity-0:40', 'activity-0:35', 'activity-0:30', 'activity-0:25',
       'activity-0:20', 'activity-0:15', 'activity-0:10', 'activity-0:05',
       'activity-0:00', 'bg+1:00'],
      dtype='object', length=512)


In [58]:
# Save as a new csv file 
output_file = 'pre_process_testing_dataset.csv'
dataset.to_csv(output_file, index=False) 