In [1]:
import pandas as pd 
from typing import Callable
import numpy as np
from scipy.fft import fft
from category_encoders import OneHotEncoder

In [11]:
class FeatureEngine:
    """
    Load the train/validation/test splits and append engineered features to them.

    Constructing an instance reads the three split CSVs and the demographic table,
    then appends FFT features for every trailing time window to each split
    (see ``aggregate_window``). The scalar aggregations and ``add_demographics``
    are available as methods but are not run by the constructor.

    Attributes:
        training_set (pd.DataFrame): Training split, including any feature columns added so far.
        validation_set (pd.DataFrame): Validation split, including any feature columns added so far.
        test_set (pd.DataFrame): Test split, including any feature columns added so far.
        demographic_set (pd.DataFrame): Pipe-delimited demographic table used by ``add_demographics``.
    """

    # Attribute names of the three splits; every feature is added to all of them.
    _SPLITS = ('training_set', 'validation_set', 'test_set')

    # Trailing windows (label -> length in minutes), measured back from the latest stamp column.
    _TIME_WINDOWS = {
        'last_10_minutes': 10,
        'last_30_minutes': 30,
        'last_1_hour': 60,
        'last_3_hours': 180,
        'last_6_hours': 360,
        'last_12_hours': 720
    }

    # Positional slice of training_set columns whose names are parsed as "HH:MM:SS" stamps.
    # NOTE(review): the displayed test_set header suggests '06:00:00' may sit at position 5,
    # in which case this slice starts one stamp later - confirm the offset is intended.
    # Only the latest 12 hours are ever aggregated, so the first stamp is not used either way.
    _STAMP_SLICE = slice(6, 222)

    # Bounds used by tar / tir / tbr: values in [70, 180] count as "in range".
    # presumably glucose readings; the unit is not visible in this file - verify.
    _RANGE_LOW = 70
    _RANGE_HIGH = 180

    def __init__(
        self,
        train_path: str = "cleaned_data/train_data.csv",
        validation_path: str = "cleaned_data/validation_data.csv",
        test_path: str = "cleaned_data/test_data.csv",
        demographics_path: str = "Data Tables/HScreening.txt",
    ):
        """
        Read the data files and generate the FFT window features.

        Args:
            train_path (str): CSV file holding the training split.
            validation_path (str): CSV file holding the validation split.
            test_path (str): CSV file holding the test split.
            demographics_path (str): Pipe-delimited text file holding the demographic table.
        """
        self.training_set = pd.read_csv(train_path)
        self.validation_set = pd.read_csv(validation_path)
        self.test_set = pd.read_csv(test_path)
        self.demographic_set = pd.read_csv(demographics_path, delimiter='|')
        #self.time_series = pd.read_csv("Experimental_Notebooks/resampled_day.csv")

        # Only the FFT features are generated on construction; the other feature
        # families below are currently disabled.
        #self.aggregate_window(self.tar, "tar")
        #self.aggregate_window(self.tbr, "tbr")
        #self.aggregate_window(self.rolling_mean, "mean")
        #self.aggregate_window(self.rolling_deviation, "std")
        #self.aggregate_window(self.tir, "tir")
        self.aggregate_window(self.fft, "fft")
        #self.add_demographics()

    def fft(self, arr: np.ndarray) -> np.ndarray:
        """Return the discrete Fourier transform of ``arr`` as computed by ``scipy.fft.fft``."""
        return fft(arr)

    def rolling_deviation(self, arr: np.ndarray) -> float:
        """Return the standard deviation of ``arr`` using ``np.std`` with its default settings."""
        return np.std(arr)

    def tar(self, arr: np.ndarray) -> float:
        """Return the fraction of values in ``arr`` strictly greater than 180."""
        return np.mean(arr > self._RANGE_HIGH)

    def tir(self, arr: np.ndarray) -> float:
        """Return the fraction of values in ``arr`` between 70 and 180, both bounds included."""
        return np.mean((arr >= self._RANGE_LOW) & (arr <= self._RANGE_HIGH))

    def tbr(self, arr: np.ndarray) -> float:
        """Return the fraction of values in ``arr`` strictly less than 70."""
        return np.mean(arr < self._RANGE_LOW)

    def rolling_mean(self, arr: np.ndarray) -> float:
        """Return the arithmetic mean of ``arr``."""
        return np.mean(arr)

    def add_demographics(self) -> None:
        """
        Merge selected demographic features into the training, validation, and test sets.

        The selected columns are one-hot encoded (``Weight`` and ``Height`` are passed
        through unencoded) and left-joined onto each split by matching the split's
        ``id`` column against the demographic ``PtID`` column.
        """
        # Columns to merge
        columns_to_merge = [
            'PtID', 'Gender', 'Ethnicity', 'Race', 'SHMostRec', 'SHNumLast12Mon', 'DKAMostRec', 'DKANumLast12Mon',
            'OthGlucLowerMed', 'Weight', 'Height', 'PEAbnormal'
        ]
        # Columns expanded into one indicator column per category value
        columns_to_encode = [
            'Gender', 'Ethnicity', 'Race', 'SHMostRec', 'SHNumLast12Mon', 'DKAMostRec', 'DKANumLast12Mon',
            'OthGlucLowerMed', 'PEAbnormal'
        ]

        # Filter the demographic set to include only the necessary columns
        demographics = self.demographic_set[columns_to_merge]

        # Fit and transform the encoder on the demographic data
        encoder = OneHotEncoder(cols=columns_to_encode, use_cat_names=True)
        demographics_encoded = encoder.fit_transform(demographics)

        # Merge the encoded demographic data with each dataset
        for split in self._SPLITS:
            merged = getattr(self, split).merge(
                demographics_encoded, how='left', left_on='id', right_on='PtID'
            )
            setattr(self, split, merged)

    def aggregate_window(self, func: Callable, func_name: str) -> None:
        """
        Apply func over various trailing window sizes and add the results as new columns
        to the training, validation, and test sets.

        Each window ends at the latest time-stamp column and reaches back the window's
        length in minutes. For ``self.fft`` every window yields one ``real`` and one
        ``imag`` column per frequency bin, named ``{func_name}_{window}_real_{i}`` and
        ``{func_name}_{window}_imag_{i}``. Any other function is applied row by row
        (each row is passed as a pandas Series) and yields a single column named
        ``{func_name}_{window}``.

        Args:
        func (Callable): A function to apply to each aggregate window.
        func_name (str): The name of the function, used to create new column names.

        Raises:
        ValueError: If the training set has no columns in the time-stamp slice.
        """
        stamps = self.training_set.columns[self._STAMP_SLICE]
        if len(stamps) == 0:
            raise ValueError("training_set has no time-stamp columns in positions 6-221")

        # Parse every column name once into minutes since 00:00:00; the windows
        # below only compare these numbers.
        stamp_minutes = np.asarray(pd.to_timedelta(stamps).total_seconds() / 60, dtype=float)
        max_time = stamp_minutes.max()

        is_fft = func == self.fft

        for window_name, minutes in self._TIME_WINDOWS.items():
            # Columns that fall within (max_time - minutes, max_time]
            min_time = max_time - minutes
            in_window = (stamp_minutes > min_time) & (stamp_minutes <= max_time)
            columns_to_aggregate = list(stamps[in_window])
            column_prefix = f'{func_name}_{window_name}'

            for split in self._SPLITS:
                frame = getattr(self, split)
                if is_fft:
                    spectrum = self._spectrum_frame(
                        func, frame[columns_to_aggregate].values, column_prefix, frame.index
                    )
                    # The feature frame shares the split's index, so the column-wise
                    # concat lines up row for row whatever that index is.
                    setattr(self, split, pd.concat([frame, spectrum], axis=1))
                else:
                    # Apply the function to the selected columns and store in a new column
                    frame[column_prefix] = frame[columns_to_aggregate].apply(func, axis=1)

    @staticmethod
    def _spectrum_frame(func: Callable, values: np.ndarray, column_prefix: str, index) -> pd.DataFrame:
        """Run func on every row of values and split each complex bin into real/imag columns."""
        spectrum = np.apply_along_axis(func, 1, values)
        features = {}
        for i in range(spectrum.shape[1]):
            features[f'{column_prefix}_real_{i}'] = spectrum[:, i].real
            features[f'{column_prefix}_imag_{i}'] = spectrum[:, i].imag
        return pd.DataFrame(features, index=index)

In [12]:
# Build the engine; the constructor reads the data files and appends the FFT window features.
test_obj = FeatureEngine()

In [13]:
# Preview the first rows of the test split, including the generated feature columns.
test_obj.test_set.head()

Unnamed: 0.1,Unnamed: 0,index,id,corresponding_day,06:00:00,06:05:00,06:10:00,06:15:00,06:20:00,06:25:00,...,fft_last_12_hours_real_139,fft_last_12_hours_imag_139,fft_last_12_hours_real_140,fft_last_12_hours_imag_140,fft_last_12_hours_real_141,fft_last_12_hours_imag_141,fft_last_12_hours_real_142,fft_last_12_hours_imag_142,fft_last_12_hours_real_143,fft_last_12_hours_imag_143
0,1514,1514,15,2015-04-23,125.0,134.0,131.0,142.0,157.0,169.0,...,-375.498232,-187.338107,639.343081,-623.597227,-801.358323,246.066452,-4223.991086,-843.723887,2128.451672,-806.877055
1,1515,1515,15,2015-04-24,154.0,156.0,159.0,158.0,161.0,143.0,...,-243.474056,-121.43949,-61.100857,-540.02125,310.359535,676.772014,-388.507416,-144.056005,-965.002148,-4700.062468
2,1516,1516,15,2015-04-25,115.0,108.0,96.0,86.0,82.0,87.0,...,213.472023,-362.10533,356.415422,-957.061897,-154.112694,1232.506617,2588.701494,-114.455035,-2129.420318,-1240.817007
3,1517,1517,15,2015-04-26,227.0,228.0,229.0,231.0,240.0,244.0,...,-962.852296,-1384.676562,392.963017,763.99043,-3625.241781,-962.510805,1392.138415,-6154.000006,1149.464359,-1165.271128
4,1518,1518,15,2015-04-27,172.0,181.0,178.0,172.0,189.0,195.0,...,292.672651,231.608371,41.076141,1285.873055,-206.01962,2473.914536,-2482.348817,-413.338221,2418.116259,-691.407484


In [92]:
"""
TODO:
2. Add Morlet and Mexican hat wavelet columns
"""

''