In [121]:
import pandas as pd 
from typing import Callable
import numpy as np
from scipy.fft import fft
from category_encoders import OneHotEncoder

In [120]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.2-cp39-cp39-macosx_10_9_x86_64.whl.metadata (9.2 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m862.9 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hDownloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.9/233.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading statsmodels-0.14.2-cp39-cp39-macosx_10_9_x86_64.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collect

In [130]:
class FeatureEngine:
    """
    Load the train/validation/test splits and enrich them with demographic features.

    On construction the three split CSVs and the pipe-delimited screening table are
    read from disk, and selected screening columns are merged into every split
    (categorical columns one-hot encoded). The remaining methods are small helpers
    that reduce an array of readings to a single value (fraction above / in / below
    range, mean, standard deviation) or return its Fourier transform.
    """

    # Screening-table columns copied into every split; 'PtID' is the join key.
    DEMOGRAPHIC_COLUMNS = (
        'PtID', 'Gender', 'Ethnicity', 'Race', 'SHMostRec', 'SHNumLast12Mon',
        'DKAMostRec', 'DKANumLast12Mon', 'OthGlucLowerMed', 'Weight', 'Height',
        'PEAbnormal',
    )

    # Subset of DEMOGRAPHIC_COLUMNS that is one-hot encoded. 'Weight' and 'Height'
    # are deliberately absent, so they are merged as-is.
    CATEGORICAL_COLUMNS = (
        'Gender', 'Ethnicity', 'Race', 'SHMostRec', 'SHNumLast12Mon', 'DKAMostRec',
        'DKANumLast12Mon', 'OthGlucLowerMed', 'PEAbnormal',
    )

    # Inclusive bounds of the "in range" band shared by tar / tir / tbr, expressed
    # in the same units as the readings passed to those methods.
    LOW_THRESHOLD = 70
    HIGH_THRESHOLD = 180

    def __init__(
        self,
        train_path: str = "cleaned_data/train_data.csv",
        validation_path: str = "cleaned_data/validation_data.csv",
        test_path: str = "cleaned_data/test_data.csv",
        demographics_path: str = "Data Tables/HScreening.txt",
    ) -> None:
        """
        Read the three splits and the screening table, then merge demographics.

        Args:
            train_path: CSV file holding the training split.
            validation_path: CSV file holding the validation split.
            test_path: CSV file holding the test split.
            demographics_path: Pipe-delimited screening table; it must contain
                every column named in DEMOGRAPHIC_COLUMNS.

        The defaults are the locations that were previously hard-coded, so
        `FeatureEngine()` behaves exactly as before.
        """
        self.training_set = pd.read_csv(train_path)
        self.validation_set = pd.read_csv(validation_path)
        self.test_set = pd.read_csv(test_path)
        self.demographic_set = pd.read_csv(demographics_path, delimiter='|')

        # NOTE: window-aggregated features (tar, tbr, mean, std, tir, fft) are not
        # wired in yet -- this class defines no `aggregate_window` method, so only
        # the demographic merge runs here.
        self.add_demographics()

    def add_demographics(self) -> None:
        """
        Merge selected demographic features into the training, validation, and test sets.

        The columns in CATEGORICAL_COLUMNS are one-hot encoded with the category
        value in the column name (`<column>_<category>`); the other selected
        columns are merged unchanged. Each split is left-joined on
        `id == PtID`, so every row of the split is kept and the `PtID` column is
        carried along next to `id`.

        The three `*_set` attributes are replaced by the merged frames. This is
        called once from `__init__`; calling it again would merge the demographic
        columns a second time.
        """
        # Restrict the screening table to the join key plus the wanted features.
        demographics = self.demographic_set[list(self.DEMOGRAPHIC_COLUMNS)]

        # The encoder is fitted on the screening table itself, so all three splits
        # receive an identical set of indicator columns.
        encoder = OneHotEncoder(cols=list(self.CATEGORICAL_COLUMNS), use_cat_names=True)
        demographics_encoded = encoder.fit_transform(demographics)

        for split_name in ("training_set", "validation_set", "test_set"):
            merged = getattr(self, split_name).merge(
                demographics_encoded, how='left', left_on='id', right_on='PtID'
            )
            setattr(self, split_name, merged)

    def fft(self, arr: np.ndarray) -> np.ndarray:
        """Return the discrete Fourier transform of `arr` as computed by `scipy.fft.fft`."""
        # Inside a method body the bare name `fft` resolves to the module-level
        # scipy function, not to this method.
        return fft(arr)

    def rolling_deviation(self, arr: np.ndarray) -> float:
        """Return the standard deviation of all values in `arr` (numpy default, ddof=0)."""
        return np.std(arr)

    def tar(self, arr: np.ndarray) -> float:
        """Return the fraction of values in `arr` strictly above HIGH_THRESHOLD."""
        return np.mean(arr > self.HIGH_THRESHOLD)

    def tir(self, arr: np.ndarray) -> float:
        """Return the fraction of values in `arr` within [LOW_THRESHOLD, HIGH_THRESHOLD]."""
        in_range = (arr >= self.LOW_THRESHOLD) & (arr <= self.HIGH_THRESHOLD)
        return np.mean(in_range)

    def tbr(self, arr: np.ndarray) -> float:
        """Return the fraction of values in `arr` strictly below LOW_THRESHOLD."""
        return np.mean(arr < self.LOW_THRESHOLD)

    def rolling_mean(self, arr: np.ndarray) -> float:
        """Return the arithmetic mean of all values in `arr`."""
        return np.mean(arr)



In [131]:
# Build the feature engine: the constructor reads the train/validation/test CSVs and
# the screening table, then merges the demographic columns into each split.
test = FeatureEngine()

  self.training_set = pd.read_csv("cleaned_data/train_data.csv")


In [138]:
# Preview the first rows of the validation split to check that the merged
# demographic columns are present.
test.validation_set.head()

Unnamed: 0.1,Unnamed: 0,index,id,corresponding_day,06:00:00,06:05:00,06:10:00,06:15:00,06:20:00,06:25:00,...,DKAMostRec_More than 12 months ago,DKAMostRec_6-12 months ago,DKANumLast12Mon_0.0,DKANumLast12Mon_1.0,OthGlucLowerMed_No,OthGlucLowerMed_Yes,Weight,Height,PEAbnormal_No,PEAbnormal_Yes
0,193,193,3,2015-05-23,163.0,159.0,155.0,152.0,147.0,141.0,...,1,0,1,0,1,0,65.2,173.0,1,0
1,194,194,3,2015-05-24,214.0,215.0,221.0,231.0,241.0,252.0,...,1,0,1,0,1,0,65.2,173.0,1,0
2,195,195,3,2015-05-25,115.0,114.0,116.0,118.0,121.0,122.0,...,1,0,1,0,1,0,65.2,173.0,1,0
3,196,196,3,2015-05-26,206.0,207.0,207.0,209.0,211.0,216.75,...,1,0,1,0,1,0,65.2,173.0,1,0
4,197,197,3,2015-05-27,169.0,171.0,173.0,173.0,174.0,182.5,...,1,0,1,0,1,0,65.2,173.0,1,0


In [92]:
"""
TODO:
2. Add Morlet and Mexican hat wavelet feature columns
"""

''